[Bugfix] Correct method call for _set_cos_sin_cache (#774)
This change ensures proper functionality for longer sequences by invoking `_set_cos_sin_cache` as a plain function with `self` passed explicitly as the first argument (`_set_cos_sin_cache(self, ...)`), instead of calling it as `self._set_cos_sin_cache(...)`. For example, with DeepSeek R1, without this change the program crashes when the input sequence length exceeds 4096.

Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
This commit is contained in:
@@ -82,7 +82,7 @@ def native_rope_deepseek_forward(self,
|
|||||||
offsets: Optional[torch.Tensor] = None,
|
offsets: Optional[torch.Tensor] = None,
|
||||||
max_seq_len: Optional[int] = None):
|
max_seq_len: Optional[int] = None):
|
||||||
if max_seq_len is not None and max_seq_len > self.max_seq_len:
|
if max_seq_len is not None and max_seq_len > self.max_seq_len:
|
||||||
self._set_cos_sin_cache(max_seq_len, query.device, query.dtype)
|
_set_cos_sin_cache(self, max_seq_len, query.device, query.dtype)
|
||||||
if len(key.shape) == 2:
|
if len(key.shape) == 2:
|
||||||
key = key[:, None, :]
|
key = key[:, None, :]
|
||||||
# Note: we implement the non neox_style method with shuffle the last dim and neox style
|
# Note: we implement the non neox_style method with shuffle the last dim and neox style
|
||||||
|
|||||||
Reference in New Issue
Block a user