diff --git a/python/sglang/srt/models/deepseek_nextn.py b/python/sglang/srt/models/deepseek_nextn.py index b77ad0c9a..01ee187ef 100644 --- a/python/sglang/srt/models/deepseek_nextn.py +++ b/python/sglang/srt/models/deepseek_nextn.py @@ -94,7 +94,9 @@ class DeepseekModelNextN(nn.Module): zero_allocator = BumpAllocator( buffer_size=2, dtype=torch.float32, - device=input_ids.device, + device=( + input_embeds.device if input_embeds is not None else input_ids.device + ), ) if input_embeds is None: diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index 625df4642..b3bd49173 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -1374,7 +1374,9 @@ class DeepseekV2Model(nn.Module): # TODO for two-batch-overlap, we need a larger buffer size buffer_size=len(self.layers) * 2, dtype=torch.float32, - device=input_ids.device, + device=( + input_embeds.device if input_embeds is not None else input_ids.device + ), ) if input_embeds is None: