Enable CPU device on SGLang (#2806)

This commit is contained in:
Chunyuan WU
2025-01-17 13:22:53 +08:00
committed by GitHub
parent a8ccacc8b8
commit 63051738a9
13 changed files with 376 additions and 9 deletions

View File

@@ -49,6 +49,7 @@ from sglang.srt.layers.quantization.fp8_utils import (
normalize_e4m3fn_to_e4m3fnuz,
)
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.layers.rotary_embedding import get_rope_wrapper
from sglang.srt.layers.vocab_parallel_embedding import (
ParallelLMHead,
VocabParallelEmbedding,
@@ -271,13 +272,14 @@ class DeepseekV2Attention(nn.Module):
quant_config=quant_config,
)
rope_scaling["rope_type"] = "deepseek_yarn"
-self.rotary_emb = get_rope(
+self.rotary_emb = get_rope_wrapper(
qk_rope_head_dim,
rotary_dim=qk_rope_head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
is_neox_style=False,
device=global_server_args_dict["device"],
)
if rope_scaling: