[Feat]Xlite Qwen3 MoE Support Data Parallel (#6715)
### What this PR does / why we need it?
This patch adds support for Qwen3-MoE data parallelism in Xlite. For
more details about Xlite, please refer to the following link:
[https://atomgit.com/openeuler/GVirt/blob/master/xlite/README.md](https://atomgit.com/openeuler/GVirt/blob/master/xlite/README.md).
Online server config:
```shell
port=$1
log=$2
export VLLM_USE_V1=1
export TASK_QUEUE_ENABLE=1
export HCCL_BUFFSIZE=512
export HCCL_OP_EXPANSION_MODE="AIV"
export OMP_PROC_BIND=false
export VLLM_ASCEND_ENABLE_NZ=0
sysctl -w vm.swappiness=0
sysctl -w kernel.numa_balancing=0
sysctl kernel.sched_migration_cost_ns=50000
ip=127.0.0.1
python -m vllm.entrypoints.openai.api_server \
--model /mnt/nvme1n1/wy/models/Qwen3-30B-A3B \
--tensor-parallel-size 2 \
--enable-expert-parallel \
--data-parallel-size 4 \
--gpu-memory-utilization 0.9 \
--max-num-batched-tokens 32768 \
--data-parallel-size-local 4 \
--max-num-seqs=200 \
--block-size 128 \
--max-model-len 6656 \
--trust-remote-code \
--disable-log-requests \
--served-model-name qwen \
--no-enable-prefix-caching \
--additional-config '{"xlite_graph_config": {"enabled": true, "full_mode": true}, "enable_cpu_binding": true}' \
--compilation-config '{"cudagraph_capture_sizes":[1, 16, 32, 48, 64, 100, 150, 200], "cudagraph_mode": "FULL_DECODE_ONLY"}' \
--async-scheduling \
--host ${ip} \
--port ${port} > ${log} 2>&1 &
```
Test config:
```shell
vllm bench serve \
--max-concurrency ${maxconcurrency} \
--num-prompts ${num_prompts} \
--host ${HOST} \
--port ${PORT} \
--model ${MODEL_NAME} \
--dataset-name random \
--backend openai-chat \
--random-input-len 512 \
--random-output-len 512 \
--random-range-ratio 0.2 \
--temperature 0.6 \
--metric-percentiles "50,90,99" \
--tokenizer ${TOKENIZER_PATH} \
--endpoint /v1/chat/completions \
--ignore-eos
```
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
- vLLM version: v0.16.0
- vLLM main:
c86cdcbcd2
Signed-off-by: uuzWY <Ethan.wangyuan@huawei.com>
Co-authored-by: uuzWY <Ethan.wangyuan@huawei.com>
This commit is contained in:
@@ -21,5 +21,5 @@ pytest_mock
|
|||||||
msserviceprofiler>=1.2.2
|
msserviceprofiler>=1.2.2
|
||||||
mindstudio-probe>=8.3.0
|
mindstudio-probe>=8.3.0
|
||||||
arctic-inference==0.1.1
|
arctic-inference==0.1.1
|
||||||
xlite==0.1.0rc1
|
xlite==0.1.0rc3
|
||||||
uc-manager
|
uc-manager
|
||||||
|
|||||||
@@ -224,11 +224,15 @@ class NPUPlatform(Platform):
|
|||||||
|
|
||||||
from vllm.config.compilation import CUDAGraphMode
|
from vllm.config.compilation import CUDAGraphMode
|
||||||
|
|
||||||
if ascend_config.xlite_graph_config.enabled and ascend_config.xlite_graph_config.full_mode:
|
if ascend_config.xlite_graph_config.enabled:
|
||||||
|
if ascend_config.xlite_graph_config.full_mode:
|
||||||
logger.info("ACLGraph is disabled under xlite full mode")
|
logger.info("ACLGraph is disabled under xlite full mode")
|
||||||
enforce_eager = True
|
enforce_eager = True
|
||||||
model_config.enforce_eager = True
|
model_config.enforce_eager = True
|
||||||
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
|
compilation_config.cudagraph_mode = CUDAGraphMode.NONE
|
||||||
|
else:
|
||||||
|
logger.info("Falling back to FULL_DECODE_ONLY under xlite decode-only mode")
|
||||||
|
compilation_config.cudagraph_mode = CUDAGraphMode.FULL_DECODE_ONLY
|
||||||
|
|
||||||
if enforce_eager:
|
if enforce_eager:
|
||||||
logger.info("Compilation disabled, using eager mode by default")
|
logger.info("Compilation disabled, using eager mode by default")
|
||||||
|
|||||||
@@ -2169,6 +2169,20 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
spec_decode_common_attn_metadata = spec_decode_common_attn_metadata.unpadded(num_tokens, num_reqs)
|
spec_decode_common_attn_metadata = spec_decode_common_attn_metadata.unpadded(num_tokens, num_reqs)
|
||||||
return attn_metadata, spec_decode_common_attn_metadata
|
return attn_metadata, spec_decode_common_attn_metadata
|
||||||
|
|
||||||
|
def _should_build_dummy_attn_metadata(
|
||||||
|
self,
|
||||||
|
force_attention: bool = False,
|
||||||
|
is_profile: bool = False,
|
||||||
|
cudagraph_runtime_mode: CUDAGraphMode | None = None,
|
||||||
|
) -> bool:
|
||||||
|
"""
|
||||||
|
Determine whether attention metadata should be built during dummy_run.
|
||||||
|
SubClass can override this to add custom conditions.
|
||||||
|
"""
|
||||||
|
# If force_attention is True, we always capture attention, Otherwise,
|
||||||
|
# it only happens for cudagraph_runtime_mode=FULL.
|
||||||
|
return force_attention or cudagraph_runtime_mode == CUDAGraphMode.FULL
|
||||||
|
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def _dummy_run(
|
def _dummy_run(
|
||||||
self,
|
self,
|
||||||
@@ -2272,9 +2286,8 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
# vllm-ascend does not support ubatch now
|
# vllm-ascend does not support ubatch now
|
||||||
ubatch_slices, ubatch_slices_padded = None, None
|
ubatch_slices, ubatch_slices_padded = None, None
|
||||||
attn_metadata: PerLayerAttnMetadata | None = None
|
attn_metadata: PerLayerAttnMetadata | None = None
|
||||||
# If force_attention is True, we always capture attention. Otherwise,
|
# Build attention metadata for dummy_run
|
||||||
# it only happens for cudagraph_runtime_mode=FULL.
|
if self._should_build_dummy_attn_metadata(force_attention, is_profile, cudagraph_runtime_mode):
|
||||||
if force_attention or cudagraph_runtime_mode == CUDAGraphMode.FULL:
|
|
||||||
if create_mixed_batch:
|
if create_mixed_batch:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
"create_mixed_batch is used for warmup deepgemm, vllm-ascend does not need it"
|
"create_mixed_batch is used for warmup deepgemm, vllm-ascend does not need it"
|
||||||
|
|||||||
@@ -157,9 +157,6 @@ class LlamaXliteModel(XliteModel):
|
|||||||
|
|
||||||
class QwenMoeXliteModel(LlamaXliteModel):
|
class QwenMoeXliteModel(LlamaXliteModel):
|
||||||
def initialize(self, runnable: nn.Module, vllm_config: VllmConfig) -> tuple[Model, int, int, torch.dtype]:
|
def initialize(self, runnable: nn.Module, vllm_config: VllmConfig) -> tuple[Model, int, int, torch.dtype]:
|
||||||
if envs_ascend.VLLM_ASCEND_ENABLE_NZ == 2:
|
|
||||||
architecture = vllm_config.model_config.architectures[0]
|
|
||||||
raise ValueError(f"{architecture} not support VLLM_ASCEND_ENABLE_NZ = 2!")
|
|
||||||
dtype = vllm_config.model_config.dtype
|
dtype = vllm_config.model_config.dtype
|
||||||
config = self._build_model_config(vllm_config)
|
config = self._build_model_config(vllm_config)
|
||||||
xlite_model = self._build_model(runnable, vllm_config, config)
|
xlite_model = self._build_model(runnable, vllm_config, config)
|
||||||
@@ -174,7 +171,6 @@ class QwenMoeXliteModel(LlamaXliteModel):
|
|||||||
config = super()._build_model_config(vllm_config)
|
config = super()._build_model_config(vllm_config)
|
||||||
hf_config = vllm_config.model_config.hf_text_config
|
hf_config = vllm_config.model_config.hf_text_config
|
||||||
ep_group = get_ep_group()
|
ep_group = get_ep_group()
|
||||||
config.n_layers = hf_config.max_window_layers
|
|
||||||
config.n_dense_layers = 0
|
config.n_dense_layers = 0
|
||||||
config.n_routed_experts = hf_config.num_experts
|
config.n_routed_experts = hf_config.num_experts
|
||||||
config.n_shared_experts = 0
|
config.n_shared_experts = 0
|
||||||
@@ -229,9 +225,8 @@ class XliteWrapper:
|
|||||||
|
|
||||||
rank = torch.distributed.get_rank()
|
rank = torch.distributed.get_rank()
|
||||||
local_rank = get_world_group().local_rank
|
local_rank = get_world_group().local_rank
|
||||||
self.xlite_rt = Runtime(
|
self.data_parallel_size = vllm_config.parallel_config.data_parallel_size
|
||||||
local_rank, 0, rank, get_tensor_model_parallel_world_size(), vllm_config.parallel_config.data_parallel_size
|
self.xlite_rt = Runtime(local_rank, 0, rank, get_tensor_model_parallel_world_size(), self.data_parallel_size)
|
||||||
)
|
|
||||||
|
|
||||||
(self.xlite_model, self.freq_cis, hidden_size, dtype) = xlite_model_init(runnable, vllm_config)
|
(self.xlite_model, self.freq_cis, hidden_size, dtype) = xlite_model_init(runnable, vllm_config)
|
||||||
|
|
||||||
@@ -278,7 +273,16 @@ class XliteWrapper:
|
|||||||
AscendAttentionState.SpecDecoding,
|
AscendAttentionState.SpecDecoding,
|
||||||
]
|
]
|
||||||
|
|
||||||
if not with_prefill or self.full_mode:
|
# Full: graph for prefill and decode
|
||||||
|
# Decode-Only: runnable for prefill, graph for decode
|
||||||
|
if not self.full_mode and self.data_parallel_size > 1:
|
||||||
|
num_tokens = forward_context.batch_descriptor.num_tokens
|
||||||
|
num_reqs = forward_context.batch_descriptor.num_reqs
|
||||||
|
use_xlite_graph = num_reqs is not None and num_tokens <= num_reqs
|
||||||
|
else:
|
||||||
|
use_xlite_graph = not with_prefill or self.full_mode
|
||||||
|
|
||||||
|
if use_xlite_graph:
|
||||||
# TODO: When vllm_ascend enables graph mode, attn_metadata.num_decodes
|
# TODO: When vllm_ascend enables graph mode, attn_metadata.num_decodes
|
||||||
# will be padded in decode requests. Therefore, it is first fixed using
|
# will be padded in decode requests. Therefore, it is first fixed using
|
||||||
# num_decode_tokens. However, in the future, when MTP is enabled, there
|
# num_decode_tokens. However, in the future, when MTP is enabled, there
|
||||||
@@ -299,7 +303,10 @@ class XliteWrapper:
|
|||||||
xlite_attn_metadata.is_prefills = [False] * num_decodes + [True] * num_prefills
|
xlite_attn_metadata.is_prefills = [False] * num_decodes + [True] * num_prefills
|
||||||
xlite_attn_metadata.block_tables = attn_metadata.block_tables.cpu().tolist()
|
xlite_attn_metadata.block_tables = attn_metadata.block_tables.cpu().tolist()
|
||||||
|
|
||||||
h = self.hidden_states[: attn_metadata.num_actual_tokens]
|
# Compatibility between DP and Non-DP scenarios
|
||||||
|
num_tokens = forward_context.batch_descriptor.num_tokens
|
||||||
|
num_actual_tokens = attn_metadata.num_actual_tokens
|
||||||
|
h = self.hidden_states[:num_tokens]
|
||||||
stream = torch.npu.current_stream().npu_stream
|
stream = torch.npu.current_stream().npu_stream
|
||||||
if inputs_embeds is None:
|
if inputs_embeds is None:
|
||||||
self.xlite_model.forward(
|
self.xlite_model.forward(
|
||||||
@@ -309,6 +316,6 @@ class XliteWrapper:
|
|||||||
self.xlite_model.forward_with_inputs_embeds(
|
self.xlite_model.forward_with_inputs_embeds(
|
||||||
self.xlite_rt, inputs_embeds, xlite_attn_metadata, self.kv_caches, self.freq_cis, h, stream
|
self.xlite_rt, inputs_embeds, xlite_attn_metadata, self.kv_caches, self.freq_cis, h, stream
|
||||||
)
|
)
|
||||||
return h
|
return h[:num_actual_tokens]
|
||||||
else:
|
else:
|
||||||
return self.runnable(input_ids, positions, intermediate_tensors, inputs_embeds)
|
return self.runnable(input_ids, positions, intermediate_tensors, inputs_embeds)
|
||||||
|
|||||||
@@ -17,6 +17,7 @@
|
|||||||
# Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py
|
# Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py
|
||||||
# isort: skip_file
|
# isort: skip_file
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
|
from vllm.config import CUDAGraphMode
|
||||||
from vllm.v1.kv_cache_interface import KVCacheConfig
|
from vllm.v1.kv_cache_interface import KVCacheConfig
|
||||||
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
||||||
|
|
||||||
@@ -34,3 +35,18 @@ class XliteModelRunner(NPUModelRunner):
|
|||||||
def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
|
def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
|
||||||
super().initialize_kv_cache(kv_cache_config)
|
super().initialize_kv_cache(kv_cache_config)
|
||||||
self.model.register_kv_caches(self.kv_caches)
|
self.model.register_kv_caches(self.kv_caches)
|
||||||
|
|
||||||
|
def _should_build_dummy_attn_metadata(
|
||||||
|
self,
|
||||||
|
force_attention: bool = False,
|
||||||
|
is_profile: bool = False,
|
||||||
|
cudagraph_runtime_mode: CUDAGraphMode | None = None,
|
||||||
|
) -> bool:
|
||||||
|
"""
|
||||||
|
Override to build attention metadata during dummy_run when xlite is enable.
|
||||||
|
For xlite, we need to build metadata during DP dummy_run to ensure all ranks
|
||||||
|
have consistent metadata, even when some ranks have no requests.
|
||||||
|
"""
|
||||||
|
base_condition = super()._should_build_dummy_attn_metadata(force_attention, is_profile, cudagraph_runtime_mode)
|
||||||
|
xlite_condition = self.ascend_config.xlite_graph_config.enabled and not is_profile
|
||||||
|
return base_condition or xlite_condition
|
||||||
|
|||||||
Reference in New Issue
Block a user