diff --git a/requirements-dev.txt b/requirements-dev.txt
index abc2ca09..e6193c50 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -21,5 +21,5 @@ pytest_mock
 msserviceprofiler>=1.2.2
 mindstudio-probe>=8.3.0
 arctic-inference==0.1.1
-xlite==0.1.0rc1
+xlite==0.1.0rc3
 uc-manager
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index 2d161a42..78046e3a 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -224,11 +224,15 @@ class NPUPlatform(Platform):
 
         from vllm.config.compilation import CUDAGraphMode
 
-        if ascend_config.xlite_graph_config.enabled and ascend_config.xlite_graph_config.full_mode:
-            logger.info("ACLGraph is disabled under xlite full mode")
-            enforce_eager = True
-            model_config.enforce_eager = True
-            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
+        if ascend_config.xlite_graph_config.enabled:
+            if ascend_config.xlite_graph_config.full_mode:
+                logger.info("ACLGraph is disabled under xlite full mode")
+                enforce_eager = True
+                model_config.enforce_eager = True
+                compilation_config.cudagraph_mode = CUDAGraphMode.NONE
+            else:
+                logger.info("Falling back to FULL_DECODE_ONLY under xlite decode-only mode")
+                compilation_config.cudagraph_mode = CUDAGraphMode.FULL_DECODE_ONLY
 
         if enforce_eager:
             logger.info("Compilation disabled, using eager mode by default")
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 51018023..a2e49373 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -2169,6 +2169,20 @@ class NPUModelRunner(GPUModelRunner):
             spec_decode_common_attn_metadata = spec_decode_common_attn_metadata.unpadded(num_tokens, num_reqs)
         return attn_metadata, spec_decode_common_attn_metadata
 
+    def _should_build_dummy_attn_metadata(
+        self,
+        force_attention: bool = False,
+        is_profile: bool = False,
+        cudagraph_runtime_mode: CUDAGraphMode | None = None,
+    ) -> bool:
+        """
+        Determine whether attention metadata should be built during dummy_run.
+        Subclasses can override this to add custom conditions.
+        """
+        # If force_attention is True, we always capture attention. Otherwise,
+        # it only happens for cudagraph_runtime_mode=FULL.
+        return force_attention or cudagraph_runtime_mode == CUDAGraphMode.FULL
+
     @torch.inference_mode()
     def _dummy_run(
         self,
@@ -2272,9 +2286,8 @@ class NPUModelRunner(GPUModelRunner):
         # vllm-ascend does not support ubatch now
         ubatch_slices, ubatch_slices_padded = None, None
         attn_metadata: PerLayerAttnMetadata | None = None
-        # If force_attention is True, we always capture attention. Otherwise,
-        # it only happens for cudagraph_runtime_mode=FULL.
-        if force_attention or cudagraph_runtime_mode == CUDAGraphMode.FULL:
+        # Build attention metadata for dummy_run
+        if self._should_build_dummy_attn_metadata(force_attention, is_profile, cudagraph_runtime_mode):
             if create_mixed_batch:
                 raise NotImplementedError(
                     "create_mixed_batch is used for warmup deepgemm, vllm-ascend does not need it"
diff --git a/vllm_ascend/xlite/xlite.py b/vllm_ascend/xlite/xlite.py
index 6a62b250..cd7423c9 100644
--- a/vllm_ascend/xlite/xlite.py
+++ b/vllm_ascend/xlite/xlite.py
@@ -157,9 +157,6 @@ class LlamaXliteModel(XliteModel):
 
 class QwenMoeXliteModel(LlamaXliteModel):
     def initialize(self, runnable: nn.Module, vllm_config: VllmConfig) -> tuple[Model, int, int, torch.dtype]:
-        if envs_ascend.VLLM_ASCEND_ENABLE_NZ == 2:
-            architecture = vllm_config.model_config.architectures[0]
-            raise ValueError(f"{architecture} not support VLLM_ASCEND_ENABLE_NZ = 2!")
         dtype = vllm_config.model_config.dtype
         config = self._build_model_config(vllm_config)
         xlite_model = self._build_model(runnable, vllm_config, config)
@@ -174,7 +171,6 @@ class QwenMoeXliteModel(LlamaXliteModel):
         config = super()._build_model_config(vllm_config)
         hf_config = vllm_config.model_config.hf_text_config
         ep_group = get_ep_group()
-        config.n_layers = hf_config.max_window_layers
         config.n_dense_layers = 0
         config.n_routed_experts = hf_config.num_experts
         config.n_shared_experts = 0
@@ -229,9 +225,8 @@ class XliteWrapper:
         rank = torch.distributed.get_rank()
         local_rank = get_world_group().local_rank
 
-        self.xlite_rt = Runtime(
-            local_rank, 0, rank, get_tensor_model_parallel_world_size(), vllm_config.parallel_config.data_parallel_size
-        )
+        self.data_parallel_size = vllm_config.parallel_config.data_parallel_size
+        self.xlite_rt = Runtime(local_rank, 0, rank, get_tensor_model_parallel_world_size(), self.data_parallel_size)
 
         (self.xlite_model, self.freq_cis, hidden_size, dtype) = xlite_model_init(runnable, vllm_config)
@@ -278,7 +273,16 @@ class XliteWrapper:
             AscendAttentionState.SpecDecoding,
         ]
 
-        if not with_prefill or self.full_mode:
+        # Full mode: xlite graph for both prefill and decode
+        # Decode-only mode: runnable for prefill, xlite graph for decode
+        if not self.full_mode and self.data_parallel_size > 1:
+            num_tokens = forward_context.batch_descriptor.num_tokens
+            num_reqs = forward_context.batch_descriptor.num_reqs
+            use_xlite_graph = num_reqs is not None and num_tokens <= num_reqs
+        else:
+            use_xlite_graph = not with_prefill or self.full_mode
+
+        if use_xlite_graph:
             # TODO: When vllm_ascend enables graph mode, attn_metadata.num_decodes
             # will be padded in decode requests. Therefore, it is first fixed using
             # num_decode_tokens. However, in the future, when MTP is enabled, there
@@ -299,7 +303,10 @@ class XliteWrapper:
             xlite_attn_metadata.is_prefills = [False] * num_decodes + [True] * num_prefills
             xlite_attn_metadata.block_tables = attn_metadata.block_tables.cpu().tolist()
 
-            h = self.hidden_states[: attn_metadata.num_actual_tokens]
+            # Compatibility between DP and non-DP scenarios
+            num_tokens = forward_context.batch_descriptor.num_tokens
+            num_actual_tokens = attn_metadata.num_actual_tokens
+            h = self.hidden_states[:num_tokens]
             stream = torch.npu.current_stream().npu_stream
             if inputs_embeds is None:
                 self.xlite_model.forward(
@@ -309,6 +316,6 @@ class XliteWrapper:
                     xlite_attn_metadata, self.kv_caches, self.freq_cis, h, stream
                 )
             else:
                 self.xlite_model.forward_with_inputs_embeds(
                     self.xlite_rt, inputs_embeds, xlite_attn_metadata, self.kv_caches, self.freq_cis, h, stream
                 )
-            return h
+            return h[:num_actual_tokens]
         else:
             return self.runnable(input_ids, positions, intermediate_tensors, inputs_embeds)
diff --git a/vllm_ascend/xlite/xlite_model_runner.py b/vllm_ascend/xlite/xlite_model_runner.py
index 44fc7ed0..c7322bc2 100644
--- a/vllm_ascend/xlite/xlite_model_runner.py
+++ b/vllm_ascend/xlite/xlite_model_runner.py
@@ -17,6 +17,7 @@
 # Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py
 # isort: skip_file
 import torch.nn as nn
+from vllm.config import CUDAGraphMode
 from vllm.v1.kv_cache_interface import KVCacheConfig
 
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
@@ -34,3 +35,18 @@ class XliteModelRunner(NPUModelRunner):
     def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         super().initialize_kv_cache(kv_cache_config)
         self.model.register_kv_caches(self.kv_caches)
+
+    def _should_build_dummy_attn_metadata(
+        self,
+        force_attention: bool = False,
+        is_profile: bool = False,
+        cudagraph_runtime_mode: CUDAGraphMode | None = None,
+    ) -> bool:
+        """
+        Override to build attention metadata during dummy_run when xlite is enabled.
+        For xlite, we need to build metadata during DP dummy_run to ensure all ranks
+        have consistent metadata, even when some ranks have no requests.
+        """
+        base_condition = super()._should_build_dummy_attn_metadata(force_attention, is_profile, cudagraph_runtime_mode)
+        xlite_condition = self.ascend_config.xlite_graph_config.enabled and not is_profile
+        return base_condition or xlite_condition
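
For reference, the graph/eager dispatch that the `XliteWrapper.forward` hunk introduces can be read as a standalone predicate. The sketch below is illustrative only: `BatchDescriptor` and `should_use_xlite_graph` are hypothetical names, with `BatchDescriptor` standing in for `forward_context.batch_descriptor`; neither is part of vllm-ascend.

```python
from dataclasses import dataclass


@dataclass
class BatchDescriptor:
    # Hypothetical stand-in for forward_context.batch_descriptor.
    num_tokens: int
    num_reqs: int | None


def should_use_xlite_graph(
    full_mode: bool,
    with_prefill: bool,
    data_parallel_size: int,
    batch: BatchDescriptor,
) -> bool:
    """Sketch of the dispatch rule added in XliteWrapper.forward.

    - Full mode: always run the xlite graph (prefill and decode).
    - Decode-only mode with DP > 1: run the graph only when the batch looks
      like pure decode (at most one token per request), judged from the batch
      descriptor so every DP rank reaches the same decision even when a rank
      has no local requests.
    - Otherwise: keep the original with_prefill check.
    """
    if not full_mode and data_parallel_size > 1:
        return batch.num_reqs is not None and batch.num_tokens <= batch.num_reqs
    return not with_prefill or full_mode
```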
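
Similarly, the `h = self.hidden_states[:num_tokens]` / `return h[:num_actual_tokens]` change separates the (possibly DP-padded) token count used to size the graph's output region from the rank-local count the runner consumes. A minimal sketch of that reading, with hypothetical function and parameter names:

```python
import torch


def trim_xlite_output(
    hidden_states: torch.Tensor,
    num_padded_tokens: int,
    num_actual_tokens: int,
) -> torch.Tensor:
    """Illustrative only. Under data parallelism, batch_descriptor.num_tokens
    may be padded so all DP ranks launch the same shape; the xlite graph fills
    that padded prefix, while the caller should only see the rank's real
    tokens (attn_metadata.num_actual_tokens)."""
    h = hidden_states[:num_padded_tokens]  # region written by the xlite graph
    return h[:num_actual_tokens]  # slice consumed by the model runner
```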