diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py
index 7c7242ef..243dede6 100644
--- a/vllm_ascend/ascend_forward_context.py
+++ b/vllm_ascend/ascend_forward_context.py
@@ -233,11 +233,12 @@ def select_moe_comm_method(num_tokens: int, vllm_config: VllmConfig, is_draft_mo
     if not vllm_config.parallel_config.enable_expert_parallel or get_ep_group().world_size == 1:
         moe_comm_type = MoECommType.ALLGATHER
     elif soc_version in {AscendDeviceType.A2}:
-        if (
-            num_tokens <= mc2_tokens_capacity
-            and vllm_config.parallel_config.world_size_across_dp / vllm_config.parallel_config.pipeline_parallel_size
-            >= 16
-        ):
+        num_experts = vllm_config.model_config.get_num_experts()
+        ep_world_size = (
+            vllm_config.parallel_config.world_size_across_dp // vllm_config.parallel_config.pipeline_parallel_size
+        )
+        num_experts_per_device = num_experts // ep_world_size
+        if num_experts_per_device <= 24 and ep_world_size >= 16 and num_tokens <= mc2_tokens_capacity:
             moe_comm_type = MoECommType.MC2
         else:
             moe_comm_type = MoECommType.ALLGATHER
diff --git a/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_layerwise_connector.py b/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_layerwise_connector.py
index ea82322b..f4de1323 100644
--- a/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_layerwise_connector.py
+++ b/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_layerwise_connector.py
@@ -1493,6 +1493,9 @@ class MooncakeLayerwiseConnectorWorker:
     ) -> None:
         """MooncakeLayerwiseConnector does not save explicitly."""
         if self.vllm_config.kv_transfer_config.is_kv_producer and connector_metadata.requests.keys():
+            if self.current_layer >= self.total_layers:
+                self.current_layer += 1
+                return
             # get reshape and cache event
             if layer_name == "":
                 layer_name = self.index_to_name[self.current_layer][0]
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 8fc168ac..f2ab3074 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -831,6 +831,7 @@ class NPUModelRunner(GPUModelRunner):
         num_draft_tokens = np.zeros(num_reqs, dtype=np.int32)
         # For chunked prefills, use -1 as mask rather than 0, as guided
         # decoding may rollback speculative tokens.
+        new_schedule_reqs = [x.req_id for x in scheduler_output.scheduled_new_reqs]
         num_decode_draft_tokens = np.full(num_reqs, -1, dtype=np.int32)
         for (
             req_id,
@@ -838,13 +839,12 @@ class NPUModelRunner(GPUModelRunner):
         ) in scheduler_output.scheduled_spec_decode_tokens.items():
             req_idx = self.input_batch.req_id_to_index[req_id]
             num_draft_tokens[req_idx] = len(draft_token_ids)
-            num_decode_draft_tokens[req_idx] = (
-                len(draft_token_ids)
-                if (
-                    self.input_batch.num_computed_tokens_cpu[req_idx] >= self.input_batch.num_prompt_tokens[req_idx]
-                )
-                else -1
-            )
+            if (self.is_kv_consumer and req_id in new_schedule_reqs) or \
+               (self.input_batch.num_computed_tokens_cpu[req_idx] >= \
+                self.input_batch.num_prompt_tokens[req_idx]):
+                num_decode_draft_tokens[req_idx] = len(draft_token_ids)
+            else:
+                num_decode_draft_tokens[req_idx] = -1
 
         spec_decode_metadata = self._calc_spec_decode_metadata(
             num_draft_tokens,
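
Review note on the ascend_forward_context.py hunk: the old check compared the raw EP world size against 16; the new code additionally requires a small per-device expert count before choosing MC2. The sketch below restates the new predicate as a standalone function so it can be exercised in isolation. use_mc2 and all the numeric inputs are illustrative stand-ins, not part of the codebase; the real code reads these fields from VllmConfig.

def use_mc2(num_tokens: int, mc2_tokens_capacity: int, num_experts: int,
            world_size_across_dp: int, pipeline_parallel_size: int) -> bool:
    # Expert-parallel world size: DP-wide ranks divided by PP stages
    # (integer division, matching the // introduced by the diff).
    ep_world_size = world_size_across_dp // pipeline_parallel_size
    num_experts_per_device = num_experts // ep_world_size
    # MC2 is selected only for small per-device expert counts on a large
    # enough EP group, and only while the batch fits the MC2 capacity.
    return (num_experts_per_device <= 24
            and ep_world_size >= 16
            and num_tokens <= mc2_tokens_capacity)

# Example: 256 experts sharded over 32 EP ranks -> 8 experts per device.
assert use_mc2(num_tokens=512, mc2_tokens_capacity=1024, num_experts=256,
               world_size_across_dp=32, pipeline_parallel_size=1)
# Example: 64 experts over 8 ranks -> EP group too small, fall back to
# ALLGATHER regardless of token count.
assert not use_mc2(num_tokens=512, mc2_tokens_capacity=1024, num_experts=64,
                   world_size_across_dp=8, pipeline_parallel_size=1)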
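
On the mooncake_layerwise_connector.py hunk: the added guard keeps the save callback from indexing index_to_name out of range when it fires more times than there are KV-cache layers. A toy reproduction of the guard, with a hypothetical ToyLayerwiseWorker standing in for MooncakeLayerwiseConnectorWorker and made-up layer names:

class ToyLayerwiseWorker:
    def __init__(self, layer_names: list[str]) -> None:
        self.index_to_name = dict(enumerate(layer_names))
        self.total_layers = len(layer_names)
        self.current_layer = 0

    def save_kv_layer(self, layer_name: str = "") -> None:
        # Guard from the diff: invocations past the last KV layer just
        # advance the counter and return instead of raising KeyError.
        if self.current_layer >= self.total_layers:
            self.current_layer += 1
            return
        if layer_name == "":
            layer_name = self.index_to_name[self.current_layer]
        print(f"sending layer {self.current_layer}: {layer_name}")
        self.current_layer += 1

worker = ToyLayerwiseWorker(["layers.0.attn", "layers.1.attn"])
for _ in range(4):          # two calls beyond total_layers
    worker.save_kv_layer()  # the extra calls are harmless no-ops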
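
On the model_runner_v1.py hunks: the masking rule for num_decode_draft_tokens is widened so that, besides requests that have finished their prompt, a newly scheduled request on a KV-consumer rank also counts as decode-phase (its prefill presumably already ran on the producer side, so num_computed_tokens has not caught up locally). A detached sketch of that rule; mask_decode_draft_tokens and every input below are hypothetical stand-ins for the InputBatch state, not the runner's actual API.

import numpy as np

def mask_decode_draft_tokens(
    spec_tokens: dict[str, list[int]],
    req_index: dict[str, int],
    num_computed: np.ndarray,
    num_prompt: np.ndarray,
    is_kv_consumer: bool,
    new_schedule_reqs: set[str],
) -> np.ndarray:
    # Default -1 masks requests still in chunked prefill, matching the
    # np.full(num_reqs, -1, ...) initialisation in the diff.
    out = np.full(len(req_index), -1, dtype=np.int32)
    for req_id, draft_ids in spec_tokens.items():
        idx = req_index[req_id]
        # New clause first: a just-scheduled request on a KV consumer
        # qualifies even though its local computed-token count is low.
        if (is_kv_consumer and req_id in new_schedule_reqs) or \
                num_computed[idx] >= num_prompt[idx]:
            out[idx] = len(draft_ids)
    return out

# "a" is mid-prefill (stays masked); "b" was just scheduled on a KV
# consumer, so its one draft token is counted despite num_computed == 0.
masked = mask_decode_draft_tokens(
    spec_tokens={"a": [1, 2], "b": [3]},
    req_index={"a": 0, "b": 1},
    num_computed=np.array([4, 0]),
    num_prompt=np.array([10, 8]),
    is_kv_consumer=True,
    new_schedule_reqs={"b"},
)
assert masked.tolist() == [-1, 1]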