diff --git a/vllm_ascend/attention/sfa_v1.py b/vllm_ascend/attention/sfa_v1.py index cbf5833b..8f14aa3d 100644 --- a/vllm_ascend/attention/sfa_v1.py +++ b/vllm_ascend/attention/sfa_v1.py @@ -170,9 +170,9 @@ class AscendSFAMetadataBuilder: input_positions = common_attn_metadata.positions[: num_input_tokens].long( ) - query_start_loc = common_attn_metadata.query_start_loc - query_lens = query_start_loc[1:] - query_start_loc[:-1] - has_prefill = any(query_lens > self.decode_threshold) + query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu + query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] + has_prefill = any(query_lens_cpu > self.decode_threshold) if self.cos_cache is None: self.cos_cache = model.model.layers[ diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py index 8ff325ff..caf3f601 100644 --- a/vllm_ascend/spec_decode/mtp_proposer.py +++ b/vllm_ascend/spec_decode/mtp_proposer.py @@ -233,7 +233,13 @@ class MtpProposer(Proposer): num_tokens_across_dp, with_prefill, ) = self.runner._sync_metadata_across_dp(num_tokens, with_prefill) - + if self.use_async_scheduling: + # there is synchronization between mtp steps when enabling aclgraph, + # disable aclgraph when use async scheduling to avoid the + # synchronization overhead. + # NOTE: we need to set aclgraph_runtime_mode to CUDAGraphMode.NONE in + # both dummy_run and _propose. 
+ aclgraph_runtime_mode = CUDAGraphMode.NONE moe_comm_type = self.runner._select_moe_comm_method(num_tokens) # TODO: remove this after moe_comm_type selection logic is finalized moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type @@ -742,9 +748,11 @@ class MtpProposer(Proposer): aclgraph_runtime_mode, batch_descriptor = \ self.runner.aclgraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora) if self.use_async_scheduling: - # there is synchronize between mtp steps when enable aclgraph, + # there is synchronization between mtp steps when enabling aclgraph, # disable aclgraph when use async scheduling to avoid the - # synchronize overhead. + # synchronization overhead. + # NOTE: we need to set aclgraph_runtime_mode to CUDAGraphMode.NONE in + # both dummy_run and _propose. aclgraph_runtime_mode = CUDAGraphMode.NONE if self.vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs( diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index bfb9a510..705634db 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2923,9 +2923,10 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin): cu_num_tokens, arange = self._get_cumsum_and_arange( num_scheduled_tokens) - self.query_start_loc[1:num_reqs + 1] = torch.Tensor(cu_num_tokens) self.query_start_loc_cpu[1:num_reqs + 1] = torch.Tensor(cu_num_tokens) + self.query_start_loc = self.query_start_loc_cpu.pin_memory().to( + self.device, non_blocking=True) self.query_lens = torch.from_numpy(num_scheduled_tokens) self.attn_mask = self.attn_mask_builder.get_splitfuse_attn_mask()