From 30e3d86b0f49c68352f24b4ac8da2988a2f1d7fc Mon Sep 17 00:00:00 2001 From: xuyexiong Date: Fri, 17 Oct 2025 09:42:48 +0800 Subject: [PATCH] Revert "[BUGFIX] Mtp torchair pd fix (#3449)" (#3500) This reverts commit b0ae203e72d87985314d583e211dddca6f351958. ### What this PR does / why we need it? The fix is not ready yet and conflicts with #3411, so it needs to be reverted first. The issue will be fixed in a later change. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? Signed-off-by: xuyexiong --- vllm_ascend/torchair/torchair_model_runner.py | 29 +++---------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/vllm_ascend/torchair/torchair_model_runner.py b/vllm_ascend/torchair/torchair_model_runner.py index e6ef03e..b29fdc2 100644 --- a/vllm_ascend/torchair/torchair_model_runner.py +++ b/vllm_ascend/torchair/torchair_model_runner.py @@ -452,31 +452,10 @@ class NPUTorchairModelRunner(NPUModelRunner): self.torchair_graph_batch_sizes.append(self.max_num_reqs) # padded max number tokens = max_num_req * decode_token_per_req - if self.decode_token_per_req > 1: - # pd disaggregation scenario need redundant_batch_sizes to avoid each batch's seq_len exceed 16 tokens - if self.is_kv_consumer: - FIA_SEQ_LEN_LIMIT = 16 - self.torchair_graph_batch_sizes = [ - (graph_batch_size + - math.ceil(graph_batch_size / FIA_SEQ_LEN_LIMIT) + - math.ceil(graph_batch_size * self.decode_token_per_req / - FIA_SEQ_LEN_LIMIT / FIA_SEQ_LEN_LIMIT)) * - self.decode_token_per_req - for graph_batch_size in self.torchair_graph_batch_sizes - ] - new_max_num_reqs = math.ceil( - max(self.torchair_graph_batch_sizes) / - self.decode_token_per_req) - if self.max_num_reqs < new_max_num_reqs: - logger.warning( - f"max_num_reqs is updated to {new_max_num_reqs}") - self.max_num_reqs = new_max_num_reqs - self.scheduler_config.max_num_seqs = new_max_num_reqs - else: - self.torchair_graph_batch_sizes = [ - graph_batch_size * self.decode_token_per_req - for graph_batch_size in 
self.torchair_graph_batch_sizes - ] + self.torchair_graph_batch_sizes = [ + graph_batch_size * self.decode_token_per_req + for graph_batch_size in self.torchair_graph_batch_sizes + ] # NOTE: when enable_expert_parallel on A3, we need to check if `graph_batch_size` is divisible by `tp_size` # Because we use x_active_mask for dispatch/combine op on A3, which requires that input shape should be same