[main][refactor] Refactoring forward_context and model_runner_v1 (#1979)

### What this PR does / why we need it?

This PR refactors `forward_context` and `model_runner_v1`: it moves context that is
needed during model inference into `forward_context`, and restructures the
`dummy_run` logic to make it more reasonable.
Details of this PR:

- Add `ascend_forward_context`;
- Update the mc2_v2 op and support the `active_mask` parameter;
- Update the scripts in the examples dir;
- Refactor the `dummy_run` logic;
- Add `soc_version` for A2 and A3.
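The `ascend_forward_context` piece follows a common vLLM pattern: install per-step state (token counts, a prefill flag, and now things such as an MC2 `active_mask`) via a context manager, so that ops deep inside the model can read it without threading extra arguments through every call. A minimal sketch of the idea, with all names illustrative rather than the actual vllm-ascend API:

```python
from contextlib import contextmanager
from dataclasses import dataclass

# Module-level slot holding the context for the current forward pass.
_forward_context = None

@dataclass
class AscendForwardContext:
    num_tokens: int        # tokens in this step's batch
    with_prefill: bool     # whether this step includes prefill requests
    active_mask: object = None  # e.g. an MC2-style active-expert mask

@contextmanager
def set_ascend_forward_context(num_tokens, with_prefill, active_mask=None):
    """Install a context for one forward pass, restoring the previous one after."""
    global _forward_context
    prev = _forward_context
    _forward_context = AscendForwardContext(num_tokens, with_prefill, active_mask)
    try:
        yield _forward_context
    finally:
        _forward_context = prev

def get_forward_context():
    """Read the context installed by the surrounding forward pass, if any."""
    return _forward_context

# Ops called inside the `with` block see the context; outside it is cleared.
with set_ascend_forward_context(num_tokens=16, with_prefill=False) as ctx:
    assert get_forward_context() is ctx
assert get_forward_context() is None
```

The restore-on-exit in `finally` is what lets `dummy_run` style warm-up passes install a throwaway context without corrupting the state of real inference steps.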

### Does this PR introduce _any_ user-facing change?

No user-facing change.

### How was this patch tested?


- vLLM version: v0.10.0
- vLLM main: 57c22e57f9

Signed-off-by: zzzzwwjj <1183291235@qq.com>
Author: zzzzwwjj
Date: 2025-07-28 14:06:20 +08:00 (committed by GitHub)
Parent: e3a2443c3a
Commit: ba3dfbd59e
22 changed files with 629 additions and 347 deletions


```diff
@@ -21,6 +21,7 @@ def main():
         tensor_parallel_size=2,
         max_model_len=4096,
         trust_remote_code=True,
+        enable_expert_parallel=True,
         additional_config={
             "torchair_graph_config": {
                 "enabled": False
@@ -28,7 +29,6 @@ def main():
             "ascend_scheduler_config": {
                 "enabled": True
             },
-            "expert_tensor_parallel_size": 1
         })
     # Generate texts from the prompts. The output is a list of RequestOutput
```
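The hunk above moves expert parallelism from a vendor-specific `additional_config` key to the standard `enable_expert_parallel` engine argument. A rough sketch of the resulting engine kwargs, built as a plain dict (the helper name is ours) so the shape can be checked without vLLM installed:

```python
# Engine kwargs mirroring the example diff above; illustrative only.
def build_example_llm_kwargs():
    return {
        "tensor_parallel_size": 2,
        "max_model_len": 4096,
        "trust_remote_code": True,
        # Expert parallelism is now a first-class engine argument ...
        "enable_expert_parallel": True,
        "additional_config": {
            "torchair_graph_config": {"enabled": False},
            "ascend_scheduler_config": {"enabled": True},
            # ... so "expert_tensor_parallel_size" no longer lives in here.
        },
    }

kwargs = build_example_llm_kwargs()
assert kwargs["enable_expert_parallel"] is True
assert "expert_tensor_parallel_size" not in kwargs["additional_config"]
```

In the real example these kwargs are passed directly to the `LLM(...)` constructor.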


```diff
@@ -1,3 +1,7 @@
+rm -rf ./.torchair_cache/
+rm -rf ./dynamo_*
+rm -rf /root/ascend/log/debug/plog/*
+export HCCL_IF_IP=2.0.0.0
 export GLOO_SOCKET_IFNAME="enp189s0f0"
 export TP_SOCKET_IFNAME="enp189s0f0"
@@ -6,25 +10,24 @@ export HCCL_SOCKET_IFNAME="enp189s0f0"
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
-export VLLM_USE_V1=0
-export ASCEND_RT_VISIBLE_DEVICES=0,1
-export VLLM_DP_SIZE=2
-export VLLM_DP_RANK=0
-export VLLM_DP_MASTER_IP="2.0.0.0"
-export VLLM_DP_MASTER_PORT=40001
-export VLLM_DP_PROXY_IP="2.0.0.0"
-export VLLM_DP_PROXY_PORT=30002
-export VLLM_DP_MONITOR_PORT=30003
-export VLLM_HTTP_PORT=20001
+export VLLM_USE_V1=1
+export ASCEND_LAUNCH_BLOCKING=0
 vllm serve /data/weights/Qwen2.5-0.5B-Instruct \
   --host 0.0.0.0 \
-  --port 20001 \
-  --tensor-parallel-size 1 \
   --seed 1024 \
+  --port 20002 \
   --served-model-name Qwen \
-  --max-model-len 2000 \
-  --max-num-batched-tokens 2000 \
+  --data-parallel-size 4 \
+  --data-parallel-size-local 4 \
+  --data-parallel-address 2.0.0.0 \
+  --data-parallel-rpc-port 13389 \
+  --tensor-parallel-size 4 \
+  --enable-expert-parallel \
+  --no-enable-prefix-caching \
+  --max-num-seqs 16 \
+  --max-model-len 4096 \
+  --max-num-batched-tokens 4096 \
-  --gpu-memory-utilization 0.9 \
   --trust-remote-code \
+  --gpu-memory-utilization 0.9 \
   --enforce-eager \
   --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":false, "enable_multistream_moe":false, "use_cached_graph":false}}'
```
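As a sanity check on the updated serve flags: data-parallel size 4 combined with tensor-parallel size 4 implies 16 NPUs in total, assuming the usual world size = DP × TP relation:

```python
# Device count implied by the serve command above: each of the 4 DP ranks
# owns its own 4-way tensor-parallel group.
data_parallel_size = 4
tensor_parallel_size = 4
world_size = data_parallel_size * tensor_parallel_size
assert world_size == 16  # total NPUs the example script expects
print(world_size)
```

This is also why the old `ASCEND_RT_VISIBLE_DEVICES=0,1` setting no longer fits the example: two visible devices cannot host a 4×4 layout.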