[refactor] refactor deepseek-related files (#2849)

### What this PR does / why we need it? This PR deletes ~2K lines of code about deepseek modeling. It falls back CustomDeepseekV2 modules to original vllm implementations and adapts some modifications in vllm about deepseek and moe. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? E2E vllm serving with torchair graph mode and eager mode. - vLLM version: v0.10.2 - vLLM main: 759ef49b15 --------- Signed-off-by: linfeng-yuan <1102311262@qq.com> Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com> Co-authored-by: yiz-liu <136800916+yiz-liu@users.noreply.github.com> Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-09-16 14:13:07 +08:00
parent 18ca7861f6
commit 1c5900327b
18 changed files with 295 additions and 1899 deletions
--- a/tests/e2e/multicard/test_expert_parallel.py
+++ b/tests/e2e/multicard/test_expert_parallel.py
@@ -14,14 +14,24 @@ def test_e2e_ep_correctness(model_name):
    ]
    max_tokens = 5

-    with VllmRunner(model_name, tensor_parallel_size=2,
-                    enforce_eager=True) as vllm_model:
+    # FIXME: Really strange that chunked prefill might lead to different results, investigate further
+    with VllmRunner(
+            model_name,
+            tensor_parallel_size=2,
+            additional_config={"ascend_scheduler_config": {
+                "enabled": True
+            }},
+            enforce_eager=True) as vllm_model:
        tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)

-    with VllmRunner(model_name,
-                    tensor_parallel_size=2,
-                    enable_expert_parallel=True,
-                    enforce_eager=True) as vllm_model:
+    with VllmRunner(
+            model_name,
+            tensor_parallel_size=2,
+            enable_expert_parallel=True,
+            additional_config={"ascend_scheduler_config": {
+                "enabled": True
+            }},
+            enforce_eager=True) as vllm_model:
        ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(