[Feat] flashcomm2+oshard Generalized (#4723)

### What this PR does / why we need it?
[FlashComm2](https://gitcode.com/ascend-tribe/ascend-inference-cluster/blob/main/FlashComm/FlashComm2%E5%A4%A7%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86%E4%B8%AD%E4%BB%A5%E5%AD%98%E6%8D%A2%E4%BC%A0%E7%9A%84%E9%80%9A%E4%BF%A1%E4%BC%98%E5%8C%96%E6%8A%80%E6%9C%AF.pdf) introduces redundant storage of the o_proj matrix, which imposes pressure on GPU memory. We propose the FlashComm2+Oshard approach by integrating the shared linear layer feature (#2931). This approach distributes weights layer-by-layer to each GPU and accesses the o_proj of each layer via asynchronous broadcast operations, thereby alleviating memory pressure while achieving nearly lossless performance compared to the original FlashComm2. This PR implements a generalized FlashComm2+Oshard solution.

Use the following environment variable and additional config to enable FlashComm2 with Oshard:
```shell
export VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE=1
--additional-config '{ "layer_sharding": ["o_proj"] }'
```

### How was this patch tested?
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------
Signed-off-by: Levi-JQ <yujinqi2@huawei.com> Co-authored-by: Levi-JQ <yujinqi2@huawei.com>
2026-01-10 22:57:57 +08:00
parent aa987ffe87
commit ecd4232698
5 changed files with 179 additions and 1 deletions
--- a/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
@@ -157,6 +157,29 @@ def test_qwen3_moe_fc2_tp2() -> None:
        vllm_model.generate(example_prompts, sampling_params)


+@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
+@patch.dict(os.environ, {"VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE": "1"})  # paired with layer_sharding below to enable FlashComm2 + o_proj sharding
+def test_qwen3_moe_fc2_oshard_tp2() -> None:  # smoke test: passes if generate() completes; outputs are not asserted
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    sampling_params = SamplingParams(max_tokens=5,  # keep the generation short
+                                     temperature=0.0,
+                                     top_k=50,
+                                     top_p=0.9)
+
+    with VllmRunner(
+            snapshot_download("Qwen/Qwen3-30B-A3B"),
+            dtype="auto",
+            tensor_parallel_size=2,
+            distributed_executor_backend="mp",
+            enable_expert_parallel=True,
+            enforce_eager=
+            True,  # TODO(Levi-JQ): support graph mode for fc2 in Qwen
+            additional_config={"layer_sharding": ["o_proj"]}) as vllm_model:  # shard o_proj weights across layers (the "Oshard" part)
+        vllm_model.generate(example_prompts, sampling_params)
+
+
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
 def test_deepseek_v2_lite_fc1_tp2() -> None:
    example_prompts = [