[MOE]move weight transpose to wakeup for RL scenarios (#4626)

### What this PR does / why we need it? In reinforcement learning scenarios, the current inference applies a transpose operation to the weights. For a cleaner architecture, the weight transpose module was moved to wakeup. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.12.0 - vLLM main: ad32e3e19c Signed-off-by: lhp-deep <liuhaopeng1@huawei.com> Co-authored-by: weijinqian0 <1184188277@qq.com>
2025-12-08 20:34:52 +08:00
parent 58db21f56a
commit b230e7e987
7 changed files with 132 additions and 120 deletions
--- a/tests/e2e/multicard/test_offline_weight_load.py
+++ b/tests/e2e/multicard/test_offline_weight_load.py
@@ -0,0 +1,74 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+Run `pytest tests/e2e/multicard/test_offline_weight_load.py`.
+"""
+
+import os
+import subprocess
+import sys
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+
+MODELS = ["Qwen/Qwen3-30B-A3B"]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
+def test_offline_weight_load_and_sleepmode(model):
+    script = Path(
+        __file__
+    ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
+    env = os.environ.copy()
+    cmd = [
+        sys.executable,
+        str(script),
+        "--model",
+        model,
+        "--tp-size",
+        "2",
+        "--node-size",
+        "1",
+        "--node-rank",
+        "0",
+        "--proc-per-node",
+        "2",
+        "--trust-remote-code",
+        "--enable-sleep-mode",
+        "--temperature",
+        "0",
+        "--model-weight-gib",
+        "0.8",
+    ]
+
+    print(f"Running subprocess: {' '.join(cmd)}")
+    proc = subprocess.run(
+        cmd,
+        env=env,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        timeout=600,
+    )
+    output = proc.stdout.decode(errors='ignore')
+
+    print(output)
+
+    assert "Generated text:" in output
+    assert "Sleep and wake up successfully!!" in output
+    assert proc.returncode == 0
--- a/tests/ut/ops/test_fused_moe.py
+++ b/tests/ut/ops/test_fused_moe.py
@@ -25,8 +25,7 @@ from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase
 from tests.ut.base import TestBase
 from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.ops.fused_moe.experts_selector import select_experts
-from vllm_ascend.ops.fused_moe.fused_moe import (
-    AscendFusedMoE, AscendUnquantizedFusedMoEMethod)
+from vllm_ascend.ops.fused_moe.fused_moe import AscendUnquantizedFusedMoEMethod
 from vllm_ascend.ops.fused_moe.moe_mlp import (cumsum_group_list,
                                               unified_apply_mlp)
 from vllm_ascend.utils import AscendDeviceType, adapt_patch
@@ -595,39 +594,3 @@ class TestUnifiedApplyMLP(TestBase):
        self.assertTrue(mock_forward_context.with_quant)
        self.assertEqual(result.shape, hidden_states_shape)
        self.assertEqual(result.dtype, torch.bfloat16)
-
-
-class TestLoadWeight(TestBase):
-
-    def test_load_w13_transpose(self):
-        with patch.object(AscendFusedMoE, "__init__",
-                          lambda self, *args, **kwargs: None):
-            moe = AscendFusedMoE(num_experts=4, top_k=2, hidden_size=8)
-
-            expert_data = torch.randn(128, 8)
-            loaded_weight = torch.randn(128, 4)
-            moe._load_w13(expert_data, 1, "w1", loaded_weight, 0)
-
-            expert_data = torch.randn(8, 128)
-            loaded_weight = torch.randn(128, 4)
-            moe._load_w13(expert_data, 1, "w1", loaded_weight, 0)
-
-            expert_data = torch.randn(128, 8)
-            loaded_weight = torch.randn(128, 4)
-            moe._load_w13(expert_data, 1, "w3", loaded_weight, 0)
-
-            expert_data = torch.randn(8, 128)
-            loaded_weight = torch.randn(128, 4)
-            moe._load_w13(expert_data, 1, "w3", loaded_weight, 0)
-
-    def test_load_w2_transpose(self):
-        with patch.object(AscendFusedMoE, "__init__",
-                          lambda self, *args, **kwargs: None):
-            moe = AscendFusedMoE(num_experts=4, top_k=2, hidden_size=8)
-            expert_data = torch.randn(128, 4)
-            loaded_weight = torch.randn(128, 8)
-            moe._load_w2(expert_data, 1, loaded_weight, 0)
-
-            expert_data = torch.randn(4, 128)
-            loaded_weight = torch.randn(128, 8)
-            moe._load_w2(expert_data, 1, loaded_weight, 0)
--- a/tests/ut/worker/test_worker_v1.py
+++ b/tests/ut/worker/test_worker_v1.py
@@ -281,9 +281,22 @@ class TestNPUWorker(TestBase):
        mock_allocator = MagicMock()
        mock_allocator_class.get_instance.return_value = mock_allocator

+        mock_hidden_size = MagicMock()
+        mock_hf_config = MagicMock()
+        mock_hf_config.hidden_size = mock_hidden_size
+        mock_model_config = MagicMock()
+        mock_model_config.hf_config = mock_hf_config
+        mock_vllm_config = MagicMock()
+        mock_vllm_config.model_config = mock_model_config
+
+        mock_model_runner = MagicMock()
+        mock_model_runner.model = MagicMock()
+
        # Create worker mock
        with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None):
            worker = NPUWorker()
+            worker.model_runner = mock_model_runner
+            worker.vllm_config = mock_vllm_config
            worker._sleep_saved_buffers = {}
            # Test wake_up method
            worker.wake_up(tags=["test_tag"])