[main][Feature]Moe alltoallv communication optimization for unquantized RL training sence (#2088)

It comes from 0.9.1dev [0.9.1][Feature]Moe alltoallv communication optimization for unquantized RL training sence & alltoallv support dpo (#1547) - vLLM version: v0.10.0 - vLLM main: 97608dc276 --------- Signed-off-by: weijinqian_v1 <weijinqian@huawei.com> Signed-off-by: whx-sjtu <2952154980@qq.com> Signed-off-by: curryliu <120010041@link.cuhk.edu.cn> Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: ChenTaoyu-SJTU <ctynb@qq.com> Signed-off-by: taoxudonghaha <justsheldon@163.com> Signed-off-by: shen-shanshan <467638484@qq.com> Signed-off-by: Shanshan Shen <87969357+shen-shanshan@users.noreply.github.com> Signed-off-by: leo-pony <nengjunma@outlook.com> Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Co-authored-by: weijinqian_v1 <weijinqian@huawei.com> Co-authored-by: whx <56632993+whx-sjtu@users.noreply.github.com> Co-authored-by: curryliu <99582471+Irving11-BKN@users.noreply.github.com> Co-authored-by: Li Wang <wangli858794774@gmail.com> Co-authored-by: TaoYu Chen <ctynb@qq.com> Co-authored-by: taoxudonghaha <justsheldon@163.com> Co-authored-by: Shanshan Shen <467638484@qq.com> Co-authored-by: leo-pony <nengjunma@outlook.com> Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com> Co-authored-by: Mengqing Cao <cmq0113@163.com>
2025-08-02 09:49:10 +08:00
parent f0c1f0c828
commit 6e00aed4d5
14 changed files with 1265 additions and 17 deletions
--- a/vllm_ascend/models/init.py
+++ b/vllm_ascend/models/init.py
@@ -40,7 +40,6 @@ def register_model():
        ModelRegistry.register_model(
            "DeepseekV3ForCausalLM",
            "vllm_ascend.models.deepseek_dbo:CustomDeepseekDBOForCausalLM")
-
    else:
        ModelRegistry.register_model(
            "DeepseekV2ForCausalLM",
--- a/vllm_ascend/models/qwen3_moe.py
+++ b/vllm_ascend/models/qwen3_moe.py
@@ -15,8 +15,111 @@
 # limitations under the License.
 # Adapted from vllm/model_executor/models/qwen3_moe.py
 # This file is a part of the vllm-ascend project.
+from typing import Optional

-from vllm.model_executor.models.qwen3_moe import Qwen3MoeForCausalLM
+from torch import nn
+from transformers import PretrainedConfig
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.models.qwen3_moe import (Qwen3MoeAttention,
+                                                  Qwen3MoeDecoderLayer,
+                                                  Qwen3MoeForCausalLM,
+                                                  Qwen3MoeMLP, Qwen3MoeModel)
+from vllm.model_executor.models.utils import (
+    extract_layer_index, make_empty_intermediate_tensors_factory, make_layers,
+    maybe_prefix)
+
+from vllm_ascend.ops.fused_moe import AscendSparseMoeBlock
+from vllm_ascend.platform import VllmConfig
+
+
+class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+
+        nn.Module.__init__(self)
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings",
+                                          8192)
+        self.self_attn = Qwen3MoeAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            rms_norm_eps=config.rms_norm_eps,
+            qkv_bias=getattr(config, 'attention_bias', False),
+            head_dim=getattr(config, 'head_dim', None),
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+
+        # `mlp_only_layers` in the config.
+        layer_idx = extract_layer_index(prefix)
+        mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else
+                           config.mlp_only_layers)
+        if (layer_idx not in mlp_only_layers) and (
+                config.num_experts > 0 and
+            (layer_idx + 1) % config.decoder_sparse_step == 0):
+            self.mlp = AscendSparseMoeBlock(config=config,
+                                            quant_config=quant_config,
+                                            prefix=f"{prefix}.mlp")
+        else:
+            self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size,
+                                   intermediate_size=config.intermediate_size,
+                                   hidden_act=config.hidden_act,
+                                   quant_config=quant_config,
+                                   prefix=f"{prefix}.mlp")
+        self.input_layernorm = RMSNorm(config.hidden_size,
+                                       eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size,
+                                                eps=config.rms_norm_eps)
+
+
+@support_torch_compile
+class CustomQwen3MoeModel(Qwen3MoeModel):
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        nn.Module.__init__(self)
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.config = config
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=f"{prefix}.embed_tokens")
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: CustomQwen3MoeDecoderLayer(
+                config=config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix),
+            prefix=f"{prefix}.layers",
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(
+                ["hidden_states", "residual"], config.hidden_size))


 class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM):
@@ -33,3 +136,20 @@ class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM):
        "experts":
        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        nn.Module.__init__(self)
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+        self.quant_config = quant_config
+        self.model = CustomQwen3MoeModel(vllm_config=vllm_config,
+                                         prefix=maybe_prefix(prefix, "model"))
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      quant_config=quant_config)
+        if self.config.tie_word_embeddings:
+            self.lm_head.weight = self.model.embed_tokens.weight
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)