[main] Use AddRmsNormQuant ops in the custom model to optimize Qwen3's performance (#1806)

### What this PR does / why we need it?
Optimizes the performance of the Qwen3 quantization model by registering a custom model and adding the AddRmsNormQuant operation. Subsequent PRs will focus on performance optimizations based on this custom model.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with existing tests.

- vLLM version: v0.9.2
- vLLM main: 8d0a01a5f2

Signed-off-by: rjg-lyh <1318825571@qq.com>
2025-07-22 19:03:13 +08:00
parent ce4970eee0
commit 9a3bdf2162
5 changed files with 227 additions and 8 deletions
--- a/vllm_ascend/models/__init__.py
+++ b/vllm_ascend/models/__init__.py
@@ -11,6 +11,7 @@ def register_model():
    from .qwen2_5_vl import \
        AscendQwen2_5_VLForConditionalGeneration  # noqa: F401
    from .qwen2_vl import AscendQwen2VLForConditionalGeneration  # noqa: F401
+    from .qwen3 import CustomQwen3ForCausalLM  # noqa: F401

    ModelRegistry.register_model(
        "DeepSeekMTPModel",
@@ -53,6 +54,9 @@ def register_model():
        "Qwen3MoeForCausalLM",
        "vllm_ascend.models.qwen3_moe:CustomQwen3MoeForCausalLM")

+    ModelRegistry.register_model(
+        "Qwen3ForCausalLM", "vllm_ascend.models.qwen3:CustomQwen3ForCausalLM")
+
    ModelRegistry.register_model(
        "PanguProMoEForCausalLM",
-        "vllm_ascend.models.pangu_moe:PanguProMoEForCausalLM")
+        "vllm_ascend.models.pangu_moe:PanguProMoEForCausalLM")