From 688350a3bb26f7b4c92eb9c08a5898b399f6ca5a Mon Sep 17 00:00:00 2001
From: liu <99582471+Irving11-BKN@users.noreply.github.com>
Date: Mon, 4 Aug 2025 15:16:42 +0800
Subject: [PATCH] [bugfixed] fix the bug when run the inference of quantized ds-w8a8-mtp (#2134)

When running the inference of ds-w8a8-mtp, it reported "'ParallelLMHead'
has no attribute 'params_dtype'".

1. Add a wrapper for VocabParallelEmbedding.__init__; this fixes the bug
   when running deepseek-w8a8-mtp.

Signed-off-by: curryliu <120010041@link.cuhk.edu.cn>

- vLLM version: v0.10.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad57f23f6a528ab01066998b41796a44340fd43d

---------

Signed-off-by: curryliu <120010041@link.cuhk.edu.cn>
---
 vllm_ascend/quantization/func_wrapper.py | 33 ++++++++++++++++++++++++
 vllm_ascend/quantization/quantizer.py    |  6 ++++-
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/quantization/func_wrapper.py b/vllm_ascend/quantization/func_wrapper.py
index 77ecca2..8357695 100644
--- a/vllm_ascend/quantization/func_wrapper.py
+++ b/vllm_ascend/quantization/func_wrapper.py
@@ -22,6 +22,39 @@ import torch_npu
 from vllm.logger import logger
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import UnquantizedLinearMethod
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE, QuantizationConfig)
+
+
+# func refers to VocabParallelEmbedding.__init__
+def wrapper_vocab_parallel_embedding_init(func):
+
+    def init(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        params_dtype: Optional[torch.dtype] = None,
+        org_num_embeddings: Optional[int] = None,
+        padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        func(
+            self,
+            num_embeddings,
+            embedding_dim,
+            params_dtype,
+            org_num_embeddings,
+            padding_size,
+            quant_config,
+            prefix,
+        )
+        # TODO: Contact vLLM maintainers to add a `params_dtype` attribute to the `VocabParallelEmbedding` class.
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.params_dtype = params_dtype
+
+    return init
 
 
 # func refers to RMSNorm.__init__
diff --git a/vllm_ascend/quantization/quantizer.py b/vllm_ascend/quantization/quantizer.py
index e61593d..90c7512 100644
--- a/vllm_ascend/quantization/quantizer.py
+++ b/vllm_ascend/quantization/quantizer.py
@@ -22,7 +22,8 @@ from typing import Any, Dict, List, Optional
 
 from vllm.logger import logger
 
-from .func_wrapper import wrapper_rmsnorm_forward_oot, wrapper_rmsnorm_init
+from .func_wrapper import (wrapper_rmsnorm_forward_oot, wrapper_rmsnorm_init,
+                           wrapper_vocab_parallel_embedding_init)
 from .w4a8_dynamic import AscendW4A8DynamicLinearMethod
 from .w8a8 import (AscendC8KVCacheMethod, AscendW8A8FusedMoEMethod,
                    AscendW8A8LinearMethod)
@@ -75,6 +76,9 @@ class VLLMAscendQuantizer:
             VLLMAscendQuantizer.apply_patch(
                 "vllm.model_executor.layers.layernorm.RMSNorm", "forward_oot",
                 [wrapper_rmsnorm_forward_oot])
+            VLLMAscendQuantizer.apply_patch(
+                "vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding",
+                "__init__", [wrapper_vocab_parallel_embedding_init])
             break
         VLLMAscendQuantizer.patched = True
         logger.info("Using the vLLM Ascend Quantizer version now!")