[Bugfix] Fix the acceptance rates dorp issue when applying eagle3 to QuaRot model (#6914)

### What this PR does / why we need it? When using the target model after rotational quantization, the acceptance rate decreases because the fc weight of the draft model has not undergone rotational quantization(issue: #6445). We fixed this issue by performing rotation quantization on the fc weight of the draft model in the same way as the main model when loading draft model. - vLLM version: v0.16.0 - vLLM main: 15d76f74e2 Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com>
2026-03-04 11:29:49 +08:00
parent d431d7d526
commit 52d9086f64
4 changed files with 94 additions and 0 deletions
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -106,6 +106,7 @@ from vllm_ascend.eplb.eplb_updator import EplbUpdator
 from vllm_ascend.eplb.utils import model_register
 from vllm_ascend.ops.rotary_embedding import set_cos_and_sin, update_cos_sin
 from vllm_ascend.patch.worker.patch_module import patch_torch_npu_argsort
+from vllm_ascend.patch.worker.patch_qwen3_quarot import patch_load_weights
 from vllm_ascend.sample.sampler import AscendSampler
 from vllm_ascend.spec_decode import get_spec_decode_method
 from vllm_ascend.spec_decode.eagle_proposer import EagleProposer
@@ -2422,6 +2423,8 @@ class NPUModelRunner(GPUModelRunner):
                model_register(self.model)
            if self.drafter:
                logger.info("Loading drafter model...")
+                if self.vllm_config.quant_config is not None:
+                    patch_load_weights(self.vllm_config)
                with get_tp_context(self.drafter):
                    self.drafter.load_model(self.model)
                if self.use_aux_hidden_state_outputs: