From 68d8d20ca249bd9b5c5ca510c08591192aa5c6b6 Mon Sep 17 00:00:00 2001
From: linfeng-yuan <1102311262@qq.com>
Date: Mon, 2 Mar 2026 18:17:01 +0800
Subject: [PATCH] [misc] move mxfp_compat into device to decouple from quantization init chain (#6918)

### What this PR does / why we need it?
`mxfp_compat` only provides dtype/symbol compatibility helpers for different `torch_npu` versions, yet it lived under `vllm_ascend.quantization`. Importing it from device/ops code paths could therefore execute `quantization/__init__.py` and pull in heavy quantization-method dependencies, increasing startup coupling and risking import cycles (especially on 310P paths). Moving the module to `vllm_ascend.device` decouples those consumers from the quantization init chain.
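For illustration, a compat module like this is typically a thin, dependency-free shim, which is why it can live next to the device layer without dragging quantization code along. Below is a minimal sketch of the pattern, assuming `getattr`-based dtype fallbacks; the symbol names come from this diff, but the actual contents of `mxfp_compat.py` may differ:

```python
# Hypothetical sketch of a version-compat shim (not the actual
# mxfp_compat.py): resolve MX-format dtypes lazily so that older
# torch/torch_npu builds without them still import cleanly.
import torch

# These dtypes only exist in newer torch/torch_npu builds; fall back
# to None instead of raising AttributeError at import time.
FLOAT8_E8M0FNU_DTYPE = getattr(torch, "float8_e8m0fnu", None)
FLOAT4_E2M1FN_X2_DTYPE = getattr(torch, "float4_e2m1fn_x2", None)


def ensure_mxfp8_moe_available() -> None:
    """Fail with a clear error at call time rather than import time."""
    if FLOAT8_E8M0FNU_DTYPE is None:
        raise RuntimeError(
            "MXFP8 MoE requires a torch/torch_npu build that provides "
            "torch.float8_e8m0fnu."
        )
```

Because such a shim imports only `torch`, consumers in device/ops paths can use it without ever touching `quantization/__init__.py`.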
### Does this PR introduce _any_ user-facing change?
No functional behavior change intended.

### How was this patch tested?
CI passed.

- vLLM version: v0.16.0
- vLLM main: https://github.com/vllm-project/vllm/commit/15d76f74e2fdb12a95ea00f0ca283acf6219a2b7

---------

Signed-off-by: linfeng-yuan <1102311262@qq.com>
---
 vllm_ascend/_310p/model_runner_310p.py              | 2 +-
 vllm_ascend/device/device_op.py                     | 2 +-
 vllm_ascend/{quantization => device}/mxfp_compat.py | 0
 vllm_ascend/ops/fused_moe/moe_mlp.py                | 4 ++--
 vllm_ascend/quantization/methods/w8a8_mxfp8.py      | 4 ++--
 vllm_ascend/quantization/quant_parser.py            | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)
 rename vllm_ascend/{quantization => device}/mxfp_compat.py (100%)

diff --git a/vllm_ascend/_310p/model_runner_310p.py b/vllm_ascend/_310p/model_runner_310p.py
index f0cde39c..3dceff55 100644
--- a/vllm_ascend/_310p/model_runner_310p.py
+++ b/vllm_ascend/_310p/model_runner_310p.py
@@ -43,7 +43,7 @@ class NPUModelRunner310(NPUModelRunner):
             Dict[str, torch.Tensor]: A map between layer names to their
             corresponding memory buffer for KV cache.
         """
-        # 310P limitation: KV transfer is not supported.
+        # 310P limitation: KV transfer is not supported
         if self.vllm_config.kv_transfer_config is not None:
             raise ValueError("KV cache transfer is not supported for 310P.")
         if self.use_sparse:
diff --git a/vllm_ascend/device/device_op.py b/vllm_ascend/device/device_op.py
index ed5d87d2..9bb7b1a0 100644
--- a/vllm_ascend/device/device_op.py
+++ b/vllm_ascend/device/device_op.py
@@ -18,7 +18,7 @@
 import torch
 import torch_npu
 
-from vllm_ascend.quantization.mxfp_compat import (
+from vllm_ascend.device.mxfp_compat import (
     FLOAT4_E2M1FN_X2_DTYPE,
     FLOAT8_E8M0FNU_DTYPE,
     HIFLOAT8_DTYPE,
diff --git a/vllm_ascend/quantization/mxfp_compat.py b/vllm_ascend/device/mxfp_compat.py
similarity index 100%
rename from vllm_ascend/quantization/mxfp_compat.py
rename to vllm_ascend/device/mxfp_compat.py
diff --git a/vllm_ascend/ops/fused_moe/moe_mlp.py b/vllm_ascend/ops/fused_moe/moe_mlp.py
index 830bb6af..aea25579 100644
--- a/vllm_ascend/ops/fused_moe/moe_mlp.py
+++ b/vllm_ascend/ops/fused_moe/moe_mlp.py
@@ -23,10 +23,10 @@ from vllm.triton_utils import HAS_TRITON
 
 from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.device.device_op import DeviceOperator
-from vllm_ascend.ops.activation import AscendSwigluOAIAndMul
-from vllm_ascend.quantization.mxfp_compat import (
+from vllm_ascend.device.mxfp_compat import (
     ensure_mxfp8_moe_available,
 )
+from vllm_ascend.ops.activation import AscendSwigluOAIAndMul
 from vllm_ascend.utils import (
     dispose_tensor,
     enable_custom_op,
diff --git a/vllm_ascend/quantization/methods/w8a8_mxfp8.py b/vllm_ascend/quantization/methods/w8a8_mxfp8.py
index bc25074d..d3859f1b 100644
--- a/vllm_ascend/quantization/methods/w8a8_mxfp8.py
+++ b/vllm_ascend/quantization/methods/w8a8_mxfp8.py
@@ -25,12 +25,12 @@ from vllm.distributed import get_ep_group
 from vllm.forward_context import get_forward_context
 
 from vllm_ascend.ascend_config import get_ascend_config
-from vllm_ascend.ops.fused_moe.experts_selector import select_experts
-from vllm_ascend.quantization.mxfp_compat import (
+from vllm_ascend.device.mxfp_compat import (
     FLOAT8_E8M0FNU_DTYPE,
     ensure_mxfp8_linear_available,
     ensure_mxfp8_moe_available,
 )
+from vllm_ascend.ops.fused_moe.experts_selector import select_experts
 
 from .base import AscendLinearScheme, AscendMoEScheme, QuantType
 from .registry import register_scheme
diff --git a/vllm_ascend/quantization/quant_parser.py b/vllm_ascend/quantization/quant_parser.py
index 33144ce5..f75218c4 100644
--- a/vllm_ascend/quantization/quant_parser.py
+++ b/vllm_ascend/quantization/quant_parser.py
@@ -1,6 +1,6 @@
 import torch
 
-from vllm_ascend.quantization.mxfp_compat import (
+from vllm_ascend.device.mxfp_compat import (
     FLOAT4_E2M1FN_X2_DTYPE,
     FLOAT8_E8M0FNU_DTYPE,
     ensure_mxfp4_dtype_available,