From 68d8d20ca249bd9b5c5ca510c08591192aa5c6b6 Mon Sep 17 00:00:00 2001
From: linfeng-yuan <1102311262@qq.com>
Date: Mon, 2 Mar 2026 18:17:01 +0800
Subject: [PATCH] [misc] move mxfp_compat into device to decouple from quantization init chain (#6918)

### What this PR does / why we need it?
`mxfp_compat` only provides dtype/symbol compatibility helpers for different `torch_npu` versions, yet it lived under `vllm_ascend.quantization`. Importing it from device/ops code paths could therefore execute `quantization/__init__.py` and pull in heavy quantization-method dependencies, increasing startup coupling and risking import cycles (especially on 310P paths). Moving the module to `vllm_ascend.device` decouples those consumers from the quantization init chain.
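For illustration, a compat module like this is typically a thin, dependency-free shim, which is why it can live next to the device layer without dragging quantization code along. Below is a minimal sketch of the pattern, assuming `getattr`-based dtype fallbacks; the symbol names come from this diff, but the actual contents of `mxfp_compat.py` may differ:

```python
# Hypothetical sketch of a version-compat shim (not the actual
# mxfp_compat.py): resolve MX-format dtypes lazily so that older
# torch/torch_npu builds without them still import cleanly.
import torch

# These dtypes only exist in newer torch/torch_npu builds; fall back
# to None instead of raising AttributeError at import time.
FLOAT8_E8M0FNU_DTYPE = getattr(torch, "float8_e8m0fnu", None)
FLOAT4_E2M1FN_X2_DTYPE = getattr(torch, "float4_e2m1fn_x2", None)


def ensure_mxfp8_moe_available() -> None:
    """Fail with a clear error at call time rather than import time."""
    if FLOAT8_E8M0FNU_DTYPE is None:
        raise RuntimeError(
            "MXFP8 MoE requires a torch/torch_npu build that provides "
            "torch.float8_e8m0fnu."
        )
```

Because such a shim imports only `torch`, consumers in device/ops paths can use it without ever touching `quantization/__init__.py`.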
### Does this PR introduce _any_ user-facing change?
No functional behavior change intended.

### How was this patch tested?
CI passed.

- vLLM version: v0.16.0
- vLLM main: https://github.com/vllm-project/vllm/commit/15d76f74e2fdb12a95ea00f0ca283acf6219a2b7

---------

Signed-off-by: linfeng-yuan <1102311262@qq.com>
---
 vllm_ascend/_310p/model_runner_310p.py              | 2 +-
 vllm_ascend/device/device_op.py                     | 2 +-
 vllm_ascend/{quantization => device}/mxfp_compat.py | 0
 vllm_ascend/ops/fused_moe/moe_mlp.py                | 4 ++--
 vllm_ascend/quantization/methods/w8a8_mxfp8.py      | 4 ++--
 vllm_ascend/quantization/quant_parser.py            | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)
 rename vllm_ascend/{quantization => device}/mxfp_compat.py (100%)

diff --git a/vllm_ascend/_310p/model_runner_310p.py b/vllm_ascend/_310p/model_runner_310p.py
index f0cde39c..3dceff55 100644
--- a/vllm_ascend/_310p/model_runner_310p.py
+++ b/vllm_ascend/_310p/model_runner_310p.py
@@ -43,7 +43,7 @@ class NPUModelRunner310(NPUModelRunner):
             Dict[str, torch.Tensor]: A map between layer names to their
             corresponding memory buffer for KV cache.
         """
-        # 310P limitation: KV transfer is not supported.
+        # 310P limitation: KV transfer is not supported
         if self.vllm_config.kv_transfer_config is not None:
             raise ValueError("KV cache transfer is not supported for 310P.")
         if self.use_sparse:
diff --git a/vllm_ascend/device/device_op.py b/vllm_ascend/device/device_op.py
index ed5d87d2..9bb7b1a0 100644
--- a/vllm_ascend/device/device_op.py
+++ b/vllm_ascend/device/device_op.py
@@ -18,7 +18,7 @@
 import torch
 import torch_npu
 
-from vllm_ascend.quantization.mxfp_compat import (
+from vllm_ascend.device.mxfp_compat import (
     FLOAT4_E2M1FN_X2_DTYPE,
     FLOAT8_E8M0FNU_DTYPE,
     HIFLOAT8_DTYPE,
diff --git a/vllm_ascend/quantization/mxfp_compat.py b/vllm_ascend/device/mxfp_compat.py
similarity index 100%
rename from vllm_ascend/quantization/mxfp_compat.py
rename to vllm_ascend/device/mxfp_compat.py
diff --git a/vllm_ascend/ops/fused_moe/moe_mlp.py b/vllm_ascend/ops/fused_moe/moe_mlp.py
index 830bb6af..aea25579 100644
--- a/vllm_ascend/ops/fused_moe/moe_mlp.py
+++ b/vllm_ascend/ops/fused_moe/moe_mlp.py
@@ -23,10 +23,10 @@ from vllm.triton_utils import HAS_TRITON
 
 from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.device.device_op import DeviceOperator
-from vllm_ascend.ops.activation import AscendSwigluOAIAndMul
-from vllm_ascend.quantization.mxfp_compat import (
+from vllm_ascend.device.mxfp_compat import (
     ensure_mxfp8_moe_available,
 )
+from vllm_ascend.ops.activation import AscendSwigluOAIAndMul
 from vllm_ascend.utils import (
     dispose_tensor,
     enable_custom_op,
diff --git a/vllm_ascend/quantization/methods/w8a8_mxfp8.py b/vllm_ascend/quantization/methods/w8a8_mxfp8.py
index bc25074d..d3859f1b 100644
--- a/vllm_ascend/quantization/methods/w8a8_mxfp8.py
+++ b/vllm_ascend/quantization/methods/w8a8_mxfp8.py
@@ -25,12 +25,12 @@ from vllm.distributed import get_ep_group
 from vllm.forward_context import get_forward_context
 
 from vllm_ascend.ascend_config import get_ascend_config
-from vllm_ascend.ops.fused_moe.experts_selector import select_experts
-from vllm_ascend.quantization.mxfp_compat import (
+from vllm_ascend.device.mxfp_compat import (
     FLOAT8_E8M0FNU_DTYPE,
     ensure_mxfp8_linear_available,
     ensure_mxfp8_moe_available,
 )
+from vllm_ascend.ops.fused_moe.experts_selector import select_experts
 
 from .base import AscendLinearScheme, AscendMoEScheme, QuantType
 from .registry import register_scheme
diff --git a/vllm_ascend/quantization/quant_parser.py b/vllm_ascend/quantization/quant_parser.py
index 33144ce5..f75218c4 100644
--- a/vllm_ascend/quantization/quant_parser.py
+++ b/vllm_ascend/quantization/quant_parser.py
@@ -1,6 +1,6 @@
 import torch
 
-from vllm_ascend.quantization.mxfp_compat import (
+from vllm_ascend.device.mxfp_compat import (
     FLOAT4_E2M1FN_X2_DTYPE,
     FLOAT8_E8M0FNU_DTYPE,
     ensure_mxfp4_dtype_available,