[v0.18.0]feat(quant): add C8 INT8 KV cache support for GQA attention models (#7474) (#8007)

backport of #7474

This PR adds C8 (INT8) KV cache quantization support for standard GQA
attention models (e.g., Qwen3-32B W8A8C8). C8 uses static per-channel
quantization scales to store KV cache in INT8, reducing KV cache memory
by ~50% compared to BF16, enabling higher batch concurrency and longer
context lengths on the same hardware.
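
For intuition, here is the scheme in isolation: a minimal, runnable sketch whose quantize/dequantize formula matches `_quantize_kv_to_int8` and `_dequant_paged_kv_to_dense` in the diffs below. The scale and offset values are made up for illustration.

```python
import torch

# Static per-channel C8 quantization: each of the num_kv_heads * head_size
# channels has its own scale (and offset) calibrated offline.
scale = torch.tensor([0.02, 0.01, 0.04, 0.03])  # per-channel scales (illustrative)
offset = torch.zeros(4)                         # per-channel offsets (illustrative)

k = torch.tensor([0.513, -0.117, 1.01, -0.326])  # one key vector (illustrative)

# Quantize: q = clamp(round(x / scale + offset), -128, 127), stored as INT8.
q = torch.clamp(torch.round(k * (1.0 / scale) + offset), -128, 127).to(torch.int8)

# Dequantize: x_hat = (q - offset) * scale.
k_hat = (q.float() - offset) * scale

# Absent clamping, the per-channel rounding error is bounded by scale / 2.
assert torch.all((k - k_hat).abs() <= scale / 2 + 1e-6)
```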

**Key changes:**

1. **`attention_v1.py`** — New `AscendC8AttentionBackendImpl` subclass
of `AscendAttentionBackendImpl`:
- `_prepare_c8_scales`: Shards per-channel scales/offsets to the current
TP rank and pre-computes BF16 BNSD-shaped antiquant tensors (one-time
per layer).
- `_quantize_kv_to_int8`: Quantizes BF16 K/V to INT8 before
`reshape_and_cache`, using pre-cached inverse scales.
- `_forward_c8_decode`: FIA V1 BNSD paged attention with native INT8 KV
and `perchannel` antiquant mode.
- `_forward_c8_chunked_prefill`: Splits decode (FIA V1 BNSD paged INT8)
and prefill (FIA V1 TND float) into two kernel calls.
- `_forward_c8_fused_infer_attention`: Handles `PrefillNoCache` and
`PrefillCacheHit` states.

2. **`quantization/methods/kv_c8.py`** — New
`AscendC8KVCacheAttentionMethod` scheme:
- Creates `k/v_cache_scale/offset` parameters via
`_c8_kv_scale_weight_loader`, which handles per-channel scale shapes and
lazy resizing.
- Sets `layer.kv_cache_torch_dtype = torch.int8` so
`get_kv_cache_spec()` returns INT8 dtype automatically.
- Upgrades `layer.impl` to `AscendC8AttentionBackendImpl` via class
surgery (see the sketch after this list).

3. **`quantization/modelslim_config.py`** — C8 branch in
`get_quant_method()` activates when `kv_cache_type == "C8"` in
`quant_model_description.json`.

4. **`patch/worker/patch_qwen3_c8.py`** — Intercepts per-channel C8
scale/offset weights before `AutoWeightsLoader` discards them, routing
them to the parameters created by `AscendC8KVCacheAttentionMethod`.

5. **`tests/ut/quantization/test_kv_c8.py`** — Unit tests covering
`_c8_kv_scale_weight_loader`, `AscendC8KVCacheAttentionMethod`, and
`AscendC8AttentionBackendImpl` scale helpers.
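
The "class surgery" mentioned in item 2 is ordinary Python `__class__` reassignment: the already-constructed backend impl keeps all of its state, but its methods are rebound to the C8 subclass. A toy sketch with made-up names:

```python
import torch

class BaseImpl:
    def __init__(self) -> None:
        self.scale = 0.125  # state created before the quant scheme is known

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x * self.scale

class C8Impl(BaseImpl):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # C8-specific forward path; inherits all BaseImpl state.
        return (x * self.scale).to(torch.int8)

impl = BaseImpl()        # constructed by the generic attention layer
impl.__class__ = C8Impl  # "class surgery": rebind methods in place, keep attributes
assert impl.scale == 0.125
assert impl.forward(torch.ones(2)).dtype == torch.int8
```

The advantage over constructing a fresh impl is that every reference already holding the object (the `Attention` layer, any captured closures) sees the upgraded behavior without re-wiring.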

**User-facing change:** Yes. Users can now serve Qwen3-32B W8A8C8 quantized models with INT8 KV
cache on Ascend NPU. The model checkpoint must contain a
`quant_model_description.json` with `"kv_cache_type": "C8"` and
per-channel scale/offset tensors in safetensors.
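
As a quick pre-flight check, a checkpoint can be inspected for these artifacts before serving. This is a hypothetical snippet: the tensor-name suffixes come from the `get_cache_scale` mapping in `modelslim_config.py` below, while the shard filename is purely illustrative.

```python
import json
from safetensors import safe_open

with open("quant_model_description.json") as f:
    desc = json.load(f)
assert desc.get("kv_cache_type") == "C8", "C8 KV cache not enabled for this checkpoint"

# Per-channel scale/offset tensors live next to the projection weights, e.g.
# model.layers.0.self_attn.k_proj.kv_cache_scale (suffixes per get_cache_scale).
with safe_open("model-00001-of-00017.safetensors", framework="pt") as f:  # illustrative shard name
    c8_keys = [k for k in f.keys() if k.endswith(("kv_cache_scale", "kv_cache_offset"))]
print(f"found {len(c8_keys)} C8 scale/offset tensors")
```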

No changes to the serving CLI — the feature activates automatically when
the quantization config is detected.

Benchmarked with `vllm serve` (TP=8, `max_num_seqs=256`,
`max_model_len=131072`, `enable_chunked_prefill=true`) + `random_bench`
(input_len=10240, output_len=2048, 960 prompts, max_concurrency=192):

```
============ Serving Benchmark Result ============
Successful requests:                     960
Failed requests:                         0
Maximum request concurrency:             192
Benchmark duration (s):                  1359.81
Total input tokens:                      9830400
Total generated tokens:                  1966080
Request throughput (req/s):              0.71
Output token throughput (tok/s):         1445.85
Peak output token throughput (tok/s):    2304.00
Total token throughput (tok/s):          8675.12
---------------Time to First Token----------------
Mean TTFT (ms):                          24598.51
Median TTFT (ms):                        23167.02
P50 TTFT (ms):                           23167.02
P90 TTFT (ms):                           47717.08
P99 TTFT (ms):                           84402.61
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          120.76
Median TPOT (ms):                        121.50
P50 TPOT (ms):                           121.50
P90 TPOT (ms):                           127.05
P99 TPOT (ms):                           130.13
---------------Inter-token Latency----------------
Mean ITL (ms):                           120.70
Median ITL (ms):                         90.34
P50 ITL (ms):                            90.34
P90 ITL (ms):                            93.79
P99 ITL (ms):                            101.80
==================================================
```
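
The headline rates follow directly from the totals; a quick arithmetic cross-check of the table:

```python
duration = 1359.81          # benchmark duration (s)
total_input = 960 * 10240   # = 9,830,400 input tokens
total_output = 960 * 2048   # = 1,966,080 generated tokens

print(960 / duration)                           # ~0.71 req/s
print(total_output / duration)                  # ~1445.8 output tok/s
print((total_input + total_output) / duration)  # ~8675 total tok/s
```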

All attention states verified: `PrefillNoCache`, `PrefillCacheHit`,
`ChunkedPrefill`, `DecodeOnly`.

- vLLM version: v0.17.0
- vLLM main: 8b6325758c

Signed-off-by: lico67373 <918688502@qq.com>
Co-authored-by: LICO67373 <110013619+LICO1314@users.noreply.github.com>
Committed by Mengqing Cao on 2026-04-08 10:51:58 +08:00 (committed by GitHub)
parent fbd5d0fd55, commit 044d4c3974
8 changed files with 761 additions and 8 deletions

**`tests/ut/quantization/test_kv_c8.py`**

```diff
@@ -1,7 +1,9 @@
 import unittest
 import torch
 import torch.nn as nn
-from unittest.mock import Mock, patch
+from unittest.mock import MagicMock, Mock, patch
+
+from tests.ut.base import TestBase
 
 
 class TestWeightLoader(unittest.TestCase):
@@ -10,7 +12,7 @@ class TestWeightLoader(unittest.TestCase):
     def setUp(self):
         """Set up test environment before each test"""
         # Import the module under test
-        from vllm_ascend.quantization.methods.kv_c8 import weight_loader
+        from vllm_ascend.quantization.methods.kv_c8 import _fa_quant_weight_loader as weight_loader
         self.weight_loader = weight_loader
 
         # Mock distributed functions
@@ -295,7 +297,7 @@ class TestAscendFAQuantAttentionMethodCreateWeights(unittest.TestCase):
         method.create_weights(self.layer)
 
         # Import weight_loader for comparison
-        from vllm_ascend.quantization.methods.kv_c8 import weight_loader
+        from vllm_ascend.quantization.methods.kv_c8 import _fa_quant_weight_loader as weight_loader
 
         # Verify each parameter exists and has weight_loader
         self.assertTrue(hasattr(self.layer.fa_q, "scale"))
@@ -440,7 +442,7 @@ class TestIntegration(unittest.TestCase):
         v_offset = torch.randint(-128, 127, (1, 1), dtype=torch.int8)
 
         # Load weights using weight_loader
-        from vllm_ascend.quantization.methods.kv_c8 import weight_loader
+        from vllm_ascend.quantization.methods.kv_c8 import _fa_quant_weight_loader as weight_loader
 
         with torch.no_grad():
             weight_loader(layer.fa_q.scale, q_scale)
@@ -464,5 +466,224 @@ class TestIntegration(unittest.TestCase):
         self.assertTrue(hasattr(layer, "quant_kscale"))
 
 
+class TestC8KVScaleWeightLoader(TestBase):
+    """Tests for _c8_kv_scale_weight_loader in kv_c8.py."""
+
+    def setUp(self):
+        from vllm_ascend.quantization.methods.kv_c8 import _c8_kv_scale_weight_loader
+        self.loader = _c8_kv_scale_weight_loader
+
+    def test_shape_match_copies_value(self):
+        param = nn.Parameter(torch.ones(4, dtype=torch.float32), requires_grad=False)
+        loaded = torch.tensor([1.0, 2.0, 3.0, 4.0])
+        self.loader(param, loaded)
+        self.assertTrue(torch.allclose(param.data, loaded.float()))
+
+    def test_shape_mismatch_resizes_param(self):
+        param = nn.Parameter(torch.ones(1, dtype=torch.float32), requires_grad=False)
+        loaded = torch.arange(8, dtype=torch.float32)
+        self.loader(param, loaded)
+        self.assertEqual(param.data.shape, (8,))
+        self.assertTrue(torch.allclose(param.data, loaded))
+
+    def test_squeeze_before_compare(self):
+        param = nn.Parameter(torch.ones(4, dtype=torch.float32), requires_grad=False)
+        loaded = torch.arange(4, dtype=torch.float32).unsqueeze(0)  # shape [1, 4]
+        self.loader(param, loaded)
+        self.assertEqual(param.data.shape, (4,))
+
+    def test_dtype_preserved_as_param_dtype(self):
+        param = nn.Parameter(torch.ones(4, dtype=torch.float32), requires_grad=False)
+        loaded = torch.arange(4, dtype=torch.float16)
+        self.loader(param, loaded)
+        self.assertEqual(param.data.dtype, torch.float32)
+
+
+class TestAscendC8KVCacheAttentionMethod(TestBase):
+    """Tests for AscendC8KVCacheAttentionMethod in kv_c8.py."""
+
+    def _make_method(self):
+        from vllm_ascend.quantization.methods.kv_c8 import AscendC8KVCacheAttentionMethod
+        return AscendC8KVCacheAttentionMethod(quant_description={}, prefix="model.layers.0.self_attn.attn")
+
+    def _make_layer_with_impl(self):
+        layer = nn.Module()
+        layer.impl = MagicMock()
+        return layer
+
+    def test_create_weights_sets_kv_cache_torch_dtype(self):
+        method = self._make_method()
+        layer = self._make_layer_with_impl()
+        method.create_weights(layer)
+        self.assertEqual(layer.kv_cache_torch_dtype, torch.int8)
+
+    def test_create_weights_registers_scale_offset_params(self):
+        method = self._make_method()
+        layer = self._make_layer_with_impl()
+        method.create_weights(layer)
+        self.assertIsInstance(layer.k_cache_scale, nn.Parameter)
+        self.assertIsInstance(layer.k_cache_offset, nn.Parameter)
+        self.assertIsInstance(layer.v_cache_scale, nn.Parameter)
+        self.assertIsInstance(layer.v_cache_offset, nn.Parameter)
+        self.assertFalse(layer.k_cache_scale.requires_grad)
+        self.assertFalse(layer.v_cache_offset.requires_grad)
+
+    def test_create_weights_initial_values(self):
+        method = self._make_method()
+        layer = self._make_layer_with_impl()
+        method.create_weights(layer)
+        self.assertEqual(layer.k_cache_scale.data.item(), 1.0)
+        self.assertEqual(layer.v_cache_scale.data.item(), 1.0)
+        self.assertEqual(layer.k_cache_offset.data.item(), 0.0)
+        self.assertEqual(layer.v_cache_offset.data.item(), 0.0)
+
+    def test_create_weights_assigns_weight_loader(self):
+        from vllm_ascend.quantization.methods.kv_c8 import _c8_kv_scale_weight_loader
+        method = self._make_method()
+        layer = self._make_layer_with_impl()
+        method.create_weights(layer)
+        self.assertIs(layer.k_cache_scale.weight_loader, _c8_kv_scale_weight_loader)
+        self.assertIs(layer.v_cache_scale.weight_loader, _c8_kv_scale_weight_loader)
+        self.assertIs(layer.k_cache_offset.weight_loader, _c8_kv_scale_weight_loader)
+        self.assertIs(layer.v_cache_offset.weight_loader, _c8_kv_scale_weight_loader)
+
+    def test_process_weights_after_loading_flattens(self):
+        method = self._make_method()
+        layer = nn.Module()
+        layer.k_cache_scale = nn.Parameter(torch.ones(2, 4), requires_grad=False)
+        layer.k_cache_offset = nn.Parameter(torch.zeros(2, 4), requires_grad=False)
+        layer.v_cache_scale = nn.Parameter(torch.ones(2, 4), requires_grad=False)
+        layer.v_cache_offset = nn.Parameter(torch.zeros(2, 4), requires_grad=False)
+        method.process_weights_after_loading(layer)
+        self.assertEqual(layer.k_cache_scale.data.dim(), 1)
+        self.assertEqual(layer.k_cache_scale.data.shape[0], 8)
+        self.assertEqual(layer.v_cache_offset.data.dim(), 1)
+
+    def test_apply_raises_runtime_error(self):
+        method = self._make_method()
+        layer = MagicMock()
+        with self.assertRaises(RuntimeError):
+            method.apply(layer, MagicMock(), MagicMock(), MagicMock(), None, None, None, None, None)
+
+
+class TestAscendC8AttentionBackendImplScales(TestBase):
+    """Tests for AscendC8AttentionBackendImpl scale helpers."""
+
+    def _make_impl(self, num_kv_heads=4, head_size=8):
+        from vllm_ascend.attention.attention_v1 import AscendC8AttentionBackendImpl
+        impl = object.__new__(AscendC8AttentionBackendImpl)
+        impl.num_heads = num_kv_heads
+        impl.num_kv_heads = num_kv_heads
+        impl.head_size = head_size
+        impl.scale = 1.0
+        impl.key_cache = None
+        impl.value_cache = None
+        return impl
+
+    def _make_layer(self, num_kv_heads=4, head_size=8):
+        layer = nn.Module()
+        layer.k_cache_scale = nn.Parameter(
+            torch.ones(num_kv_heads * head_size, dtype=torch.float32), requires_grad=False
+        )
+        layer.k_cache_offset = nn.Parameter(
+            torch.zeros(num_kv_heads * head_size, dtype=torch.float32), requires_grad=False
+        )
+        layer.v_cache_scale = nn.Parameter(
+            torch.ones(num_kv_heads * head_size, dtype=torch.float32), requires_grad=False
+        )
+        layer.v_cache_offset = nn.Parameter(
+            torch.zeros(num_kv_heads * head_size, dtype=torch.float32), requires_grad=False
+        )
+        return layer
+
+    @patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_rank", return_value=0)
+    @patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_world_size", return_value=1)
+    def test_prepare_c8_scales_runs_once(self, mock_tp_size, mock_tp_rank):
+        impl = self._make_impl()
+        layer = self._make_layer()
+        impl._prepare_c8_scales(layer, torch.device("cpu"))
+        self.assertTrue(hasattr(layer, "_c8_scales_prepared"))
+        self.assertTrue(layer._c8_scales_prepared)
+
+    @patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_rank", return_value=0)
+    @patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_world_size", return_value=1)
+    def test_prepare_c8_scales_idempotent(self, mock_tp_size, mock_tp_rank):
+        impl = self._make_impl()
+        layer = self._make_layer()
+        impl._prepare_c8_scales(layer, torch.device("cpu"))
+        k_scale_after_first = layer._c8_k_scale.clone()
+        layer.k_cache_scale.data = torch.ones(32, dtype=torch.float32) * 99
+        impl._prepare_c8_scales(layer, torch.device("cpu"))
+        self.assertTrue(torch.allclose(layer._c8_k_scale, k_scale_after_first))
+
+    @patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_rank", return_value=0)
+    @patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_world_size", return_value=1)
+    def test_prepare_c8_scales_creates_bnsd_shape(self, mock_tp_size, mock_tp_rank):
+        num_kv_heads, head_size = 4, 8
+        impl = self._make_impl(num_kv_heads, head_size)
+        layer = self._make_layer(num_kv_heads, head_size)
+        impl._prepare_c8_scales(layer, torch.device("cpu"))
+        self.assertEqual(layer._c8_k_aq_scale.shape, (1, num_kv_heads, 1, head_size))
+        self.assertEqual(layer._c8_v_aq_scale.shape, (1, num_kv_heads, 1, head_size))
+        self.assertEqual(layer._c8_k_aq_scale.dtype, torch.bfloat16)
+
+    @patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_rank", return_value=0)
+    @patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_world_size", return_value=1)
+    def test_quantize_kv_to_int8_output_dtype(self, mock_tp_size, mock_tp_rank):
+        num_kv_heads, head_size = 4, 8
+        impl = self._make_impl(num_kv_heads, head_size)
+        layer = self._make_layer(num_kv_heads, head_size)
+        impl._prepare_c8_scales(layer, torch.device("cpu"))
+        num_tokens = 6
+        key = torch.zeros(num_tokens, num_kv_heads, head_size, dtype=torch.bfloat16)
+        value = torch.zeros(num_tokens, num_kv_heads, head_size, dtype=torch.bfloat16)
+        key_q, value_q = impl._quantize_kv_to_int8(key, value, layer, num_tokens)
+        self.assertEqual(key_q.dtype, torch.int8)
+        self.assertEqual(value_q.dtype, torch.int8)
+        self.assertEqual(key_q.shape, key.shape)
+
+    @patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_rank", return_value=0)
+    @patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_world_size", return_value=1)
+    def test_quantize_kv_to_int8_formula(self, mock_tp_size, mock_tp_rank):
+        """With scale=2.0, offset=0: q = round(x / 2)."""
+        num_kv_heads, head_size = 1, 4
+        impl = self._make_impl(num_kv_heads, head_size)
+        layer = nn.Module()
+        scale_val = torch.full((num_kv_heads * head_size,), 2.0, dtype=torch.float32)
+        layer.k_cache_scale = nn.Parameter(scale_val.clone(), requires_grad=False)
+        layer.k_cache_offset = nn.Parameter(torch.zeros(num_kv_heads * head_size, dtype=torch.float32), requires_grad=False)
+        layer.v_cache_scale = nn.Parameter(scale_val.clone(), requires_grad=False)
+        layer.v_cache_offset = nn.Parameter(torch.zeros(num_kv_heads * head_size, dtype=torch.float32), requires_grad=False)
+        impl._prepare_c8_scales(layer, torch.device("cpu"))
+        key = torch.full((1, num_kv_heads, head_size), 4.0, dtype=torch.bfloat16)
+        value = torch.full((1, num_kv_heads, head_size), 4.0, dtype=torch.bfloat16)
+        key_q, _ = impl._quantize_kv_to_int8(key, value, layer, 1)
+        self.assertTrue(torch.all(key_q[0] == 2))
+
+    @patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_rank", return_value=0)
+    @patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_world_size", return_value=1)
+    def test_dequant_paged_kv_to_dense_round_trip(self, mock_tp_size, mock_tp_rank):
+        """With scale=1, offset=0: dequant(int8) == float(int8)."""
+        num_kv_heads, head_size = 2, 4
+        block_size = 32
+        num_blocks = 2
+        H = num_kv_heads * head_size
+        impl = self._make_impl(num_kv_heads, head_size)
+        layer = self._make_layer(num_kv_heads, head_size)
+        impl._prepare_c8_scales(layer, torch.device("cpu"))
+        key_int8 = torch.randint(-10, 10, (num_blocks, block_size, H), dtype=torch.int8)
+        value_int8 = torch.randint(-10, 10, (num_blocks, block_size, H), dtype=torch.int8)
+        seq_lens = [32, 32]
+        block_table = torch.tensor([[0], [1]], dtype=torch.long)
+        dense_k, dense_v = impl._dequant_paged_kv_to_dense(
+            key_int8, value_int8, block_table, seq_lens, torch.float32, layer
+        )
+        expected_k = key_int8.view(-1, num_kv_heads, head_size).float()
+        self.assertEqual(dense_k.shape, (64, num_kv_heads, head_size))
+        self.assertTrue(torch.allclose(dense_k, expected_k))
+
+
 if __name__ == "__main__":
     unittest.main(verbosity=2)
```

**Unit tests for `AscendModelSlimConfig`**

```diff
@@ -16,6 +16,7 @@ from vllm_ascend.quantization.modelslim_config import (
 from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD
 from vllm.model_executor.layers.attention import Attention
+from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 
 
 class TestAscendModelSlimConfig(TestBase):
@@ -125,6 +126,19 @@ class TestAscendModelSlimConfig(TestBase):
                                              attention_layer, "layers.1.attn")
         self.assertIs(method, mock_ascend_kvcache.return_value)
 
+    def test_get_quant_method_for_c8_kv_cache_attention(self):
+        c8_config = AscendModelSlimConfig({"kv_cache_type": "C8"})
+        attention_layer = MagicMock(spec=AttentionLayerBase)
+        mock_vllm_config = MagicMock()
+        mock_vllm_config.model_config.hf_config.model_type = None
+        with patch("vllm_ascend.quantization.modelslim_config.get_current_vllm_config", return_value=mock_vllm_config), \
+             patch("vllm_ascend.quantization.method_adapters.AscendKVCacheMethod", return_value=MagicMock()) as mock_kvcache:
+            method = c8_config.get_quant_method(attention_layer, "model.layers.0.self_attn.attn")
+        self.assertIs(method, mock_kvcache.return_value)
+        args, _ = mock_kvcache.call_args
+        from vllm_ascend.quantization.methods.kv_c8 import AscendC8KVCacheAttentionMethod
+        self.assertIsInstance(args[0], AscendC8KVCacheAttentionMethod)
+
     def test_get_quant_method_for_fused_moe(self):
         fused_moe_layer = MagicMock(spec=FusedMoE)
         fused_moe_layer.moe = MagicMock(spec=FusedMoEConfig)
```

**`vllm_ascend/attention/attention_v1.py`**

```diff
@@ -22,6 +22,7 @@ import torch
 import torch_npu
 import vllm.envs as envs_vllm
 from vllm.config import VllmConfig, get_current_vllm_config
+from vllm.distributed import get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size
 from vllm.utils.math_utils import cdiv
 from vllm.v1.attention.backend import (  # type: ignore
     AttentionBackend,
@@ -978,3 +979,364 @@ class AscendAttentionBackendImpl(AttentionImpl):
         attn_output = self.forward_impl(query, key, value, kv_cache, attn_metadata, output)
         output[:num_tokens] = attn_output[:num_tokens]
         return output
+
+
+class AscendC8AttentionBackendImpl(AscendAttentionBackendImpl):
+    """Attention backend implementation for INT8 KV cache (C8/QuaRot) models.
+
+    This subclass handles static per-channel INT8 KV cache quantization.
+    It is activated via class surgery in
+    AscendC8KVCacheAttentionMethod.create_weights
+    (vllm_ascend/quantization/methods/kv_c8.py) so that C8 attention layers
+    automatically use this forward path.
+    """
+
+    def forward(
+        self,
+        layer: AttentionLayer,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        kv_cache: tuple[torch.Tensor],
+        attn_metadata: AscendMetadata,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        assert output is not None, "Output tensor must be provided."
+        if output_scale is not None or output_block_scale is not None:
+            raise NotImplementedError(
+                "fused output quantization is not yet supported for AscendC8AttentionBackendImpl")
+        num_tokens = query.shape[0]
+        if attn_metadata is None:
+            return output.fill_(0)
+        float_key, float_value = None, None
+        if key is not None and value is not None:
+            if attn_metadata.attn_state != AscendAttentionState.DecodeOnly:
+                float_key, float_value = key, value
+            key, value = self._quantize_kv_to_int8(key, value, layer, attn_metadata.num_actual_tokens)
+        query, key, value, _ = self.reshape_and_cache(query, key, value, kv_cache, attn_metadata, output)
+        if attn_metadata.model_runner_type == "pooling":
+            attn_output = self._forward_encoder_attention(query, key, value, attn_metadata, output)
+            output[:num_tokens] = attn_output[:num_tokens]
+            return output
+        self._prepare_c8_scales(layer, query.device)
+        if attn_metadata.attn_state == AscendAttentionState.DecodeOnly:
+            return self._forward_c8_decode(query, attn_metadata, output, layer)
+        elif attn_metadata.attn_state == AscendAttentionState.ChunkedPrefill:
+            return self._forward_c8_chunked_prefill(query, float_key, float_value, attn_metadata, output, layer)
+        else:
+            return self._forward_c8_fused_infer_attention(
+                query,
+                float_key if float_key is not None else key,
+                float_value if float_value is not None else value,
+                attn_metadata,
+                output,
+                layer,
+            )
+
+    def _prepare_c8_scales(self, layer: AttentionLayer, device: torch.device) -> None:
+        """Shard per-channel C8 scales/offsets to this TP rank and pre-compute
+        BF16 BNSD antiquant tensors for the FIA V1 decode fast path.
+        """
+        if hasattr(layer, "_c8_scales_prepared"):
+            return
+
+        def _shard_and_reshape(raw: torch.Tensor) -> torch.Tensor:
+            if raw.numel() == 1:
+                return raw.to(device=device)
+            expected = self.num_kv_heads * self.head_size
+            if raw.numel() != expected:
+                total_kv_heads = raw.numel() // self.head_size
+                tp_rank = get_tensor_model_parallel_rank()
+                tp_size = get_tensor_model_parallel_world_size()
+                kv_head_start = tp_rank * total_kv_heads // tp_size
+                raw = raw.view(total_kv_heads, self.head_size)[
+                    kv_head_start:kv_head_start + self.num_kv_heads
+                ].contiguous()
+            return raw.view(1, self.num_kv_heads, self.head_size).to(device=device)
+
+        layer._c8_k_scale = _shard_and_reshape(layer.k_cache_scale.data)
+        layer._c8_k_offset = _shard_and_reshape(layer.k_cache_offset.data)
+        layer._c8_v_scale = _shard_and_reshape(layer.v_cache_scale.data)
+        layer._c8_v_offset = _shard_and_reshape(layer.v_cache_offset.data)
+        bnsd = (1, self.num_kv_heads, 1, self.head_size)
+        layer._c8_k_aq_scale = layer._c8_k_scale.to(torch.bfloat16).view(bnsd).contiguous()
+        layer._c8_k_aq_offset = layer._c8_k_offset.to(torch.bfloat16).view(bnsd).contiguous()
+        layer._c8_v_aq_scale = layer._c8_v_scale.to(torch.bfloat16).view(bnsd).contiguous()
+        layer._c8_v_aq_offset = layer._c8_v_offset.to(torch.bfloat16).view(bnsd).contiguous()
+        layer._c8_k_inv_scale_bf16 = (1.0 / layer._c8_k_scale).to(torch.bfloat16)
+        layer._c8_k_offset_bf16 = layer._c8_k_offset.to(torch.bfloat16)
+        layer._c8_v_inv_scale_bf16 = (1.0 / layer._c8_v_scale).to(torch.bfloat16)
+        layer._c8_v_offset_bf16 = layer._c8_v_offset.to(torch.bfloat16)
+        layer._c8_scales_prepared = True
+
+    def _dequant_paged_kv_to_dense(
+        self,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        block_table: torch.Tensor,
+        seq_lens: list,
+        target_dtype: torch.dtype,
+        layer,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Gather paged INT8 KV blocks and dequantize to target_dtype."""
+        batch_size = block_table.shape[0]
+        block_size = key.shape[1]
+        H = key.shape[2]
+        max_blocks_per_seq = block_table.shape[1]
+        max_tokens_padded = max_blocks_per_seq * block_size
+        flat_ids = block_table.reshape(-1)
+        gathered_k = key[flat_ids].view(batch_size, max_tokens_padded, H)
+        gathered_v = value[flat_ids].view(batch_size, max_tokens_padded, H)
+        seq_lens_t = torch.tensor(seq_lens, dtype=torch.long, device=key.device)
+        positions = torch.arange(max_tokens_padded, dtype=torch.long, device=key.device)
+        valid_mask = (positions.unsqueeze(0) < seq_lens_t.unsqueeze(1)).view(-1)
+        dense_k = gathered_k.view(-1, H)[valid_mask]
+        dense_v = gathered_v.view(-1, H)[valid_mask]
+        dense_k = dense_k.view(-1, self.num_kv_heads, self.head_size)
+        dense_v = dense_v.view(-1, self.num_kv_heads, self.head_size)
+        k_scale = layer._c8_k_scale.to(target_dtype)
+        k_offset = layer._c8_k_offset.to(target_dtype)
+        v_scale = layer._c8_v_scale.to(target_dtype)
+        v_offset = layer._c8_v_offset.to(target_dtype)
+        dense_k = (dense_k.to(target_dtype) - k_offset) * k_scale
+        dense_v = (dense_v.to(target_dtype) - v_offset) * v_scale
+        return dense_k, dense_v
+
+    def _quantize_kv_to_int8(
+        self,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        layer: AttentionLayer,
+        num_actual_tokens: int,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Quantize K/V from float to INT8 using static per-channel C8 scales."""
+        self._prepare_c8_scales(layer, key.device)
+        actual_key = key[:num_actual_tokens]
+        actual_value = value[:num_actual_tokens]
+        k_int8 = torch.clamp(
+            torch.round(actual_key * layer._c8_k_inv_scale_bf16 + layer._c8_k_offset_bf16),
+            -128,
+            127,
+        ).to(torch.int8)
+        v_int8 = torch.clamp(
+            torch.round(actual_value * layer._c8_v_inv_scale_bf16 + layer._c8_v_offset_bf16),
+            -128,
+            127,
+        ).to(torch.int8)
+        return k_int8, v_int8
+
+    def _forward_c8_decode(
+        self,
+        query: torch.Tensor,
+        attn_metadata: AscendMetadata,
+        output: torch.Tensor,
+        layer: AttentionLayer,
+    ) -> torch.Tensor:
+        """C8 decode via FIA V1 BNSD with native paged INT8 KV + perchannel antiquant."""
+        num_block, block_size, _, _ = self.key_cache.shape  # type: ignore[attr-defined]
+        assert block_size % 32 == 0, f"C8 INT8 KV cache requires block_size to be a multiple of 32, got {block_size}"
+        key = self.key_cache.view(num_block, block_size, -1)  # type: ignore[attr-defined]
+        value = self.value_cache.view(num_block, block_size, -1)  # type: ignore[attr-defined]
+        batch_size = len(attn_metadata.seq_lens_list)
+        attn_output, _ = torch_npu.npu_fused_infer_attention_score(
+            query[:batch_size].unsqueeze(2),
+            key,
+            value,
+            key_antiquant_scale=layer._c8_k_aq_scale,
+            key_antiquant_offset=layer._c8_k_aq_offset,
+            value_antiquant_scale=layer._c8_v_aq_scale,
+            value_antiquant_offset=layer._c8_v_aq_offset,
+            block_table=attn_metadata.block_tables,
+            actual_seq_lengths_kv=attn_metadata.seq_lens_list,
+            num_heads=self.num_heads,
+            num_key_value_heads=self.num_kv_heads,
+            input_layout="BNSD",
+            scale=self.scale,
+            block_size=block_size,
+            key_antiquant_mode=0,
+            value_antiquant_mode=0,
+            sparse_mode=0,
+        )
+        attn_output = attn_output.squeeze(2)
+        output[:batch_size] = attn_output
+        return output
+
+    def _forward_c8_chunked_prefill(
+        self,
+        query: torch.Tensor,
+        float_key: torch.Tensor | None,
+        float_value: torch.Tensor | None,
+        attn_metadata: AscendMetadata,
+        output: torch.Tensor,
+        layer: AttentionLayer,
+    ) -> torch.Tensor:
+        """C8 ChunkedPrefill: decode via FIA V1 BNSD paged INT8 (zero gather),
+        prefill via FIA V1 TND with float KV (new) or gather+dequant (continuing).
+        """
+        num_decode_tokens = attn_metadata.num_decode_tokens
+        num_decodes = attn_metadata.num_decodes
+        actual_seq_qlen = attn_metadata.actual_seq_lengths_q
+        num_tokens = int(actual_seq_qlen[-1])  # type: ignore[index]
+        if num_decode_tokens > 0:
+            num_block, block_size, _, _ = self.key_cache.shape  # type: ignore[attr-defined]
+            assert block_size % 32 == 0, (
+                f"C8 INT8 KV cache requires block_size to be a multiple of 32, got {block_size}"
+            )
+            kv_k = self.key_cache.view(num_block, block_size, -1)  # type: ignore[attr-defined]
+            kv_v = self.value_cache.view(num_block, block_size, -1)  # type: ignore[attr-defined]
+            attn_out, _ = torch_npu.npu_fused_infer_attention_score(
+                query[:num_decode_tokens].unsqueeze(2),
+                kv_k,
+                kv_v,
+                key_antiquant_scale=layer._c8_k_aq_scale,
+                key_antiquant_offset=layer._c8_k_aq_offset,
+                value_antiquant_scale=layer._c8_v_aq_scale,
+                value_antiquant_offset=layer._c8_v_aq_offset,
+                block_table=attn_metadata.block_tables[:num_decodes],
+                actual_seq_lengths_kv=attn_metadata.seq_lens_list[:num_decodes],
+                num_heads=self.num_heads,
+                num_key_value_heads=self.num_kv_heads,
+                input_layout="BNSD",
+                scale=self.scale,
+                block_size=block_size,
+                key_antiquant_mode=0,
+                value_antiquant_mode=0,
+                sparse_mode=0,
+            )
+            output[:num_decode_tokens] = attn_out.squeeze(2)
+        if attn_metadata.num_prefills > 0:
+            prefill_q = query[num_decode_tokens:num_tokens]
+            prefill_seq_qlen = [
+                actual_seq_qlen[i] - num_decode_tokens
+                for i in range(num_decodes, len(actual_seq_qlen))
+            ]
+            all_new_prefill = True
+            for i in range(num_decodes, len(attn_metadata.seq_lens_list)):
+                q_start = actual_seq_qlen[i - 1] if i > 0 else 0
+                qlen_i = actual_seq_qlen[i] - q_start
+                if attn_metadata.seq_lens_list[i] > qlen_i:
+                    all_new_prefill = False
+                    break
+            if all_new_prefill and float_key is not None and float_value is not None:
+                prefill_k = float_key[num_decode_tokens:num_tokens]
+                prefill_v = float_value[num_decode_tokens:num_tokens]
+                prefill_seq_kvlen = prefill_seq_qlen
+            else:
+                num_block, blk_size, _, _ = self.key_cache.shape  # type: ignore[attr-defined]
+                paged_k = self.key_cache.view(num_block, blk_size, -1)  # type: ignore[attr-defined]
+                paged_v = self.value_cache.view(num_block, blk_size, -1)  # type: ignore[attr-defined]
+                prefill_bt = attn_metadata.block_tables[num_decodes:]
+                prefill_sl = attn_metadata.seq_lens_list[num_decodes:]
+                prefill_k, prefill_v = self._dequant_paged_kv_to_dense(
+                    paged_k, paged_v, prefill_bt, prefill_sl, query.dtype, layer
+                )
+                prefill_seq_kvlen = torch.tensor(prefill_sl, dtype=torch.int32).cumsum(dim=0)
+            # block_table is None for prefill; FIA ignores block_size in this case.
+            # Use cache block_size for consistency rather than a magic number.
+            cache_block_size = self.key_cache.shape[1]  # type: ignore[attr-defined]
+            attn_out, _ = torch_npu.npu_fused_infer_attention_score(
+                query=prefill_q,
+                key=prefill_k,
+                value=prefill_v,
+                atten_mask=attn_metadata.attn_mask,
+                block_table=None,
+                input_layout="TND",
+                block_size=cache_block_size,
+                actual_seq_lengths=prefill_seq_qlen,
+                actual_seq_lengths_kv=prefill_seq_kvlen,
+                num_key_value_heads=self.num_kv_heads,
+                num_heads=self.num_heads,
+                scale=self.scale,
+                sparse_mode=3,
+            )
+            n_prefill = num_tokens - num_decode_tokens
+            attn_out = attn_out.view(n_prefill, self.num_heads, self.head_size)
+            output[num_decode_tokens:num_tokens] = attn_out[:n_prefill]
+        return output
+
+    def _forward_c8_fused_infer_attention(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attn_metadata: AscendMetadata,
+        output: torch.Tensor,
+        layer: AttentionLayer,
+    ):
+        """C8 FIA V1 TND for prefill states (PrefillNoCache uses float KV directly,
+        PrefillCacheHit gathers + dequants paged INT8 KV).
+        """
+        self._prepare_c8_scales(layer, query.device)
+        key, value, block_size, block_table, actual_seq_lengths_kv = self._get_fia_params(key, value, attn_metadata)
+        actual_seq_qlen = attn_metadata.actual_seq_lengths_q
+        num_tokens = int(actual_seq_qlen[-1])  # type: ignore[index]
+        query = query[:num_tokens]
+        if (
+            attn_metadata.attn_state == AscendAttentionState.PrefillNoCache
+            and self.attn_type != AttentionType.ENCODER_DECODER
+        ):
+            key = key[:num_tokens]
+            value = value[:num_tokens]
+        if key.dtype == torch.int8:
+            if block_table is not None:
+                seq_lens = (
+                    actual_seq_lengths_kv
+                    if isinstance(actual_seq_lengths_kv, list)
+                    else actual_seq_lengths_kv.tolist()
+                )
+                key, value = self._dequant_paged_kv_to_dense(key, value, block_table, seq_lens, query.dtype, layer)
+                block_table = None
+                # block_table is None after dequant; FIA ignores block_size.
+                # Use cache block_size for consistency rather than a magic number.
+                block_size = self.key_cache.shape[1]  # type: ignore[attr-defined]
+                actual_seq_lengths_kv = torch.tensor(seq_lens, dtype=torch.int32).cumsum(dim=0)
+            else:
+                qdt = query.dtype
+                k_scale = layer._c8_k_scale.to(qdt)
+                k_offset = layer._c8_k_offset.to(qdt)
+                v_scale = layer._c8_v_scale.to(qdt)
+                v_offset = layer._c8_v_offset.to(qdt)
+                key = (key.to(qdt) - k_offset) * k_scale
+                value = (value.to(qdt) - v_offset) * v_scale
+        attn_output, _ = torch_npu.npu_fused_infer_attention_score(
+            query=query,
+            key=key,
+            value=value,
+            atten_mask=attn_metadata.attn_mask,
+            block_table=block_table,
+            input_layout="TND",
+            block_size=block_size,
+            actual_seq_lengths=actual_seq_qlen,
+            actual_seq_lengths_kv=actual_seq_lengths_kv,
+            num_key_value_heads=self.num_kv_heads,
+            num_heads=self.num_heads,
+            scale=self.scale,
+            sparse_mode=3,
+        )
+        attn_output = attn_output.view(num_tokens, self.num_heads, self.head_size)
+        output[:num_tokens] = attn_output
+        return output
```

**Patch registry documentation**

```diff
@@ -721,3 +721,27 @@
 #    override _get_deepstack_input_embeds method with the flash comm v1 implementation.
 #    Future Plan:
 #    Remove this patch when https://github.com/vllm-project/vllm-ascend/issues/5712 is completed.
+#
+# ** 29. File: worker/patch_qwen3_c8.py**
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   1. `vllm.model_executor.models.qwen3.Qwen3ForCausalLM.load_weights`
+#    Why:
+#    The Qwen3 W8A8C8 model stores per-channel KV cache scales and offsets
+#    (k_cache_scale, k_cache_offset, v_cache_scale, v_cache_offset) under
+#    weight names that AutoWeightsLoader does not recognise and would
+#    silently discard. Without these scales the INT8 KV cache cannot be
+#    dequantised correctly at inference time.
+#    How:
+#    Wrap load_weights to intercept the C8 scale/offset tensors before they
+#    reach the base loader. Each intercepted tensor is routed to the
+#    corresponding nn.Parameter via its weight_loader, then excluded from
+#    the remaining weight stream so the base loader never sees it.
+#    Related PR (if no, explain why):
+#    This PR (Qwen3-32B W8A8C8 support). Upstream vLLM's weight-loading
+#    pipeline does not yet have a generic hook for hardware-plugin-defined
+#    KV cache parameters.
+#    Future Plan:
+#    Remove this patch when vLLM provides a first-class extension point
+#    for loading extra KV cache quantisation parameters in model load_weights,
+#    or when the Qwen3 model's weight names are aligned with the parameter
+#    names expected by the quantisation backend.
```

**Worker patch imports**

```diff
@@ -51,3 +51,4 @@ import vllm_ascend.patch.worker.patch_v2.patch_model_state  # noqa
 import vllm_ascend.patch.worker.patch_v2.patch_block_table  # noqa
 import vllm_ascend.patch.worker.patch_qwen3vl  # noqa
 import vllm_ascend.patch.worker.patch_deepencoder2  # noqa
+import vllm_ascend.patch.worker.patch_qwen3_c8  # noqa
```

**`vllm_ascend/patch/worker/patch_qwen3_c8.py` (new file)**

```diff
@@ -0,0 +1,54 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from collections.abc import Iterable
+
+import torch
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM
+
+_orig_qwen3_causal_lm_load_weights = Qwen3ForCausalLM.load_weights
+
+
+def _patched_qwen3_causal_lm_load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+    quant_config = self.quant_config
+    if quant_config is None or not callable(getattr(quant_config, "get_cache_scale", None)):
+        return _orig_qwen3_causal_lm_load_weights(self, weights)
+
+    params_dict = dict(self.named_parameters())
+    c8_loaded_params: set[str] = set()
+
+    def _intercept_c8_scales(
+        raw_weights: Iterable[tuple[str, torch.Tensor]],
+    ) -> Iterable[tuple[str, torch.Tensor]]:
+        for name, loaded_weight in raw_weights:
+            scale_name = quant_config.get_cache_scale(name)
+            if scale_name is not None:
+                if scale_name in params_dict:
+                    param = params_dict[scale_name]
+                    weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                    weight_loader(param, loaded_weight.squeeze())
+                    c8_loaded_params.add(scale_name)
+            else:
+                yield name, loaded_weight
+
+    loaded_params = _orig_qwen3_causal_lm_load_weights(self, _intercept_c8_scales(weights))
+    loaded_params.update(c8_loaded_params)
+    return loaded_params
+
+
+Qwen3ForCausalLM.load_weights = _patched_qwen3_causal_lm_load_weights
```

**`vllm_ascend/quantization/methods/kv_c8.py`**

```diff
@@ -2,11 +2,12 @@ import torch
 from vllm.config import get_current_vllm_config
 from vllm.distributed import get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size
 
+from .base import AscendAttentionScheme
 from .registry import register_scheme
 
 
-def weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor):
-    """fa_q weight loader."""
+def _fa_quant_weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor):
+    """Weight loader for MLA-based C8 (FAKQuant) models."""
     if param.numel() == 1 and loaded_weight.numel() == 1:
         param.data.fill_(loaded_weight.item())
     else:
@@ -50,7 +51,7 @@ class AscendFAQuantAttentionMethod:
             weight_param = torch.nn.Parameter(weight, requires_grad=False)
             module.register_parameter(weight_name, weight_param)
             # When loading weights, segment them according to TP
-            weight_param.weight_loader = weight_loader
+            weight_param.weight_loader = _fa_quant_weight_loader
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         fa_k_scale = torch.squeeze(layer.fa_k.scale).unsqueeze(0)
@@ -87,3 +88,60 @@ class AscendSFAQuantAttentionMethod:
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         pass
+
+
+def _c8_kv_scale_weight_loader(param: torch.nn.Parameter, loaded_weight: torch.Tensor) -> None:
+    """Weight loader for dense-attention C8 KV cache scales/offsets."""
+    loaded_weight = loaded_weight.squeeze()
+    if param.data.shape != loaded_weight.shape:
+        param.data = loaded_weight.to(param.dtype).clone()
+    else:
+        param.data.copy_(loaded_weight)
+
+
+class AscendC8KVCacheAttentionMethod(AscendAttentionScheme):
+    """C8 INT8 KV cache quantization for dense-attention models (e.g. Qwen3)."""
+
+    def __init__(self, quant_description: dict, prefix: str):
+        self.quant_description = quant_description
+        self.prefix = prefix
+
+    def create_weights(self, layer: torch.nn.Module) -> None:
+        # Override kv_cache_torch_dtype so Attention.get_kv_cache_spec returns int8 automatically.
+        layer.kv_cache_torch_dtype = torch.int8
+        # Upgrade impl to the C8-specific subclass so the C8 forward path is always used.
+        if hasattr(layer, "impl"):
+            from vllm_ascend.attention.attention_v1 import AscendC8AttentionBackendImpl
+            layer.impl.__class__ = AscendC8AttentionBackendImpl
+        layer.k_cache_scale = torch.nn.Parameter(torch.ones(1, dtype=torch.float32), requires_grad=False)
+        layer.k_cache_scale.weight_loader = _c8_kv_scale_weight_loader
+        layer.k_cache_offset = torch.nn.Parameter(torch.zeros(1, dtype=torch.float32), requires_grad=False)
+        layer.k_cache_offset.weight_loader = _c8_kv_scale_weight_loader
+        layer.v_cache_scale = torch.nn.Parameter(torch.ones(1, dtype=torch.float32), requires_grad=False)
+        layer.v_cache_scale.weight_loader = _c8_kv_scale_weight_loader
+        layer.v_cache_offset = torch.nn.Parameter(torch.zeros(1, dtype=torch.float32), requires_grad=False)
+        layer.v_cache_offset.weight_loader = _c8_kv_scale_weight_loader
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.k_cache_scale.data = layer.k_cache_scale.data.flatten()
+        layer.k_cache_offset.data = layer.k_cache_offset.data.flatten()
+        layer.v_cache_scale.data = layer.v_cache_scale.data.flatten()
+        layer.v_cache_offset.data = layer.v_cache_offset.data.flatten()
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        kv_cache,
+        attn_metadata,
+        attn_type,
+        scale,
+        output,
+    ) -> torch.Tensor:
+        raise RuntimeError(
+            "AscendC8KVCacheAttentionMethod.apply should not be called. "
+            "C8 KV cache quantization is handled by the attention backend."
+        )
```

**`vllm_ascend/quantization/modelslim_config.py`**

```diff
@@ -429,6 +429,21 @@ class AscendModelSlimConfig(QuantizationConfig):
             self._add_kvcache_quant_metadata()
             logger.info("Applied hf_to_vllm_mapper to quant_description keys")
 
+    def get_cache_scale(self, name: str) -> str | None:
+        """Map checkpoint C8 KV scale/offset names to vLLM parameter names."""
+        if self.quant_description.get("kv_cache_type") != "C8":
+            return None
+        _C8_SCALE_MAPPING = {
+            "k_proj.kv_cache_scale": "attn.k_cache_scale",
+            "k_proj.kv_cache_offset": "attn.k_cache_offset",
+            "v_proj.kv_cache_scale": "attn.v_cache_scale",
+            "v_proj.kv_cache_offset": "attn.v_cache_offset",
+        }
+        for src_suffix, dst_suffix in _C8_SCALE_MAPPING.items():
+            if name.endswith(src_suffix):
+                return name[: -len(src_suffix)] + dst_suffix
+        return None
+
     def quant_prefix_mapper(self, model_type: str, prefix: str) -> str:
         self.model_type = model_type
         return prefix
@@ -476,6 +491,10 @@ class AscendModelSlimConfig(QuantizationConfig):
         ):
             scheme = create_scheme_for_layer(self.quant_description, prefix, "attention", self.packed_modules_mapping)
             return AscendKVCacheMethod(scheme)
+        elif isinstance(layer, AttentionLayerBase) and self.quant_description.get("kv_cache_type") == "C8":
+            from .methods.kv_c8 import AscendC8KVCacheAttentionMethod
+            return AscendKVCacheMethod(AscendC8KVCacheAttentionMethod(self.quant_description, prefix))
         elif isinstance(layer, FusedMoE):
             if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                 # Delayed import to avoid circular import
```