[v0.18.0]feat(quant): add C8 INT8 KV cache support for GQA attention models (#7474) (#8007)

backport of #7474

This PR adds C8 (INT8) KV cache quantization support for standard GQA
attention models (e.g., Qwen3-32B W8A8C8). C8 uses static per-channel
quantization scales to store KV cache in INT8, reducing KV cache memory
by ~50% compared to BF16, enabling higher batch concurrency and longer
context lengths on the same hardware.
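
As background, the memory saving follows directly from element width: INT8 stores one byte per KV element versus two for BF16, with only a small per-channel scale/offset overhead. A hedged sketch of the per-channel round-trip (illustrative only, not the Ascend kernels):

```python
import torch

# Illustrative per-channel KV-cache quantization round-trip (not the production kernels).
# One scale/offset per channel (num_kv_heads * head_size), shared across all tokens.
def quantize_per_channel(x_bf16: torch.Tensor, scale: torch.Tensor, offset: torch.Tensor) -> torch.Tensor:
    # x_bf16: [num_tokens, num_kv_heads * head_size]
    # q = clamp(round(x / scale + offset), -128, 127)
    q = torch.round(x_bf16.float() / scale + offset)
    return q.clamp(-128, 127).to(torch.int8)

def dequantize_per_channel(q_int8: torch.Tensor, scale: torch.Tensor, offset: torch.Tensor) -> torch.Tensor:
    # Approximate inverse: x ≈ (q - offset) * scale
    return ((q_int8.float() - offset) * scale).to(torch.bfloat16)

# INT8 needs 1 byte per element vs. 2 bytes for BF16, hence the ~50% KV cache saving.
```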

**Key changes:**

1. **`attention_v1.py`** — New `AscendC8AttentionBackendImpl` subclass
of `AscendAttentionBackendImpl`:
- `_prepare_c8_scales`: Shards per-channel scales/offsets to the current
TP rank and pre-computes BF16 BNSD-shaped antiquant tensors (one-time
per layer).
- `_quantize_kv_to_int8`: Quantizes BF16 K/V to INT8 before
`reshape_and_cache`, using pre-cached inverse scales (see the sketch after
this list).
- `_forward_c8_decode`: FIA V1 BNSD paged attention with native INT8 KV
and `perchannel` antiquant mode.
- `_forward_c8_chunked_prefill`: Splits decode (FIA V1 BNSD paged INT8)
and prefill (FIA V1 TND float) into two kernel calls.
- `_forward_c8_fused_infer_attention`: Handles `PrefillNoCache` and
`PrefillCacheHit` states.

2. **`quantization/methods/kv_c8.py`** — New
`AscendC8KVCacheAttentionMethod` scheme:
- Creates `k/v_cache_scale/offset` parameters via
`_c8_kv_scale_weight_loader`, which handles per-channel scale shapes and
lazy resizing.
- Sets `layer.kv_cache_torch_dtype = torch.int8` so
`get_kv_cache_spec()` returns INT8 dtype automatically.
- Upgrades `layer.impl` to `AscendC8AttentionBackendImpl` via class
surgery.

3. **`quantization/modelslim_config.py`** — C8 branch in
`get_quant_method()` activates when `kv_cache_type == "C8"` in
`quant_model_description.json`.

4. **`patch/worker/patch_qwen3_c8.py`** — Intercepts per-channel C8
scale/offset weights before `AutoWeightsLoader` discards them, routing
them to the parameters created by `AscendC8KVCacheAttentionMethod`.

5. **`tests/ut/quantization/test_kv_c8.py`** — Unit tests covering
`_c8_kv_scale_weight_loader`, `AscendC8KVCacheAttentionMethod`, and
`AscendC8AttentionBackendImpl` scale helpers.
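
To make the decode-path flow from item 1 concrete, here is a minimal sketch of the two scale helpers, assuming a layer that already holds flat per-channel `k_cache_scale`/`v_cache_scale` parameters of length `num_kv_heads * head_size`. The names mirror the PR's helpers, but the bodies are simplified and are not the actual implementation:

```python
import torch
import torch.nn as nn

def prepare_c8_scales(layer: nn.Module, num_kv_heads: int, head_size: int) -> None:
    # One-time per layer: cache inverse scales and BNSD-shaped antiquant tensors.
    # Simplified: the real _prepare_c8_scales also shards the per-channel scales
    # to the current TP rank before reshaping.
    if getattr(layer, "_c8_scales_prepared", False):
        return
    k_scale = layer.k_cache_scale.data.float().view(num_kv_heads, head_size)
    v_scale = layer.v_cache_scale.data.float().view(num_kv_heads, head_size)
    # Inverse scales applied when writing INT8 into the paged cache.
    layer._c8_k_inv_scale = (1.0 / k_scale).reshape(1, num_kv_heads, head_size)
    layer._c8_v_inv_scale = (1.0 / v_scale).reshape(1, num_kv_heads, head_size)
    # BNSD-shaped (1, N, 1, D) BF16 antiquant scales consumed by the paged-attention kernel.
    layer._c8_k_aq_scale = k_scale.reshape(1, num_kv_heads, 1, head_size).to(torch.bfloat16)
    layer._c8_v_aq_scale = v_scale.reshape(1, num_kv_heads, 1, head_size).to(torch.bfloat16)
    layer._c8_scales_prepared = True

def quantize_kv_to_int8(key: torch.Tensor, value: torch.Tensor, layer: nn.Module):
    # Quantize BF16 K/V per channel before reshape_and_cache writes them to the cache.
    # Offsets omitted for brevity; the real helper also applies k/v_cache_offset.
    key_q = torch.round(key.float() * layer._c8_k_inv_scale).clamp(-128, 127).to(torch.int8)
    value_q = torch.round(value.float() * layer._c8_v_inv_scale).clamp(-128, 127).to(torch.int8)
    return key_q, value_q
```

In the real backend these run inside the attention forward pass; they are shown standalone here only for readability.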

User-facing change: yes. Users can now serve Qwen3-32B W8A8C8 quantized models with INT8 KV
cache on Ascend NPU. The model checkpoint must contain a
`quant_model_description.json` with `"kv_cache_type": "C8"` and
per-channel scale/offset tensors in safetensors.
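
For illustration, detection of the C8 mode could look like the following. This is a hedged sketch keyed on the fields named above, not the actual `modelslim_config.py` code, and the real schema may carry additional fields:

```python
import json
from pathlib import Path

def is_c8_kv_cache(model_dir: str) -> bool:
    # Return True when the checkpoint declares an INT8 (C8) KV cache.
    # Illustrative only: mirrors the activation condition described above.
    desc_path = Path(model_dir) / "quant_model_description.json"
    if not desc_path.is_file():
        return False
    with desc_path.open() as f:
        description = json.load(f)
    return description.get("kv_cache_type") == "C8"
```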

No changes to the serving CLI — the feature activates automatically when
the quantization config is detected.

Benchmarked with `vllm serve` (TP=8, `max_num_seqs=256`,
`max_model_len=131072`, `enable_chunked_prefill=true`) + `random_bench`
(input_len=10240, output_len=2048, 960 prompts, max_concurrency=192):

```
============ Serving Benchmark Result ============
Successful requests:                     960
Failed requests:                         0
Maximum request concurrency:             192
Benchmark duration (s):                  1359.81
Total input tokens:                      9830400
Total generated tokens:                  1966080
Request throughput (req/s):              0.71
Output token throughput (tok/s):         1445.85
Peak output token throughput (tok/s):    2304.00
Total token throughput (tok/s):          8675.12
---------------Time to First Token----------------
Mean TTFT (ms):                          24598.51
Median TTFT (ms):                        23167.02
P50 TTFT (ms):                           23167.02
P90 TTFT (ms):                           47717.08
P99 TTFT (ms):                           84402.61
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          120.76
Median TPOT (ms):                        121.50
P50 TPOT (ms):                           121.50
P90 TPOT (ms):                           127.05
P99 TPOT (ms):                           130.13
---------------Inter-token Latency----------------
Mean ITL (ms):                           120.70
Median ITL (ms):                         90.34
P50 ITL (ms):                            90.34
P90 ITL (ms):                            93.79
P99 ITL (ms):                            101.80
==================================================
```

All attention states verified: `PrefillNoCache`, `PrefillCacheHit`,
`ChunkedPrefill`, `DecodeOnly`.

- vLLM version: v0.17.0
- vLLM main: 8b6325758c

Signed-off-by: lico67373 <918688502@qq.com>
Co-authored-by: LICO67373 <110013619+LICO1314@users.noreply.github.com>

@@ -1,7 +1,9 @@
import unittest
import torch
import torch.nn as nn
-from unittest.mock import Mock, patch
+from unittest.mock import MagicMock, Mock, patch
from tests.ut.base import TestBase
class TestWeightLoader(unittest.TestCase):
@@ -10,7 +12,7 @@ class TestWeightLoader(unittest.TestCase):
def setUp(self):
"""Set up test environment before each test"""
# Import the module under test
-from vllm_ascend.quantization.methods.kv_c8 import weight_loader
+from vllm_ascend.quantization.methods.kv_c8 import _fa_quant_weight_loader as weight_loader
self.weight_loader = weight_loader
# Mock distributed functions
@@ -295,7 +297,7 @@ class TestAscendFAQuantAttentionMethodCreateWeights(unittest.TestCase):
method.create_weights(self.layer)
# Import weight_loader for comparison
-from vllm_ascend.quantization.methods.kv_c8 import weight_loader
+from vllm_ascend.quantization.methods.kv_c8 import _fa_quant_weight_loader as weight_loader
# Verify each parameter exists and has weight_loader
self.assertTrue(hasattr(self.layer.fa_q, "scale"))
@@ -440,7 +442,7 @@ class TestIntegration(unittest.TestCase):
v_offset = torch.randint(-128, 127, (1, 1), dtype=torch.int8)
# Load weights using weight_loader
-from vllm_ascend.quantization.methods.kv_c8 import weight_loader
+from vllm_ascend.quantization.methods.kv_c8 import _fa_quant_weight_loader as weight_loader
with torch.no_grad():
weight_loader(layer.fa_q.scale, q_scale)
@@ -464,5 +466,224 @@ class TestIntegration(unittest.TestCase):
self.assertTrue(hasattr(layer, "quant_kscale"))
class TestC8KVScaleWeightLoader(TestBase):
"""Tests for _c8_kv_scale_weight_loader in kv_c8.py."""
def setUp(self):
from vllm_ascend.quantization.methods.kv_c8 import _c8_kv_scale_weight_loader
self.loader = _c8_kv_scale_weight_loader
def test_shape_match_copies_value(self):
param = nn.Parameter(torch.ones(4, dtype=torch.float32), requires_grad=False)
loaded = torch.tensor([1.0, 2.0, 3.0, 4.0])
self.loader(param, loaded)
self.assertTrue(torch.allclose(param.data, loaded.float()))
def test_shape_mismatch_resizes_param(self):
param = nn.Parameter(torch.ones(1, dtype=torch.float32), requires_grad=False)
loaded = torch.arange(8, dtype=torch.float32)
self.loader(param, loaded)
self.assertEqual(param.data.shape, (8,))
self.assertTrue(torch.allclose(param.data, loaded))
def test_squeeze_before_compare(self):
param = nn.Parameter(torch.ones(4, dtype=torch.float32), requires_grad=False)
loaded = torch.arange(4, dtype=torch.float32).unsqueeze(0) # shape [1, 4]
self.loader(param, loaded)
self.assertEqual(param.data.shape, (4,))
def test_dtype_preserved_as_param_dtype(self):
param = nn.Parameter(torch.ones(4, dtype=torch.float32), requires_grad=False)
loaded = torch.arange(4, dtype=torch.float16)
self.loader(param, loaded)
self.assertEqual(param.data.dtype, torch.float32)
class TestAscendC8KVCacheAttentionMethod(TestBase):
"""Tests for AscendC8KVCacheAttentionMethod in kv_c8.py."""
def _make_method(self):
from vllm_ascend.quantization.methods.kv_c8 import AscendC8KVCacheAttentionMethod
return AscendC8KVCacheAttentionMethod(quant_description={}, prefix="model.layers.0.self_attn.attn")
def _make_layer_with_impl(self):
layer = nn.Module()
layer.impl = MagicMock()
return layer
def test_create_weights_sets_kv_cache_torch_dtype(self):
method = self._make_method()
layer = self._make_layer_with_impl()
method.create_weights(layer)
self.assertEqual(layer.kv_cache_torch_dtype, torch.int8)
def test_create_weights_registers_scale_offset_params(self):
method = self._make_method()
layer = self._make_layer_with_impl()
method.create_weights(layer)
self.assertIsInstance(layer.k_cache_scale, nn.Parameter)
self.assertIsInstance(layer.k_cache_offset, nn.Parameter)
self.assertIsInstance(layer.v_cache_scale, nn.Parameter)
self.assertIsInstance(layer.v_cache_offset, nn.Parameter)
self.assertFalse(layer.k_cache_scale.requires_grad)
self.assertFalse(layer.v_cache_offset.requires_grad)
def test_create_weights_initial_values(self):
method = self._make_method()
layer = self._make_layer_with_impl()
method.create_weights(layer)
self.assertEqual(layer.k_cache_scale.data.item(), 1.0)
self.assertEqual(layer.v_cache_scale.data.item(), 1.0)
self.assertEqual(layer.k_cache_offset.data.item(), 0.0)
self.assertEqual(layer.v_cache_offset.data.item(), 0.0)
def test_create_weights_assigns_weight_loader(self):
from vllm_ascend.quantization.methods.kv_c8 import _c8_kv_scale_weight_loader
method = self._make_method()
layer = self._make_layer_with_impl()
method.create_weights(layer)
self.assertIs(layer.k_cache_scale.weight_loader, _c8_kv_scale_weight_loader)
self.assertIs(layer.v_cache_scale.weight_loader, _c8_kv_scale_weight_loader)
self.assertIs(layer.k_cache_offset.weight_loader, _c8_kv_scale_weight_loader)
self.assertIs(layer.v_cache_offset.weight_loader, _c8_kv_scale_weight_loader)
def test_process_weights_after_loading_flattens(self):
method = self._make_method()
layer = nn.Module()
layer.k_cache_scale = nn.Parameter(torch.ones(2, 4), requires_grad=False)
layer.k_cache_offset = nn.Parameter(torch.zeros(2, 4), requires_grad=False)
layer.v_cache_scale = nn.Parameter(torch.ones(2, 4), requires_grad=False)
layer.v_cache_offset = nn.Parameter(torch.zeros(2, 4), requires_grad=False)
method.process_weights_after_loading(layer)
self.assertEqual(layer.k_cache_scale.data.dim(), 1)
self.assertEqual(layer.k_cache_scale.data.shape[0], 8)
self.assertEqual(layer.v_cache_offset.data.dim(), 1)
def test_apply_raises_runtime_error(self):
method = self._make_method()
layer = MagicMock()
with self.assertRaises(RuntimeError):
method.apply(layer, MagicMock(), MagicMock(), MagicMock(), None, None, None, None, None)
class TestAscendC8AttentionBackendImplScales(TestBase):
"""Tests for AscendC8AttentionBackendImpl scale helpers."""
def _make_impl(self, num_kv_heads=4, head_size=8):
from vllm_ascend.attention.attention_v1 import AscendC8AttentionBackendImpl
impl = object.__new__(AscendC8AttentionBackendImpl)
impl.num_heads = num_kv_heads
impl.num_kv_heads = num_kv_heads
impl.head_size = head_size
impl.scale = 1.0
impl.key_cache = None
impl.value_cache = None
return impl
def _make_layer(self, num_kv_heads=4, head_size=8):
layer = nn.Module()
layer.k_cache_scale = nn.Parameter(
torch.ones(num_kv_heads * head_size, dtype=torch.float32), requires_grad=False
)
layer.k_cache_offset = nn.Parameter(
torch.zeros(num_kv_heads * head_size, dtype=torch.float32), requires_grad=False
)
layer.v_cache_scale = nn.Parameter(
torch.ones(num_kv_heads * head_size, dtype=torch.float32), requires_grad=False
)
layer.v_cache_offset = nn.Parameter(
torch.zeros(num_kv_heads * head_size, dtype=torch.float32), requires_grad=False
)
return layer
@patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_rank", return_value=0)
@patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_world_size", return_value=1)
def test_prepare_c8_scales_runs_once(self, mock_tp_size, mock_tp_rank):
impl = self._make_impl()
layer = self._make_layer()
impl._prepare_c8_scales(layer, torch.device("cpu"))
self.assertTrue(hasattr(layer, "_c8_scales_prepared"))
self.assertTrue(layer._c8_scales_prepared)
@patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_rank", return_value=0)
@patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_world_size", return_value=1)
def test_prepare_c8_scales_idempotent(self, mock_tp_size, mock_tp_rank):
impl = self._make_impl()
layer = self._make_layer()
impl._prepare_c8_scales(layer, torch.device("cpu"))
k_scale_after_first = layer._c8_k_scale.clone()
layer.k_cache_scale.data = torch.ones(32, dtype=torch.float32) * 99
impl._prepare_c8_scales(layer, torch.device("cpu"))
self.assertTrue(torch.allclose(layer._c8_k_scale, k_scale_after_first))
@patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_rank", return_value=0)
@patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_world_size", return_value=1)
def test_prepare_c8_scales_creates_bnsd_shape(self, mock_tp_size, mock_tp_rank):
num_kv_heads, head_size = 4, 8
impl = self._make_impl(num_kv_heads, head_size)
layer = self._make_layer(num_kv_heads, head_size)
impl._prepare_c8_scales(layer, torch.device("cpu"))
self.assertEqual(layer._c8_k_aq_scale.shape, (1, num_kv_heads, 1, head_size))
self.assertEqual(layer._c8_v_aq_scale.shape, (1, num_kv_heads, 1, head_size))
self.assertEqual(layer._c8_k_aq_scale.dtype, torch.bfloat16)
@patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_rank", return_value=0)
@patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_world_size", return_value=1)
def test_quantize_kv_to_int8_output_dtype(self, mock_tp_size, mock_tp_rank):
num_kv_heads, head_size = 4, 8
impl = self._make_impl(num_kv_heads, head_size)
layer = self._make_layer(num_kv_heads, head_size)
impl._prepare_c8_scales(layer, torch.device("cpu"))
num_tokens = 6
key = torch.zeros(num_tokens, num_kv_heads, head_size, dtype=torch.bfloat16)
value = torch.zeros(num_tokens, num_kv_heads, head_size, dtype=torch.bfloat16)
key_q, value_q = impl._quantize_kv_to_int8(key, value, layer, num_tokens)
self.assertEqual(key_q.dtype, torch.int8)
self.assertEqual(value_q.dtype, torch.int8)
self.assertEqual(key_q.shape, key.shape)
@patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_rank", return_value=0)
@patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_world_size", return_value=1)
def test_quantize_kv_to_int8_formula(self, mock_tp_size, mock_tp_rank):
"""With scale=2.0, offset=0: q = round(x / 2)."""
num_kv_heads, head_size = 1, 4
impl = self._make_impl(num_kv_heads, head_size)
layer = nn.Module()
scale_val = torch.full((num_kv_heads * head_size,), 2.0, dtype=torch.float32)
layer.k_cache_scale = nn.Parameter(scale_val.clone(), requires_grad=False)
layer.k_cache_offset = nn.Parameter(torch.zeros(num_kv_heads * head_size, dtype=torch.float32), requires_grad=False)
layer.v_cache_scale = nn.Parameter(scale_val.clone(), requires_grad=False)
layer.v_cache_offset = nn.Parameter(torch.zeros(num_kv_heads * head_size, dtype=torch.float32), requires_grad=False)
impl._prepare_c8_scales(layer, torch.device("cpu"))
key = torch.full((1, num_kv_heads, head_size), 4.0, dtype=torch.bfloat16)
value = torch.full((1, num_kv_heads, head_size), 4.0, dtype=torch.bfloat16)
key_q, _ = impl._quantize_kv_to_int8(key, value, layer, 1)
self.assertTrue(torch.all(key_q[0] == 2))
@patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_rank", return_value=0)
@patch("vllm_ascend.attention.attention_v1.get_tensor_model_parallel_world_size", return_value=1)
def test_dequant_paged_kv_to_dense_round_trip(self, mock_tp_size, mock_tp_rank):
"""With scale=1, offset=0: dequant(int8) == float(int8)."""
num_kv_heads, head_size = 2, 4
block_size = 32
num_blocks = 2
H = num_kv_heads * head_size
impl = self._make_impl(num_kv_heads, head_size)
layer = self._make_layer(num_kv_heads, head_size)
impl._prepare_c8_scales(layer, torch.device("cpu"))
key_int8 = torch.randint(-10, 10, (num_blocks, block_size, H), dtype=torch.int8)
value_int8 = torch.randint(-10, 10, (num_blocks, block_size, H), dtype=torch.int8)
seq_lens = [32, 32]
block_table = torch.tensor([[0], [1]], dtype=torch.long)
dense_k, dense_v = impl._dequant_paged_kv_to_dense(
key_int8, value_int8, block_table, seq_lens, torch.float32, layer
)
expected_k = key_int8.view(-1, num_kv_heads, head_size).float()
self.assertEqual(dense_k.shape, (64, num_kv_heads, head_size))
self.assertTrue(torch.allclose(dense_k, expected_k))
if __name__ == "__main__":
unittest.main(verbosity=2)