diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index d37f337b..628314a4 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -179,6 +179,7 @@ jobs: VLLM_USE_MODELSCOPE: True if: ${{ inputs.type == 'full' }} run: | + pytest -sv tests/e2e/multicard/test_quantization.py pytest -sv tests/e2e/multicard/test_aclgraph_capture_replay.py pytest -sv tests/e2e/multicard/test_torchair_graph_mode.py pytest -sv tests/e2e/multicard/test_full_graph_mode.py diff --git a/docs/source/user_guide/feature_guide/index.md b/docs/source/user_guide/feature_guide/index.md index a8732b17..cda44f03 100644 --- a/docs/source/user_guide/feature_guide/index.md +++ b/docs/source/user_guide/feature_guide/index.md @@ -7,6 +7,7 @@ This section provides a detailed usage guide of vLLM Ascend features. :maxdepth: 1 graph_mode quantization +quantization-llm-compressor sleep_mode structured_output lora diff --git a/docs/source/user_guide/feature_guide/quantization-llm-compressor.md b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md new file mode 100644 index 00000000..a97b4de2 --- /dev/null +++ b/docs/source/user_guide/feature_guide/quantization-llm-compressor.md @@ -0,0 +1,65 @@ +# llm-compressor Quantization Guide + +Model quantization is a technique that reduces the size and computational requirements of a model by lowering the data precision of its weights and activation values, thereby saving memory and improving inference speed. + +## Supported llm-compressor Quantization Types + +CompressedTensorsW8A8 (static activation quantization) is supported: + +weight: per-channel, int8, symmetric; activation: per-tensor, int8, symmetric. + +CompressedTensorsW8A8Dynamic (dynamic activation quantization) is supported: + +weight: per-channel, int8, symmetric; activation: per-token, int8, symmetric, dynamic. + +## Install llm-compressor + +To quantize a model, first install [llm-compressor](https://github.com/vllm-project/llm-compressor/blob/main/README.md). It is a unified library for creating compressed models for faster inference with vLLM. + +Install llm-compressor with pip: + +```bash +pip install llmcompressor +``` + +### Generate the W8A8 weights + +```bash +cd examples/quantization/llm-compressor + +python3 w8a8_int8_dynamic.py +``` + +For more details, see the [official examples](https://github.com/vllm-project/llm-compressor/tree/main/examples). + +## Run the model + +Now you can run the quantized model with vLLM Ascend. Examples of offline and online inference are provided below: + +### Offline inference + +```python +import torch + +from vllm import LLM, SamplingParams + +prompts = [ + "Hello, my name is", + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.6, top_p=0.95, top_k=40) + +llm = LLM(model="{quantized_model_save_path}", + max_model_len=2048, + trust_remote_code=True) + +outputs = llm.generate(prompts, sampling_params) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +### Online inference + +Serve the quantized model with vLLM Ascend as usual; no modifications to the startup command are required. A minimal serving example is shown below.
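The exported checkpoint is served with the standard `vllm serve` command. The sketch below is illustrative: the model path, `--tensor-parallel-size`, and `--max-model-len` values are placeholders to adapt to your own export and hardware.

```bash
# Serve the llm-compressor quantized checkpoint; the path and parallelism
# settings are illustrative placeholders.
vllm serve {quantized_model_save_path} \
    --max-model-len 2048 \
    --tensor-parallel-size 2 \
    --trust-remote-code
```

Requests can then be sent to the OpenAI-compatible endpoint exactly as for an unquantized model.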
diff --git a/examples/quantization/llm-compressor/w8a8_int8.py b/examples/quantization/llm-compressor/w8a8_int8.py new file mode 100644 index 00000000..9a6cb392 --- /dev/null +++ b/examples/quantization/llm-compressor/w8a8_int8.py @@ -0,0 +1,160 @@ +import os +import torch + +from datasets import load_dataset +from transformers import AutoModelForCausalLM, Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration, \ + AutoTokenizer, AutoProcessor, AutoConfig, AutoImageProcessor + +from llmcompressor import oneshot +from llmcompressor.modifiers.awq import AWQModifier +from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier +from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme, QuantizationType, QuantizationStrategy + +W8A8_W_cha_A_ten_static_symmetric = { + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs( + num_bits=8, + type=QuantizationType.INT, + strategy=QuantizationStrategy.CHANNEL, + symmetric=True, + dynamic=False + ), + input_activations=QuantizationArgs( + num_bits=8, + type=QuantizationType.INT, + strategy=QuantizationStrategy.TENSOR, + symmetric=True, + dynamic=False + ), + ), +} + +# supported modifiers +MODIFIER_DICT = { + "PTQ": QuantizationModifier, + "AWQ": AWQModifier, + "GPTQ": GPTQModifier, +} + +# supported schemes +SCHEMES_DICT = { + "W8A8_W_cha_A_ten_static_symmetric": W8A8_W_cha_A_ten_static_symmetric, +} + +MODEL_DICT = { + "qwen3": AutoModelForCausalLM, +} + +TOKENIZER_DICT = { + "qwen3": AutoTokenizer, +} + + +def load_environment_variables(): + env_vars = { + 'model_path': "Qwen/Qwen3-32B", + 'export_path': "/llm-compressor/export/GPTQ/W8A8_W_cha_A_ten_static_symmetric", + 'modifier': "GPTQ", + 'schemes': "W8A8_W_cha_A_ten_static_symmetric", + 'calib_prompt_path': "HuggingFaceH4/ultrachat_200k" + } + + # verify export model path + if env_vars['export_path'] is None: + env_vars['export_path'] = env_vars['model_path'].rstrip("/") + "-" + env_vars['modifier'] + if env_vars['schemes'] is not None: + env_vars['export_path'] += "-" + env_vars['schemes'] + os.makedirs(env_vars['export_path'], exist_ok=True) + + return env_vars + + +def load_calibration_text_dataset(calib_prompt_path, tokenizer): + # Load dataset + for f in os.listdir(calib_prompt_path): + print(f) + if any(f.lower().endswith('.jsonl') for f in os.listdir(calib_prompt_path)): + ds = load_dataset('json', data_dir=calib_prompt_path, split='validation') + elif any(f.lower().endswith('.parquet') for f in os.listdir(calib_prompt_path)): + ds = load_dataset("parquet", data_dir=calib_prompt_path, split="train[:512]") + else: + raise ValueError("Unsupported calibration file format: {}".format( + calib_prompt_path.split('.')[-1])) + + # Preprocess dataset + def preprocess(example): + if tokenizer.chat_template is not None: + return {"text": tokenizer.apply_chat_template( + example["messages"], tokenize=False)} + else: + return {"text": example["messages"]} + + # Tokenize inputs + def tokenize(sample): + return tokenizer( + sample["text"], + add_special_tokens=False, + ) + + ds = ds.map(preprocess) + ds = ds.map(tokenize, remove_columns=ds.column_names) + return ds + + +# Define a oneshot data collator for multimodal inputs. 
+def data_collator(batch): + assert len(batch) == 1 + return { + key: torch.tensor(value, dtype=torch.bfloat16 if key == "pixel_values" else torch.long) + for key, value in batch[0].items() + } + + +def quantize_model(model, env_vars, dataset_dict=None): + # since the MoE gate layers are sensitive to quantization, we add them to the ignore + # list so they remain at full precision + ignore = ["lm_head", "re:.*mlp.down_proj"] + + # define a llmcompressor recipe + recipe = [ + MODIFIER_DICT[env_vars['modifier']]( + config_groups=SCHEMES_DICT[env_vars['schemes']], + ignore=ignore, + ), + ] + + # quantize the model + oneshot( + model=model, + dataset=dataset_dict, + recipe=recipe, + trust_remote_code_model=True, + ) + + +def save_quantized_model(model, tokenizer, save_path, save_compressed=False): + model.save_pretrained(save_path, save_compressed=save_compressed) + tokenizer.save_pretrained(save_path) + + +if __name__ == '__main__': + # get environment variables + env_vars = load_environment_variables() + + # support model type list + config = AutoConfig.from_pretrained(env_vars['model_path'], trust_remote_code=True) + model_type = config.model_type + + model = MODEL_DICT[model_type].from_pretrained( + env_vars['model_path'], torch_dtype="auto", trust_remote_code=True + ) + tokenizer = TOKENIZER_DICT[model_type].from_pretrained(env_vars['model_path'], trust_remote_code=True) + + ds = load_calibration_text_dataset(env_vars["calib_prompt_path"], tokenizer) + + # Quantize the model + quantize_model(model, env_vars, ds) + + # save the quantized model + save_quantized_model(model, tokenizer, env_vars['export_path'], True) \ No newline at end of file diff --git a/examples/quantization/llm-compressor/w8a8_int8_dynamic.py b/examples/quantization/llm-compressor/w8a8_int8_dynamic.py new file mode 100644 index 00000000..1cc9d21c --- /dev/null +++ b/examples/quantization/llm-compressor/w8a8_int8_dynamic.py @@ -0,0 +1,83 @@ +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.modifiers.smoothquant import SmoothQuantModifier +from llmcompressor.utils import dispatch_for_generation + +# Select model and load it. +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" + +# Select number of samples. 512 samples is a good place to start. +# Increasing the number of samples can improve accuracy. +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# Configure algorithms. 
In this case, we: +# * apply SmoothQuant to make the activations easier to quantize +# * quantize the weights to int8 with GPTQ (static per channel) +# * quantize the activations to int8 (dynamic per token) +recipe = [ + SmoothQuantModifier(smoothing_strength=0.8), + GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), +] + +# Apply algorithms and save to output_dir +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, +) + +# Confirm generations of the quantized model look sane. +print("\n\n") +print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("npu") +output = model.generate(input_ids, max_new_tokens=100) +print(tokenizer.decode(output[0])) +print("==========================================\n\n") + +# Save to disk compressed. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8-Dynamic-Per-Token" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) \ No newline at end of file diff --git a/mypy.ini b/mypy.ini index 38881158..7778a6f1 100644 --- a/mypy.ini +++ b/mypy.ini @@ -15,6 +15,15 @@ ignore_missing_imports = True [mypy-lm_eval.*] ignore_missing_imports = True +[mypy-compressed_tensors.*] +ignore_missing_imports = True + +[mypy-datasets.*] +ignore_missing_imports = True + +[mypy-llmcompressor.*] +ignore_missing_imports = True + [mypy-msprobe.*] ignore_missing_imports = True -allow_untyped_imports = True \ No newline at end of file +allow_untyped_imports = True diff --git a/pyproject.toml b/pyproject.toml index 1fa9d15f..7a97edc3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ requires = [ "quart", "numba", "opencv-python-headless<=4.11.0.86", # Required to avoid numpy version conflict with vllm + "compressed_tensors>=0.11.0" ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 41a14390..2a176f84 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,6 +16,7 @@ torchvision wheel pandas-stubs opencv-python-headless<=4.11.0.86 # Required to avoid numpy version conflict with vllm +compressed_tensors>=0.11.0 # requirements for disaggregated prefill msgpack diff --git a/tests/e2e/multicard/test_quantization.py b/tests/e2e/multicard/test_quantization.py new file mode 100644 index 00000000..67c57daf --- /dev/null +++ b/tests/e2e/multicard/test_quantization.py @@ -0,0 +1,46 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py +# +"""Compare the short outputs of HF and vLLM when using greedy sampling. + +Run `pytest tests/e2e/multicard/test_quantization.py`. 
+""" +from modelscope import snapshot_download # type: ignore + +from tests.e2e.conftest import VllmRunner + + +def test_models_distributed_quantized_W8A8(): + example_prompts = [ + "The president of the United States is", + ] + max_tokens = 5 + with VllmRunner(snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"), + tensor_parallel_size=2, + max_model_len=4096, + gpu_memory_utilization=0.8, + enforce_eager=False) as vllm_model: + vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens) + + golden_results = [ + 'The president of the United States is the head of state and', + ] + + for i in range(len(vllm_output)): + assert golden_results[i] == vllm_output[i][1] + print(f"Generated text: {vllm_output[i][1]!r}") diff --git a/tests/ut/quantization/test_quant_config.py b/tests/ut/quantization/test_quant_config.py index 4622692d..b667767b 100644 --- a/tests/ut/quantization/test_quant_config.py +++ b/tests/ut/quantization/test_quant_config.py @@ -65,7 +65,7 @@ class TestAscendQuantConfig(TestBase): # Test when NPU is available mock_is_available.return_value = True result = AscendQuantConfig.override_quantization_method(None, None) - self.assertEqual(result, ASCEND_QUANTIZATION_METHOD) + self.assertIsNone(result) # Test when NPU is not available mock_is_available.return_value = False @@ -93,7 +93,7 @@ class TestAscendQuantConfig(TestBase): self.assertIs(method, mock_ascend_linear.return_value) mock_ascend_linear.assert_called_once_with( self.ascend_config, ".attn", - self.ascend_config.packed_modules_mapping) + self.ascend_config.packed_modules_mapping, linear_layer) def test_get_quant_method_for_attention(self): attention_layer = MagicMock(spec=Attention) diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py index 798cf14a..5fe5cde3 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -9,7 +9,8 @@ from vllm.platforms import PlatformEnum from tests.ut.base import TestBase from vllm_ascend.platform import NPUPlatform -from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, AscendDeviceType +from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, + COMPRESSED_TENSORS_METHOD, AscendDeviceType) class TestNPUPlatform(TestBase): @@ -47,8 +48,9 @@ class TestNPUPlatform(TestBase): self.assertEqual(NPUPlatform.device_control_env_var, "ASCEND_RT_VISIBLE_DEVICES") self.assertEqual(NPUPlatform.dispatch_key, "PrivateUse1") - self.assertEqual(NPUPlatform.supported_quantization, - [ASCEND_QUANTIZATION_METHOD]) + self.assertEqual( + NPUPlatform.supported_quantization, + [ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD]) def test_is_sleep_mode_available(self): self.assertTrue(self.platform.is_sleep_mode_available()) diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 0797da32..9e8b2593 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -30,12 +30,13 @@ from vllm_ascend.ascend_config import (check_ascend_config, get_ascend_config, init_ascend_config) from vllm_ascend.torchair.utils import (check_torchair_cache_exist, delete_torchair_cache_file) -from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, AscendDeviceType, - enable_sp, get_ascend_device_type, is_vl_model, - prefill_context_parallel_enable, - update_aclgraph_sizes, - update_cudagraph_capture_sizes, - update_default_aclgraph_sizes) + +# isort: off +from vllm_ascend.utils import ( + ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD, AscendDeviceType, + enable_sp, get_ascend_device_type, is_vl_model, + prefill_context_parallel_enable, 
update_aclgraph_sizes, + update_cudagraph_capture_sizes, update_default_aclgraph_sizes) if TYPE_CHECKING: from vllm.config import ModelConfig, VllmConfig @@ -56,7 +57,9 @@ class NPUPlatform(Platform): device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES" dispatch_key: str = "PrivateUse1" - supported_quantization: list[str] = [ASCEND_QUANTIZATION_METHOD] + supported_quantization: list[str] = [ + ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD + ] def is_sleep_mode_available(self) -> bool: return True @@ -79,6 +82,8 @@ class NPUPlatform(Platform): if ASCEND_QUANTIZATION_METHOD not in quant_action.choices: quant_action.choices.append(ASCEND_QUANTIZATION_METHOD) + from vllm_ascend.quantization.compressed_tensors.compressed_tensors import \ + AscendCompressedTensorsConfig # noqa: F401 from vllm_ascend.quantization.quant_config import \ AscendQuantConfig # noqa: F401 diff --git a/vllm_ascend/quantization/compressed_tensors/__init__.py b/vllm_ascend/quantization/compressed_tensors/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py b/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py new file mode 100644 index 00000000..f95ff7f0 --- /dev/null +++ b/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py @@ -0,0 +1,252 @@ +from typing import TYPE_CHECKING, Any, Optional, cast + +import torch +from compressed_tensors.quantization import (QuantizationArgs, + QuantizationStrategy) +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization import ( + QUANTIZATION_METHODS, register_quantization_config) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import \ + CompressedTensorsScheme +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + find_matched_target, is_activation_quantization_format, + should_ignore_layer) + +from vllm_ascend.quantization.quant_config import (AscendLinearMethod, + AscendQuantConfig) +from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod +from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod +from vllm_ascend.utils import COMPRESSED_TENSORS_METHOD + +if TYPE_CHECKING: + from vllm.model_executor.models.utils import WeightsMapper + +logger = init_logger(__name__) + +QUANTIZATION_SCHEME_MAP_TYPE = dict[str, Optional[dict[str, QuantizationArgs]]] + + +def remove_quantization_method(): + if COMPRESSED_TENSORS_METHOD in QUANTIZATION_METHODS: + QUANTIZATION_METHODS.remove(COMPRESSED_TENSORS_METHOD) + + +remove_quantization_method() + + +@register_quantization_config(COMPRESSED_TENSORS_METHOD) +class AscendCompressedTensorsConfig(QuantizationConfig): + + def __init__( + self, + target_scheme_map: dict[str, Any], + ignore: list[str], + quant_format: str, + config: Optional[dict[str, Any]] = None, + ): + super().__init__() + self.ignore = ignore + self.quant_format = quant_format + # Map from [target -> scheme] + self.target_scheme_map = target_scheme_map + self.quant_description = config + + def get_name(self) -> str: + return "compressed-tensors" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.int8, torch.float16, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + raise 
NotImplementedError( + "Ascend hardware dose not support \"get_min_capability\" feature.") + + @classmethod + def get_config_filenames(cls) -> list[str]: + return [] + + @classmethod + def from_config(cls, config: dict[str, + Any]) -> "AscendCompressedTensorsConfig": + ignore: list[str] = cast(list[str], config.get("ignore", [])) + quant_format = cast(str, config.get("format")) + target_scheme_map = cls._quantization_scheme_map_from_config( + config=config) + + return cls( + target_scheme_map=target_scheme_map, + ignore=ignore, + quant_format=quant_format, + config=config, + ) + + @classmethod + def _quantization_scheme_map_from_config( + cls, config: dict[str, Any]) -> QUANTIZATION_SCHEME_MAP_TYPE: + """ + :param config: The `quantization_config` dictionary from config.json + :return: A dictionary mapping target layer names to their corresponding + quantization_args for weights and input activations + """ + target_scheme_map: dict[str, Any] = dict() + quant_format = cast(str, config.get("format")) + + # The quant_config has multiple config_groups, each containing + # an input_activations key with details about how the activations are + # quantized, a weights key indicating how the weights are quantized, + # and a list of targets under the `targets` key, dictating which + # layers are impacted by the quantization details. The quantization + # details follow the structure defined by the QuantizationArgs + # pydantic model, which is used to verify the structure of the + # quant_config and also store the details for later use. + + config_groups = config.get("config_groups", dict()) + for _, quant_config in config_groups.items(): + targets = quant_config.get("targets") + for target in targets: + target_scheme_map[target] = {} + target_scheme_map[target][ + "weights"] = QuantizationArgs.model_validate( + quant_config.get("weights")) + + target_scheme_map[target]["input_activations"] = None + target_scheme_map[target]["format"] = quant_config.get( + "format") + format = target_scheme_map[target].get("format") + # If no per-config format defined, use global format in config + act_quant_format = ( + is_activation_quantization_format(format) + if format is not None else + is_activation_quantization_format(quant_format)) + input_activations = quant_config.get("input_activations") + if act_quant_format and input_activations is not None: + target_scheme_map[target]["input_activations"] = ( + QuantizationArgs.model_validate( + quant_config.get("input_activations"))) + return target_scheme_map + + def get_quant_method( + self, + layer: torch.nn.Module, + prefix: str, + ) -> Optional["QuantizeMethodBase"]: + if isinstance(layer, LinearBase): + layer.ascend_quant_method = COMPRESSED_TENSORS_METHOD + # collect schemes + quant_scheme = self.get_scheme(layer=layer, layer_name=prefix) + + # choose quantization method + quant_method: LinearMethodBase = UnquantizedLinearMethod() + if quant_scheme is not None: + layer.scheme = quant_scheme + ascend_quant_config = AscendQuantConfig(self.quant_description + or {}) + quant_method = AscendLinearMethod(ascend_quant_config, prefix, + None, layer) + return quant_method + return None + + def get_scheme(self, + layer: torch.nn.Module, + layer_name: Optional[str] = None + ) -> Optional["CompressedTensorsScheme"]: + """ + compressed-tensors supports non uniform in the following way: + + targets of config_groups: There can be N config_groups which each + have a quantization scheme. 
Each config_group has a list of targets + which can be a full layer_name, a regex for a layer_name, or + an nn.Module name. + + Detect whether a layer_name is found in any target and + use the quantization scheme corresponding to the matched target + to select the CompressedTensorsScheme used for inference. + """ + + # Find the "target" in the compressed-tensors config + # that our layer conforms to. + if should_ignore_layer(layer_name, + ignore=self.ignore, + fused_mapping=self.packed_modules_mapping): + return None + + # Will be empty for models with only sparsity + weight_quant = input_quant = None + if self.target_scheme_map: + matched_target = find_matched_target( + layer_name=layer_name, + module=layer, + targets=self.target_scheme_map.keys(), + fused_mapping=self.packed_modules_mapping, + ) + + scheme_dict = self.target_scheme_map[matched_target] + weight_quant = scheme_dict.get("weights") + input_quant = scheme_dict.get("input_activations") + + if weight_quant is None: + logger.warning_once("Acceleration for non-quantized schemes is " + "not supported by Compressed Tensors. " + "Falling back to UnquantizedLinearMethod") + return None + + else: + # Find the quant_scheme + scheme = self._get_scheme_from_parts( + weight_quant=weight_quant, + input_quant=input_quant, + ) + return scheme + + def _get_scheme_from_parts( + self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> "CompressedTensorsScheme": + act_quant_format = is_activation_quantization_format(self.quant_format) + if act_quant_format and input_quant is not None: + if self._is_static_tensor_w8a8(weight_quant, input_quant): + return AscendW8A8LinearMethod() + + if self._is_dynamic_token_w8a8(weight_quant, input_quant): + return AscendW8A8DynamicLinearMethod() + + raise NotImplementedError( + "No compressed-tensors compatible scheme was found.") + + def _is_static_tensor_w8a8(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: + is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 + weight_strategy = ( + weight_quant.strategy == QuantizationStrategy.CHANNEL.value) + is_tensor = (weight_strategy and input_quant.strategy + == QuantizationStrategy.TENSOR.value) + is_static = not weight_quant.dynamic and not input_quant.dynamic + is_symmetric = weight_quant.symmetric and input_quant.symmetric + + # Only symmetric input quantization supported. + # Only symmetric weight quantization supported. + return is_8_bits and is_tensor and is_symmetric and is_static + + def _is_dynamic_token_w8a8(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: + is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 + weight_strategy = ( + weight_quant.strategy == QuantizationStrategy.CHANNEL.value) + is_token = (weight_strategy and input_quant.strategy + == QuantizationStrategy.TOKEN.value) + is_dynamic = not weight_quant.dynamic and input_quant.dynamic + is_symmetric = weight_quant.symmetric and input_quant.symmetric + + # Only symmetric input quantization supported. + # Only symmetric weight quantization supported. 
+ return is_8_bits and is_token and is_symmetric and is_dynamic + + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + self.target_scheme_map = hf_to_vllm_mapper.apply_dict( + self.target_scheme_map) + self.ignore = hf_to_vllm_mapper.apply_list(self.ignore) diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py index d6696304..72c04e50 100644 --- a/vllm_ascend/quantization/quant_config.py +++ b/vllm_ascend/quantization/quant_config.py @@ -94,8 +94,10 @@ class AscendQuantConfig(QuantizationConfig): @classmethod def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]: - if torch.npu.is_available(): - return ASCEND_QUANTIZATION_METHOD + if hf_quant_cfg is not None: + quant_method = hf_quant_cfg.get("quant_method", None) + if quant_method is None and torch.npu.is_available(): + return ASCEND_QUANTIZATION_METHOD return None def get_quant_method(self, layer: torch.nn.Module, @@ -113,7 +115,7 @@ class AscendQuantConfig(QuantizationConfig): self.packed_modules_mapping): return AscendUnquantizedLinearMethod() return AscendLinearMethod(self, prefix, - self.packed_modules_mapping) + self.packed_modules_mapping, layer) elif isinstance(layer, Attention) and \ 'fa_quant_type' in self.quant_description.keys() and \ self.quant_description['fa_quant_type'] is not None: @@ -126,13 +128,13 @@ class AscendQuantConfig(QuantizationConfig): self.packed_modules_mapping): return AscendUnquantizedFusedMoEMethod(layer.moe_config) return AscendFusedMoEMethod(self, prefix, - self.packed_modules_mapping) + self.packed_modules_mapping, layer) elif isinstance(layer, VocabParallelEmbedding): if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping): return UnquantizedEmbeddingMethod() return AscendEmbeddingMethod(self, prefix, - self.packed_modules_mapping) + self.packed_modules_mapping, layer) return None def is_layer_skipped_ascend( @@ -259,11 +261,16 @@ class AscendLinearMethod(LinearMethodBase): quant_config: The Ascend quantization config. """ - def __init__(self, quant_config: AscendQuantConfig, prefix: str, - packed_modules_mapping: Dict[str, Any]) -> None: + def __init__(self, + quant_config: AscendQuantConfig, + prefix: str, + packed_modules_mapping: Dict[str, Any] | None, + layer: torch.nn.Module = None) -> None: self.quant_method = get_quant_method(quant_config.quant_description, - prefix, "linear", - packed_modules_mapping) + prefix, + "linear", + packed_modules_mapping, + layer=layer) def create_weights( self, @@ -401,11 +408,16 @@ class AscendFusedMoEMethod(FusedMoEMethodBase): quant_config: The Ascend quantization config. 
""" - def __init__(self, quant_config: AscendQuantConfig, prefix: str, - packed_modules_mapping: Dict[str, Any]): + def __init__(self, + quant_config: AscendQuantConfig, + prefix: str, + packed_modules_mapping: Dict[str, Any], + layer: torch.nn.Module = None): self.quant_method = get_quant_method(quant_config.quant_description, - prefix, "moe", - packed_modules_mapping) + prefix, + "moe", + packed_modules_mapping, + layer=layer) def create_weights( self, @@ -485,7 +497,10 @@ class AscendEmbeddingMethod(AscendLinearMethod): """ def __init__(self, quant_config: AscendQuantConfig, prefix: str, - packed_modules_mapping: Dict[str, Any]) -> None: + packed_modules_mapping: Dict[str, Any], + layer: torch.nn.Module) -> None: self.quant_method = get_quant_method(quant_config.quant_description, - prefix, "linear", - packed_modules_mapping) + prefix, + "linear", + packed_modules_mapping, + layer=layer) diff --git a/vllm_ascend/quantization/utils.py b/vllm_ascend/quantization/utils.py index 6d914c0d..eaaaee86 100644 --- a/vllm_ascend/quantization/utils.py +++ b/vllm_ascend/quantization/utils.py @@ -1,7 +1,10 @@ from typing import Any, Dict, Optional, Type +import torch from vllm.logger import logger +from vllm_ascend.utils import COMPRESSED_TENSORS_METHOD + from .w4a4_flatquant_dynamic import AscendW4A4FlatQuantDynamicLinearMethod from .w4a8_dynamic import (AscendW4A8DynamicFusedMoEMethod, AscendW4A8DynamicLinearMethod) @@ -60,8 +63,28 @@ def get_linear_quant_type(quant_description: Dict[str, Any], prefix: str, def get_quant_method(quant_description: Dict[str, Any], prefix: str, layer_type: str, - packed_modules_mapping: Optional[Dict[str, Any]] = None): - logger.info_once("Using the vLLM Ascend Quantization now!") + packed_modules_mapping: Optional[Dict[str, Any]] = None, + layer: torch.nn.Module = None): + if quant_description.get("quant_method") == COMPRESSED_TENSORS_METHOD: + return get_quant_method_llmcompressor(layer) + + return get_quant_method_modelslim(quant_description, prefix, layer_type, + packed_modules_mapping) + + +def get_quant_method_llmcompressor(layer: torch.nn.Module): + logger.info_once("Using the vLLM Ascend llmcompressor Quantization now!") + if layer.scheme is None: + raise ValueError("A scheme must be defined for each layer") + return layer.scheme + + +def get_quant_method_modelslim( + quant_description: Dict[str, Any], + prefix: str, + layer_type: str, + packed_modules_mapping: Optional[Dict[str, Any]] = None): + logger.info_once("Using the vLLM Ascend modelslim Quantization now!") if packed_modules_mapping is None: packed_modules_mapping = dict() # Attention diff --git a/vllm_ascend/quantization/w8a8.py b/vllm_ascend/quantization/w8a8.py index c4f8fb04..8a7bbfe7 100644 --- a/vllm_ascend/quantization/w8a8.py +++ b/vllm_ascend/quantization/w8a8.py @@ -25,7 +25,8 @@ from vllm.forward_context import get_forward_context from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.ops.fused_moe.experts_selector import select_experts -from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, AscendDeviceType, +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, + COMPRESSED_TENSORS_METHOD, AscendDeviceType, get_ascend_device_type, is_enable_nz) @@ -149,6 +150,10 @@ class AscendW8A8LinearMethod: ) quant_bias = layer.quant_bias if tp_rank == 0 else None + if getattr(layer, "ascend_quant_method", + "") == COMPRESSED_TENSORS_METHOD: + quant_bias = bias + if get_ascend_device_type() == AscendDeviceType._310P: # On 300I Duo platform, we need transpose again if # 
using nz. This transpose can be skipped in torchair. @@ -187,6 +192,11 @@ class AscendW8A8LinearMethod: layer.weight.data, ACL_FORMAT_FRACTAL_NZ) layer.weight_scale.data = torch.flatten(layer.weight_scale.data) layer.weight_offset.data = torch.flatten(layer.weight_offset.data) + if getattr(layer, "ascend_quant_method", + "") == COMPRESSED_TENSORS_METHOD: + deq_scale = layer.input_scale.data * layer.weight_scale.data + layer.deq_scale = torch.nn.Parameter(deq_scale, + requires_grad=False) class AscendW8A8FusedMoEMethod: diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index efb1d5f5..0a74bcbf 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -41,6 +41,7 @@ else: VllmConfig = None ASCEND_QUANTIZATION_METHOD = "ascend" +COMPRESSED_TENSORS_METHOD = "compressed-tensors" SOC_VERSION_INFERENCE_SERIES = ["Ascend310P3"] REGISTERED_ASCEND_OPS = {}
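For reference, the snippet below is a minimal sketch of the compressed-tensors `quantization_config` structure that `AscendCompressedTensorsConfig.from_config` (via `_quantization_scheme_map_from_config`) parses; the concrete values are illustrative, describing a per-channel int8 weight / per-token dynamic int8 activation group rather than a real exported checkpoint.

```python
# Illustrative only: a minimal quantization_config dictionary in the
# compressed-tensors format, matching the dynamic W8A8 scheme handled by
# AscendW8A8DynamicLinearMethod.
from vllm_ascend.quantization.compressed_tensors.compressed_tensors import \
    AscendCompressedTensorsConfig

example_quant_config = {
    "format": "int-quantized",
    "ignore": ["lm_head"],
    "config_groups": {
        "group_0": {
            "targets": ["Linear"],
            "weights": {
                "num_bits": 8,
                "type": "int",
                "strategy": "channel",
                "symmetric": True,
                "dynamic": False,
            },
            "input_activations": {
                "num_bits": 8,
                "type": "int",
                "strategy": "token",
                "symmetric": True,
                "dynamic": True,
            },
        }
    },
}

# Builds the target -> {weights, input_activations, format} scheme map that
# get_scheme() later uses to select AscendW8A8LinearMethod or
# AscendW8A8DynamicLinearMethod for each matched Linear layer.
quant_config = AscendCompressedTensorsConfig.from_config(example_quant_config)
print(quant_config.target_scheme_map["Linear"])
```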