diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index 245fa85d..17fb04ff 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -215,6 +215,7 @@ jobs:
         if: ${{ inputs.type == 'light' }}
         run: |
           pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_distributed_mp_tp2_ep
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek3_2_w8a8_pruning_mtp_tp2_ep
 
       - name: Run vllm-project/vllm-ascend test (full)
         env:
@@ -237,6 +238,7 @@ jobs:
           pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek_v2_lite_fc1_tp2
           pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_qwen3_dense_fc1_tp2
           pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_qwen3_dense_prefetch_mlp_weight_tp2
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek3_2_w8a8_pruning_mtp_tp2_ep
           pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_weight_load.py
           pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_pipeline_parallel.py
diff --git a/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py b/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
index d06dece3..f5ce1730 100644
--- a/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
@@ -235,3 +235,29 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
             quantization="ascend",
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
+@patch.dict(os.environ, {"HCCL_OP_EXPANSION_MODE": "AIV"})
+@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "0"})
+@patch.dict(os.environ, {"ASCEND_AGGREGATE_ENABLE": "1"})
+@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
+def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    max_tokens = 5
+    with VllmRunner("vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
+                    tensor_parallel_size=2,
+                    quantization="ascend",
+                    enable_expert_parallel=True,
+                    compilation_config={
+                        "cudagraph_capture_sizes": [3, 6, 9, 12],
+                        "cudagraph_mode": "FULL_DECODE_ONLY"
+                    },
+                    speculative_config={
+                        "num_speculative_tokens": 2,
+                        "method": "deepseek_mtp"
+                    },
+                    reasoning_parser="deepseek_v3",
+                    tokenizer_mode="deepseek_v32") as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index aef66fa4..ced415c1 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -106,6 +106,8 @@ class EagleProposer(VllmEagleProposer):
         self.dcp_size = self.runner.dcp_size
         self.pcp_rank = self.runner.pcp_rank
         self.dcp_rank = self.runner.dcp_rank
+
+        self.use_aclgraph = self.runner._use_aclgraph()
 
         self.full_indices = range(
             self.runner.max_num_tokens * self.pcp_size * self.dcp_size +
diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py
index f2710050..eab2846d 100644
--- a/vllm_ascend/spec_decode/mtp_proposer.py
+++ b/vllm_ascend/spec_decode/mtp_proposer.py
@@ -242,11 +242,11 @@ class MtpProposer(EagleProposer):
         assert self.runner is not None
 
         # Note(qcs): We may need to refactor these check logics.
-        if self.use_cuda_graph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[
+        if self.runner.use_aclgraph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[
                 -1]:
             num_input_tokens = self.vllm_config.pad_for_cudagraph(
                 num_scheduled_tokens)
-        elif self.use_cuda_graph and num_tokens <= self.runner.cudagraph_batch_sizes[
+        elif self.use_aclgraph and num_tokens <= self.runner.cudagraph_batch_sizes[
                 -1]:
             # Acl graph mode, add padding to the batch size
             num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)