diff --git a/requirements-dev.txt b/requirements-dev.txt
index abc2ca09..e6193c50 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -21,5 +21,5 @@ pytest_mock
 msserviceprofiler>=1.2.2
 mindstudio-probe>=8.3.0
 arctic-inference==0.1.1
-xlite==0.1.0rc1
+xlite==0.1.0rc3
 uc-manager
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index 2d161a42..78046e3a 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -224,11 +224,15 @@ class NPUPlatform(Platform):
 
         from vllm.config.compilation import CUDAGraphMode
 
-        if ascend_config.xlite_graph_config.enabled and ascend_config.xlite_graph_config.full_mode:
-            logger.info("ACLGraph is disabled under xlite full mode")
-            enforce_eager = True
-            model_config.enforce_eager = True
-            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
+        if ascend_config.xlite_graph_config.enabled:
+            if ascend_config.xlite_graph_config.full_mode:
+                logger.info("ACLGraph is disabled under xlite full mode")
+                enforce_eager = True
+                model_config.enforce_eager = True
+                compilation_config.cudagraph_mode = CUDAGraphMode.NONE
+            else:
+                logger.info("Falling back to FULL_DECODE_ONLY under xlite decode-only mode")
+                compilation_config.cudagraph_mode = CUDAGraphMode.FULL_DECODE_ONLY
 
         if enforce_eager:
             logger.info("Compilation disabled, using eager mode by default")
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 51018023..a2e49373 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -2169,6 +2169,20 @@ class NPUModelRunner(GPUModelRunner):
             spec_decode_common_attn_metadata = spec_decode_common_attn_metadata.unpadded(num_tokens, num_reqs)
         return attn_metadata, spec_decode_common_attn_metadata
 
+    def _should_build_dummy_attn_metadata(
+        self,
+        force_attention: bool = False,
+        is_profile: bool = False,
+        cudagraph_runtime_mode: CUDAGraphMode | None = None,
+    ) -> bool:
+        """
+        Determine whether attention metadata should be built during dummy_run.
+        Subclasses can override this to add custom conditions.
+        """
+        # If force_attention is True, we always capture attention. Otherwise,
+        # it only happens for cudagraph_runtime_mode=FULL.
+        return force_attention or cudagraph_runtime_mode == CUDAGraphMode.FULL
+
     @torch.inference_mode()
     def _dummy_run(
         self,
@@ -2272,9 +2286,8 @@ class NPUModelRunner(GPUModelRunner):
         # vllm-ascend does not support ubatch now
         ubatch_slices, ubatch_slices_padded = None, None
         attn_metadata: PerLayerAttnMetadata | None = None
-        # If force_attention is True, we always capture attention. Otherwise,
-        # it only happens for cudagraph_runtime_mode=FULL.
-        if force_attention or cudagraph_runtime_mode == CUDAGraphMode.FULL:
+        # Build attention metadata for dummy_run
+        if self._should_build_dummy_attn_metadata(force_attention, is_profile, cudagraph_runtime_mode):
             if create_mixed_batch:
                 raise NotImplementedError(
                     "create_mixed_batch is used for warmup deepgemm, vllm-ascend does not need it"
diff --git a/vllm_ascend/xlite/xlite.py b/vllm_ascend/xlite/xlite.py
index 6a62b250..cd7423c9 100644
--- a/vllm_ascend/xlite/xlite.py
+++ b/vllm_ascend/xlite/xlite.py
@@ -157,9 +157,6 @@ class LlamaXliteModel(XliteModel):
 
 class QwenMoeXliteModel(LlamaXliteModel):
     def initialize(self, runnable: nn.Module, vllm_config: VllmConfig) -> tuple[Model, int, int, torch.dtype]:
-        if envs_ascend.VLLM_ASCEND_ENABLE_NZ == 2:
-            architecture = vllm_config.model_config.architectures[0]
-            raise ValueError(f"{architecture} not support VLLM_ASCEND_ENABLE_NZ = 2!")
         dtype = vllm_config.model_config.dtype
         config = self._build_model_config(vllm_config)
         xlite_model = self._build_model(runnable, vllm_config, config)
@@ -174,7 +171,6 @@ class QwenMoeXliteModel(LlamaXliteModel):
         config = super()._build_model_config(vllm_config)
         hf_config = vllm_config.model_config.hf_text_config
         ep_group = get_ep_group()
-        config.n_layers = hf_config.max_window_layers
         config.n_dense_layers = 0
         config.n_routed_experts = hf_config.num_experts
         config.n_shared_experts = 0
@@ -229,9 +225,8 @@ class XliteWrapper:
         rank = torch.distributed.get_rank()
         local_rank = get_world_group().local_rank
 
-        self.xlite_rt = Runtime(
-            local_rank, 0, rank, get_tensor_model_parallel_world_size(), vllm_config.parallel_config.data_parallel_size
-        )
+        self.data_parallel_size = vllm_config.parallel_config.data_parallel_size
+        self.xlite_rt = Runtime(local_rank, 0, rank, get_tensor_model_parallel_world_size(), self.data_parallel_size)
 
         (self.xlite_model, self.freq_cis, hidden_size, dtype) = xlite_model_init(runnable, vllm_config)
@@ -278,7 +273,16 @@ class XliteWrapper:
             AscendAttentionState.SpecDecoding,
         ]
 
-        if not with_prefill or self.full_mode:
+        # Full mode: xlite graph for both prefill and decode
+        # Decode-only mode: runnable for prefill, xlite graph for decode
+        if not self.full_mode and self.data_parallel_size > 1:
+            num_tokens = forward_context.batch_descriptor.num_tokens
+            num_reqs = forward_context.batch_descriptor.num_reqs
+            use_xlite_graph = num_reqs is not None and num_tokens <= num_reqs
+        else:
+            use_xlite_graph = not with_prefill or self.full_mode
+
+        if use_xlite_graph:
             # TODO: When vllm_ascend enables graph mode, attn_metadata.num_decodes
             # will be padded in decode requests. Therefore, it is first fixed using
             # num_decode_tokens. However, in the future, when MTP is enabled, there
@@ -299,7 +303,10 @@ class XliteWrapper:
             xlite_attn_metadata.is_prefills = [False] * num_decodes + [True] * num_prefills
             xlite_attn_metadata.block_tables = attn_metadata.block_tables.cpu().tolist()
 
-            h = self.hidden_states[: attn_metadata.num_actual_tokens]
+            # Compatibility between DP and non-DP scenarios
+            num_tokens = forward_context.batch_descriptor.num_tokens
+            num_actual_tokens = attn_metadata.num_actual_tokens
+            h = self.hidden_states[:num_tokens]
             stream = torch.npu.current_stream().npu_stream
             if inputs_embeds is None:
                 self.xlite_model.forward(
@@ -309,6 +316,6 @@ class XliteWrapper:
                     xlite_attn_metadata, self.kv_caches, self.freq_cis, h, stream
                 )
             else:
                 self.xlite_model.forward_with_inputs_embeds(
                     self.xlite_rt, inputs_embeds, xlite_attn_metadata, self.kv_caches, self.freq_cis, h, stream
                 )
-            return h
+            return h[:num_actual_tokens]
         else:
             return self.runnable(input_ids, positions, intermediate_tensors, inputs_embeds)
diff --git a/vllm_ascend/xlite/xlite_model_runner.py b/vllm_ascend/xlite/xlite_model_runner.py
index 44fc7ed0..c7322bc2 100644
--- a/vllm_ascend/xlite/xlite_model_runner.py
+++ b/vllm_ascend/xlite/xlite_model_runner.py
@@ -17,6 +17,7 @@
 # Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py
 # isort: skip_file
 import torch.nn as nn
+from vllm.config import CUDAGraphMode
 from vllm.v1.kv_cache_interface import KVCacheConfig
 
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
@@ -34,3 +35,18 @@ class XliteModelRunner(NPUModelRunner):
     def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         super().initialize_kv_cache(kv_cache_config)
         self.model.register_kv_caches(self.kv_caches)
+
+    def _should_build_dummy_attn_metadata(
+        self,
+        force_attention: bool = False,
+        is_profile: bool = False,
+        cudagraph_runtime_mode: CUDAGraphMode | None = None,
+    ) -> bool:
+        """
+        Override to build attention metadata during dummy_run when xlite is enabled.
+        For xlite, we need to build metadata during DP dummy_run to ensure all ranks
+        have consistent metadata, even when some ranks have no requests.
+        """
+        base_condition = super()._should_build_dummy_attn_metadata(force_attention, is_profile, cudagraph_runtime_mode)
+        xlite_condition = self.ascend_config.xlite_graph_config.enabled and not is_profile
+        return base_condition or xlite_condition
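
For reference, the graph/eager dispatch that the `XliteWrapper.forward` hunk introduces can be read as a standalone predicate. The sketch below is illustrative only: `BatchDescriptor` and `should_use_xlite_graph` are hypothetical names, with `BatchDescriptor` standing in for `forward_context.batch_descriptor`; neither is part of vllm-ascend.

```python
from dataclasses import dataclass


@dataclass
class BatchDescriptor:
    # Hypothetical stand-in for forward_context.batch_descriptor.
    num_tokens: int
    num_reqs: int | None


def should_use_xlite_graph(
    full_mode: bool,
    with_prefill: bool,
    data_parallel_size: int,
    batch: BatchDescriptor,
) -> bool:
    """Sketch of the dispatch rule added in XliteWrapper.forward.

    - Full mode: always run the xlite graph (prefill and decode).
    - Decode-only mode with DP > 1: run the graph only when the batch looks
      like pure decode (at most one token per request), judged from the batch
      descriptor so every DP rank reaches the same decision even when a rank
      has no local requests.
    - Otherwise: keep the original with_prefill check.
    """
    if not full_mode and data_parallel_size > 1:
        return batch.num_reqs is not None and batch.num_tokens <= batch.num_reqs
    return not with_prefill or full_mode
```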
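
Similarly, the `h = self.hidden_states[:num_tokens]` / `return h[:num_actual_tokens]` change separates the (possibly DP-padded) token count used to size the graph's output region from the rank-local count the runner consumes. A minimal sketch of that reading, with hypothetical function and parameter names:

```python
import torch


def trim_xlite_output(
    hidden_states: torch.Tensor,
    num_padded_tokens: int,
    num_actual_tokens: int,
) -> torch.Tensor:
    """Illustrative only. Under data parallelism, batch_descriptor.num_tokens
    may be padded so all DP ranks launch the same shape; the xlite graph fills
    that padded prefix, while the caller should only see the rank's real
    tokens (attn_metadata.num_actual_tokens)."""
    h = hidden_states[:num_padded_tokens]  # region written by the xlite graph
    return h[:num_actual_tokens]  # slice consumed by the model runner
```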