diff --git a/.github/doc_codespell.yaml b/.github/workflows/doc_codespell.yaml
similarity index 100%
rename from .github/doc_codespell.yaml
rename to .github/workflows/doc_codespell.yaml
diff --git a/.github/workflows/image_310p_openeuler.yml b/.github/workflows/image_310p_openeuler.yml
new file mode 100644
index 0000000..5b84a24
--- /dev/null
+++ b/.github/workflows/image_310p_openeuler.yml
@@ -0,0 +1,114 @@
+name: 'image / openEuler'
+# This is a docker build check and publish job:
+# 1. PR-triggered docker image build check
+#    - is for image build check
+#    - Enabled on main/*-dev branches
+#    - push: ${{ github.event_name != 'pull_request' }} ==> false
+# 2. branches push trigger image publish
+#    - is for branch/dev/nightly image
+#    - commits are merged into main/*-dev ==> vllm-ascend:main / vllm-ascend:*-dev
+# 3. tags push trigger image publish
+#    - is for final release image
+#    - Publish when tagged with v* (PEP 440 version) ===> vllm-ascend:v1.2.3-openeuler|latest / vllm-ascend:v1.2.3rc1-openeuler
+on:
+  pull_request:
+    branches:
+      - 'main'
+      - '*-dev'
+    paths:
+      - '.github/workflows/image_310p_openeuler.yml'
+      - 'Dockerfile.310p.openEuler'
+      - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
+  push:
+    # Publish image when tagging; the Dockerfile at the tag will be built as the tag image
+    branches:
+      - 'main'
+      - '*-dev'
+    tags:
+      - 'v*'
+    paths:
+      - '.github/workflows/image_310p_openeuler.yml'
+      - 'Dockerfile.310p.openEuler'
+      - 'vllm_ascend/**'
+
+jobs:
+  build:
+    name: vllm-ascend image build
+    runs-on: >-
+      ${{
+        github.event_name == 'push' && github.repository_owner == 'vllm-project' &&
+        'ubuntu-latest' ||
+        'ubuntu-24.04-arm'
+      }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Print
+        run: |
+          lscpu
+
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          # TODO(yikun): add more hub image and a note on release policy for container image
+          images: |
+            quay.io/ascend/vllm-ascend
+          # Note for test case
+          # https://github.com/marketplace/actions/docker-metadata-action#typeref
+          # 1. branch job publishes per main/*-dev branch commit
+          # 2. main and dev pull_request is build-only, so the tag pr-N-openeuler is fine
+          # 3. only pep440-matched tags will be published:
+          #    - v0.7.1 --> v0.7.1-openeuler, latest
+          #    - pre/post/dev: v0.7.1rc1-openeuler/v0.7.1rc1-openeuler/v0.7.1rc1.dev1-openeuler/v0.7.1.post1-openeuler, no latest
+          #    which follows the vLLM rule, with prefix v
+          # TODO(yikun): the post release might be considered as latest release
+          tags: |
+            type=ref,event=branch,suffix=-310p-openeuler
+            type=ref,event=pr,suffix=-openeuler
+            type=pep440,pattern={{raw}},suffix=-310p-openeuler
+
+      - name: Free up disk space
+        uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
+        with:
+          tool-cache: true
+          docker-images: false
+
+      - name: Build - Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Build - Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Publish - Login to Quay Container Registry
+        if: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
+        uses: docker/login-action@v3
+        with:
+          registry: quay.io
+          username: ${{ vars.QUAY_USERNAME }}
+          password: ${{ secrets.QUAY_PASSWORD }}
+
+      - name: Build and push 310p
+        uses: docker/build-push-action@v6
+        with:
+          platforms: >-
+            ${{
+              github.event_name == 'push' && github.repository_owner == 'vllm-project' &&
+              'linux/amd64,linux/arm64' ||
+              'linux/arm64'
+            }}
+          # use the current repo path as the build context, ensure .git is contained
+          context: .
+          # only trigger when tag, branch/main push
+          push: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
+          labels: ${{ steps.meta.outputs.labels }}
+          tags: ${{ steps.meta.outputs.tags }}
+          file: Dockerfile.310p.openEuler
+          build-args: |
+            PIP_INDEX_URL=https://pypi.org/simple
diff --git a/.github/workflows/image_310p_ubuntu.yml b/.github/workflows/image_310p_ubuntu.yml
new file mode 100644
index 0000000..02a59fb
--- /dev/null
+++ b/.github/workflows/image_310p_ubuntu.yml
@@ -0,0 +1,110 @@
+name: 'image / Ubuntu'
+# This is a docker build check and publish job:
+# 1. PR-triggered docker image build check
+#    - is for image build check
+#    - Enabled on main/*-dev branches
+#    - push: ${{ github.event_name != 'pull_request' }} ==> false
+# 2. branches push trigger image publish
+#    - is for branch/dev/nightly image
+#    - commits are merged into main/*-dev ==> vllm-ascend:main / vllm-ascend:*-dev
+# 3. tags push trigger image publish
+#    - is for final release image
+#    - Publish when tagged with v* (PEP 440 version) ===> vllm-ascend:v1.2.3|latest / vllm-ascend:v1.2.3rc1
+on:
+  pull_request:
+    branches:
+      - 'main'
+      - '*-dev'
+    paths:
+      - '.github/workflows/image_310p_ubuntu.yml'
+      - 'Dockerfile.310p'
+      - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
+  push:
+    # Publish image when tagging; the Dockerfile at the tag will be built as the tag image
+    branches:
+      - 'main'
+      - '*-dev'
+    tags:
+      - 'v*'
+    paths:
+      - '.github/workflows/image_310p_ubuntu.yml'
+      - 'Dockerfile.310p'
+      - 'vllm_ascend/**'
+jobs:
+
+  build:
+    name: vllm-ascend image build
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Print
+        run: |
+          lscpu
+
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          # TODO(yikun): add more hub image and a note on release policy for container image
+          images: |
+            quay.io/ascend/vllm-ascend
+          # Note for test case
+          # https://github.com/marketplace/actions/docker-metadata-action#typeref
+          # 1. branch job publishes per main/*-dev branch commit
+          # 2.
main and dev pull_request is build only, so the tag pr-N is fine + # 3. only pep440 matched tag will be published: + # - v0.7.1 --> v0.7.1, latest + # - pre/post/dev: v0.7.1rc1/v0.7.1rc1/v0.7.1rc1.dev1/v0.7.1.post1, no latest + # which follow the rule from vLLM with prefix v + # TODO(yikun): the post release might be considered as latest release + tags: | + type=ref,event=branch,suffix=-310p + type=ref,event=pr,suffix=-310p + type=pep440,pattern={{raw}},suffix=-310p + + - name: Free up disk space + uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 + with: + tool-cache: true + docker-images: false + + - name: Build - Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Build - Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Publish - Login to Quay Container Registry + if: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }} + uses: docker/login-action@v3 + with: + registry: quay.io + username: ${{ vars.QUAY_USERNAME }} + password: ${{ secrets.QUAY_PASSWORD }} + + - name: Build and push 310p + uses: docker/build-push-action@v6 + with: + platforms: >- + ${{ + github.event_name == 'push' && github.repository_owner == 'vllm-project' && + 'linux/amd64,linux/arm64' || + 'linux/amd64' + }} + # use the current repo path as the build context, ensure .git is contained + context: . + file: Dockerfile.310p + # only trigger when tag, branch/main push + push: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }} + labels: ${{ steps.meta.outputs.labels }} + tags: ${{ steps.meta.outputs.tags }} + build-args: | + PIP_INDEX_URL=https://pypi.org/simple diff --git a/.github/workflows/image_openeuler.yml b/.github/workflows/image_openeuler.yml index 690d814..c954e56 100644 --- a/.github/workflows/image_openeuler.yml +++ b/.github/workflows/image_openeuler.yml @@ -94,7 +94,7 @@ jobs: username: ${{ vars.QUAY_USERNAME }} password: ${{ secrets.QUAY_PASSWORD }} - - name: Build and push + - name: Build and push 910b uses: docker/build-push-action@v6 with: platforms: >- diff --git a/.github/workflows/image_ubuntu.yml b/.github/workflows/image_ubuntu.yml index a2cfbce..69fe385 100644 --- a/.github/workflows/image_ubuntu.yml +++ b/.github/workflows/image_ubuntu.yml @@ -90,7 +90,7 @@ jobs: username: ${{ vars.QUAY_USERNAME }} password: ${{ secrets.QUAY_PASSWORD }} - - name: Build and push + - name: Build and push 910b uses: docker/build-push-action@v6 with: platforms: >- @@ -101,6 +101,7 @@ jobs: }} # use the current repo path as the build context, ensure .git is contained context: . + file: Dockerfile # only trigger when tag, branch/main push push: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }} labels: ${{ steps.meta.outputs.labels }} diff --git a/Dockerfile.310p b/Dockerfile.310p new file mode 100644 index 0000000..fffe73e --- /dev/null +++ b/Dockerfile.310p @@ -0,0 +1,61 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +FROM quay.io/ascend/cann:8.1.rc1-310p-ubuntu22.04-py3.10 + +ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" +ARG COMPILE_CUSTOM_KERNELS=1 + +# Define environments +ENV DEBIAN_FRONTEND=noninteractive +ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} + +RUN apt-get update -y && \ + apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \ + rm -rf /var/cache/apt/* && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /workspace + +COPY . /vllm-workspace/vllm-ascend/ + +RUN pip config set global.index-url ${PIP_INDEX_URL} + +# Install vLLM +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +ARG VLLM_TAG=v0.9.1 +RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. +RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip uninstall -y triton && \ + python3 -m pip cache purge + +# Install vllm-ascend +# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH +RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ + source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ + source /usr/local/Ascend/nnal/atb/set_env.sh && \ + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ + export SOC_VERSION=ASCEND310P3 && \ + python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip cache purge + +# Install modelscope (for fast download) and ray (for multinode) +RUN python3 -m pip install modelscope ray && \ + python3 -m pip cache purge + +CMD ["/bin/bash"] diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler new file mode 100644 index 0000000..da4718c --- /dev/null +++ b/Dockerfile.310p.openEuler @@ -0,0 +1,58 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +FROM quay.io/ascend/cann:8.1.rc1-310p-openeuler22.03-py3.10 + +ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" +ARG COMPILE_CUSTOM_KERNELS=1 + +ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} + +RUN yum update -y && \ + yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \ + rm -rf /var/cache/yum + +RUN pip config set global.index-url ${PIP_INDEX_URL} + +WORKDIR /workspace + +COPY . /vllm-workspace/vllm-ascend/ + +# Install vLLM +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +ARG VLLM_TAG=v0.9.1 + +RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +# In x86, triton will be installed by vllm. 
But in Ascend, triton doesn't work correctly. we need to uninstall it. +RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip uninstall -y triton && \ + python3 -m pip cache purge + +# Install vllm-ascend +RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ + source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ + source /usr/local/Ascend/nnal/atb/set_env.sh && \ + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ + export SOC_VERSION=ASCEND310P3 && \ + python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip cache purge + +# Install modelscope (for fast download) and ray (for multinode) +RUN python3 -m pip install modelscope ray && \ + python3 -m pip cache purge + +CMD ["/bin/bash"] diff --git a/csrc/kernels/get_masked_input_and_mask_kernel.cpp b/csrc/kernels/get_masked_input_and_mask_kernel.cpp index 47ce826..25aeb60 100644 --- a/csrc/kernels/get_masked_input_and_mask_kernel.cpp +++ b/csrc/kernels/get_masked_input_and_mask_kernel.cpp @@ -54,6 +54,7 @@ public: pipe.InitBuffer(maskQueue, 1, size_ * sizeof(bool)); // Initialize calculation buffers + // NOTE: calc_buf_1 and calc_buf_2 are also used for int16 casting on older archs. pipe.InitBuffer(calc_buf_1, size_ * sizeof(float)); pipe.InitBuffer(calc_buf_2, size_ * sizeof(float)); @@ -66,7 +67,7 @@ public: // Initialize temporary buffers pipe.InitBuffer(start_buf, size_ * sizeof(float)); pipe.InitBuffer(end_buf, size_ * sizeof(float)); - pipe.InitBuffer(inputFloat_buf, size_ * sizeof(float)); + pipe.InitBuffer(inputFloat_buf, size_ * sizeof(float)); // Also used for half intermediate in casting pipe.InitBuffer(validOffset_buf, size_ * sizeof(float)); pipe.InitBuffer(vocabMask_buf_, size_ * sizeof(int8_t)); pipe.InitBuffer(ones_buf_, size_ * sizeof(float)); @@ -121,7 +122,6 @@ private: const float start_value, const float end_value) { - // Use already initialized buffers AscendC::LocalTensor start_value_tensor = start_buf.Get(); AscendC::LocalTensor end_value_tensor = end_buf.Get(); @@ -134,7 +134,35 @@ private: CompareWithValue(ge_result, start_value_tensor, input, true); CompareWithValue(lt_result, input, end_value_tensor, false); +#if (__CCE_AICORE__ >= 220) AscendC::And(range_mask, ge_result, lt_result, size_); +#else + { + // WORKAROUND for older arch + // No direct int8->int16 cast. Use half as intermediate. + // No direct int8 And. Use int16 And. + AscendC::LocalTensor ge_result_i16 = calc_buf_1.Get(); + AscendC::LocalTensor lt_result_i16 = calc_buf_2.Get(); + AscendC::LocalTensor range_mask_i16 = ge_result_i16; + + // Use a temporary buffer for half type + AscendC::LocalTensor tmp_half = inputFloat_buf.Get(); + + // 1. Cast inputs: int8_t -> half -> int16_t + AscendC::Cast(tmp_half, ge_result, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(ge_result_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_); + + AscendC::Cast(tmp_half, lt_result, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(lt_result_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_); + + // 2. Perform And on int16_t tensors + AscendC::And(range_mask_i16, ge_result_i16, lt_result_i16, size_); + + // 3. 
Cast result back: int16_t -> half -> int8_t + AscendC::Cast(tmp_half, range_mask_i16, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(range_mask, tmp_half, AscendC::RoundMode::CAST_NONE, size_); + } +#endif } __aicore__ inline void Compute() { @@ -145,24 +173,18 @@ private: AscendC::LocalTensor inputFloat = inputFloat_buf.Get(); AscendC::Cast(inputFloat, inputLocal, AscendC::RoundMode::CAST_NONE, size_); - // Calculate mask for org_vocab range - // org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index) AscendC::LocalTensor orgVocabMask = result_org_mask_que.AllocTensor(); ComputeRangeMask(orgVocabMask, inputFloat, static_cast(org_vocab_start_index_), static_cast(org_vocab_end_index_)); - // Calculate mask for added_vocab range - // added_vocab_mask = (input_ >= added_vocab_start_index) & (input_ < added_vocab_end_index) AscendC::LocalTensor addedVocabMask = result_add_mask_que.AllocTensor(); ComputeRangeMask(addedVocabMask, inputFloat, static_cast(added_vocab_start_index_), static_cast(added_vocab_end_index_)); - // Calculate validOffset - // valid_offset = (org_vocab_start_index * org_vocab_mask) + (added_offset * added_vocab_mask) AscendC::LocalTensor validOffset = validOffset_buf.Get(); AscendC::LocalTensor constOrgStartIndex = start_buf.Get(); @@ -173,10 +195,7 @@ private: AscendC::Cast(orgVocabMask_fp16, orgVocabMask, AscendC::RoundMode::CAST_NONE, size_); AscendC::Cast(orgVocabMask_fp32, orgVocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_); - AscendC::Mul(validOffset, - constOrgStartIndex, - orgVocabMask_fp32, - size_); + AscendC::Mul(validOffset, constOrgStartIndex, orgVocabMask_fp32, size_); AscendC::LocalTensor addedOffset; AscendC::LocalTensor addedOffsetTensor = end_buf.Get(); @@ -187,44 +206,61 @@ private: AscendC::Cast(addedVocabMask_fp16, addedVocabMask, AscendC::RoundMode::CAST_NONE, size_); AscendC::Cast(addedVocabMask_fp32, addedVocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_); - AscendC::Mul(addedOffset, - addedOffsetTensor, - addedVocabMask_fp32, - size_); - + AscendC::Mul(addedOffset, addedOffsetTensor, addedVocabMask_fp32, size_); AscendC::Add(validOffset, validOffset, addedOffset, size_); - // vocab_mask = org_vocab_mask | added_vocab_mask AscendC::LocalTensor vocabMask = vocabMask_buf_.Get(); - + +#if (__CCE_AICORE__ >= 220) AscendC::Or(vocabMask, orgVocabMask, addedVocabMask, size_); - +#else + { + // WORKAROUND for older arch + // No direct int8->int16 cast. Use half as intermediate. + // No direct int8 Or. Use int16 Or. + AscendC::LocalTensor orgVocabMask_i16 = calc_buf_1.Get(); + AscendC::LocalTensor addedVocabMask_i16 = calc_buf_2.Get(); + AscendC::LocalTensor vocabMask_i16 = orgVocabMask_i16; + + // Use a temporary buffer for half type. inputFloat_buf is free now. + AscendC::LocalTensor tmp_half = inputFloat_buf.Get(); + + // 1. Cast inputs: int8_t -> half -> int16_t + AscendC::Cast(tmp_half, orgVocabMask, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(orgVocabMask_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_); + + AscendC::Cast(tmp_half, addedVocabMask, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(addedVocabMask_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_); + + // 2. Perform Or on int16_t tensors + AscendC::Or(vocabMask_i16, orgVocabMask_i16, addedVocabMask_i16, size_); + + // 3. 
Cast result back: int16_t -> half -> int8_t + AscendC::Cast(tmp_half, vocabMask_i16, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(vocabMask, tmp_half, AscendC::RoundMode::CAST_NONE, size_); + } +#endif + AscendC::Sub(inputFloat, inputFloat, validOffset, size_); - // input_ = vocab_mask * (input_ - valid_offset) AscendC::LocalTensor vocabMask_fp16; AscendC::LocalTensor vocabMask_fp32; AscendC::Cast(vocabMask_fp16, vocabMask, AscendC::RoundMode::CAST_NONE, size_); AscendC::Cast(vocabMask_fp32, vocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_); - AscendC::LocalTensor inputFloat_fp32; AscendC::Mul(inputFloat, inputFloat, vocabMask_fp32, size_); AscendC::Cast(maskedLocal, inputFloat, AscendC::RoundMode::CAST_CEIL, size_); outQueue.EnQue(maskedLocal); - // ~vocab_mask AscendC::LocalTensor ones_tensor = ones_buf_.Get(); AscendC::Duplicate(ones_tensor, (float)1, size_); AscendC::LocalTensor maskLocal_fp32; - AscendC::Sub(maskLocal_fp32, - ones_tensor, - vocabMask_fp32, - size_); + AscendC::Sub(maskLocal_fp32, ones_tensor, vocabMask_fp32, size_); AscendC::LocalTensor maskLocal_fp16; AscendC::Cast(maskLocal_fp16, maskLocal_fp32, AscendC::RoundMode::CAST_NONE, size_); @@ -262,8 +298,6 @@ private: // Temporary buffers AscendC::TBuf start_buf; AscendC::TBuf end_buf; - - // Temporary buffers continued AscendC::TBuf inputFloat_buf; AscendC::TBuf validOffset_buf; AscendC::TBuf vocabMask_buf_; @@ -342,4 +376,3 @@ void get_masked_input_and_mask_impl( } } // namespace vllm_ascend - diff --git a/csrc/kernels/pos_encoding_kernels.cpp b/csrc/kernels/pos_encoding_kernels.cpp index 0b77ce8..57af050 100644 --- a/csrc/kernels/pos_encoding_kernels.cpp +++ b/csrc/kernels/pos_encoding_kernels.cpp @@ -30,7 +30,11 @@ using vllm_ascend::local_mem_copy; template class RotaryEmbedding { // NOTE(ganyi): we use 512B as load stride for pipe, need to find another way to // retrieve this size from runtime for more Soc support - static int constexpr loadSize = 512; + #if (__CCE_AICORE__ >= 220) + static int constexpr loadSize = 512; + #else + static int constexpr loadSize = 1024 * 4; + #endif using dst_t = scalar_t; using acc_t = typename AccType::type; // only half tensor have cast instruct to int8, hardcode acc_dst_t as half @@ -326,7 +330,9 @@ private: // Declare all the kernel entry here ROPE_CUSTOM_KERNEL_DECLARE(half) -ROPE_CUSTOM_KERNEL_DECLARE(bfloat16_t) +#if (__CCE_AICORE__ >= 220) + ROPE_CUSTOM_KERNEL_DECLARE(bfloat16_t) +#endif namespace vllm_ascend { @@ -342,7 +348,7 @@ namespace vllm_ascend { reinterpret_cast(cosSinCache), rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, \ numHeads, numKvHeads, headSize, numTokens, loopCnt, blockDim); -// maximum number for runtime to launch a ascendc kernel. +// maximum number for runtime to launch a ascendc kernel. // we use this to constrain the maximum number of block size static const int64_t maxParallelSize = 65535; @@ -357,9 +363,13 @@ extern void rotary_embedding_impl(AscendType type, bool isNeox, void *stream, in int blockDim = maxParallelSize > numTokens ? 
numTokens : maxParallelSize; if (type == AscendType::FP16) { ROTARY_EMBEDDING_KERNEL_CALL(half); - } else if (type == AscendType::BF16) { + } + #if (__CCE_AICORE__ >= 220) + else if (type == AscendType::BF16) { ROTARY_EMBEDDING_KERNEL_CALL(bfloat16_t); - } else { + } + #endif + else { return; } } diff --git a/csrc/kernels/utils.h b/csrc/kernels/utils.h index 8b8cf21..c2d4261 100644 --- a/csrc/kernels/utils.h +++ b/csrc/kernels/utils.h @@ -20,9 +20,11 @@ namespace vllm_ascend { template struct AccType; +#if (__CCE_AICORE__ >= 220) template <> struct AccType { - using type = float; + using type = float; }; +#endif template <> struct AccType { using type = half; diff --git a/format.sh b/format.sh index e49ac91..2a00ce3 100755 --- a/format.sh +++ b/format.sh @@ -273,7 +273,7 @@ echo 'vllm-ascend isort: Done' # Clang-format section # Exclude some files for formatting because they are vendored CLANG_FORMAT_EXCLUDES=( - 'csrc/kernels/pos_encoding_kernels.cpp' 'csrc/kernels/advance_step.cpp' 'csrc/kernels/get_masked_input_and_mask_kernel.cpp' 'csrc/torch_binding.cpp' 'csrc/ops.h' + 'csrc/kernels/utils.h' 'csrc/kernels/pos_encoding_kernels.cpp' 'csrc/kernels/advance_step.cpp' 'csrc/kernels/get_masked_input_and_mask_kernel.cpp' 'csrc/torch_binding.cpp' 'csrc/ops.h' ) # Format specified files with clang-format diff --git a/vllm_ascend/attention/attention.py b/vllm_ascend/attention/attention.py index 9c112ed..4b545a1 100644 --- a/vllm_ascend/attention/attention.py +++ b/vllm_ascend/attention/attention.py @@ -36,7 +36,8 @@ from vllm.utils import async_tensor_h2d, make_tensor_with_pad from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ops.cache import concat_and_cache_mla -from vllm_ascend.utils import enable_custom_op +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, + enable_custom_op, is_310p, nd_to_nz_2d) from vllm_ascend.worker.model_runner import ( ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata) @@ -170,7 +171,11 @@ class AscendAttentionBackend(AttentionBackend): num_kv_heads: int, head_size: int, ) -> Tuple[int, ...]: - return (2, num_blocks, block_size, num_kv_heads, head_size) + if is_310p(): + return (2, num_blocks, num_kv_heads * head_size // 16, block_size, + 16) + else: + return (2, num_blocks, block_size, num_kv_heads, head_size) @staticmethod def swap_blocks( @@ -654,6 +659,11 @@ class AscendMetadataBuilder(CommonMetadataBuilder[AscendMetadata]): # normal mask self.attn_mask = AscendMetadataBuilder._attn_mask_builder.get_attn_mask( # type: ignore max_prefill_seq_len, dtype, device) + if is_310p(): + mask_nz = nd_to_nz_2d(self.attn_mask) + mask_nz = torch_npu.npu_format_cast( + mask_nz.contiguous(), ACL_FORMAT_FRACTAL_NZ) + self.attn_mask = mask_nz elif self.num_decode_tokens == 0 and not self.input_builder.chunked_prefill_enabled: # compress mask for prefix cache self.compress_mask = AscendMetadataBuilder._attn_mask_builder.get_attn_mask( # type: ignore @@ -868,6 +878,18 @@ class AscendAttentionBackendImpl(AttentionImpl): self.seq_lens_tensor_cpu = torch.from_numpy( np.array(attn_metadata.prefill_metadata.seq_lens). 
astype(np.int32)) + if is_310p(): + # align q k v output tensors + query = aligned_16(query) + key = aligned_16(key) + value = aligned_16(value) + output = aligned_16(output) + + # do reformat in case of broadcasted tensors + mask = mask.repeat( + self.seq_lens_tensor_cpu.size(0), 1, 1, 1) + mask = torch_npu.npu_format_cast( + mask.contiguous(), ACL_FORMAT_FRACTAL_NZ) torch_npu._npu_flash_attention( query=query, key=key, @@ -878,6 +900,7 @@ class AscendAttentionBackendImpl(AttentionImpl): num_heads=self.num_heads, num_kv_heads=self.num_kv_heads, out=output) + output = output[:num_tokens, :, :] # Prefix cache only and cache hit elif attn_metadata.num_decode_tokens == 0 and not attn_metadata.chunked_prefill_enabled: assert kv_cache is not None @@ -935,6 +958,10 @@ class AscendAttentionBackendImpl(AttentionImpl): self.seq_lens_tensor_cpu = torch.from_numpy( np.array(attn_metadata.decode_metadata.seq_lens).astype( np.int32)) + if is_310p(): + # # seq_lens_tensor needs to be transferred to the device for 310P + self.seq_lens_tensor_cpu = self.seq_lens_tensor_cpu.to( + device=self.key_cache.device) block_tables = attn_metadata.decode_metadata.block_tables torch_npu._npu_paged_attention( query=query, diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index e6a2376..0aac026 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -30,6 +30,8 @@ from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.worker.gpu_input_batch import InputBatch from vllm_ascend.ops.attention import vanilla_chunked_prefill +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p, + nd_to_nz_2d, nd_to_nz_spec) class AscendAttentionBackend(AttentionBackend): @@ -62,6 +64,9 @@ class AscendAttentionBackend(AttentionBackend): num_kv_heads: int, head_size: int, ) -> Tuple[int, ...]: + if is_310p(): + return (2, num_blocks, num_kv_heads * head_size // 16, block_size, + 16) return (2, num_blocks, block_size, num_kv_heads, head_size) @staticmethod @@ -167,6 +172,16 @@ class AscendAttentionMetadataBuilder: query_start_loc = query_start_loc_cpu.to(self.runner.device, non_blocking=True) + if is_310p(): + if attn_state == AscendAttentionState.PrefillNoCache: + mask_nz = nd_to_nz_2d(attn_mask) + attn_mask = torch_npu.npu_format_cast(mask_nz.contiguous(), + ACL_FORMAT_FRACTAL_NZ) + elif attn_state == AscendAttentionState.ChunkedPrefill: + mask_nz = nd_to_nz_spec(attn_mask) + attn_mask = torch_npu.npu_format_cast(mask_nz.contiguous(), + ACL_FORMAT_FRACTAL_NZ) + attn_metadata = AscendMetadata( num_actual_tokens=num_actual_tokens, block_tables=block_table, @@ -250,6 +265,7 @@ class AscendAttentionBackendImpl(AttentionImpl): self.head_size, dtype=query.dtype, device=query.device) + ori_output = output if trace_flag: torch.ops.vllm.unified_ascend_attention_with_output( query=query, @@ -294,6 +310,18 @@ class AscendAttentionBackendImpl(AttentionImpl): assert attn_metadata is not None assert attn_metadata.attn_mask is not None mask = attn_metadata.attn_mask + if is_310p(): + # align q k v output tensors + query = aligned_16(query) + key = aligned_16(key) + value = aligned_16(value) + output = aligned_16(output) + + # do reformat in case of broadcasted tensors + mask = mask.repeat(attn_metadata.seq_lens.size(0), 1, 1, 1) + mask = torch_npu.npu_format_cast(mask.contiguous(), + ACL_FORMAT_FRACTAL_NZ) + torch_npu._npu_flash_attention(query=query, key=key, value=value, @@ -303,6 +331,7 @@ class AscendAttentionBackendImpl(AttentionImpl): 
num_heads=self.num_heads, num_kv_heads=self.num_kv_heads, out=output) + output = output[:num_tokens, :, :] elif attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit: assert attn_metadata is not None assert attn_metadata.attn_mask is not None @@ -320,6 +349,10 @@ class AscendAttentionBackendImpl(AttentionImpl): scale_value=self.scale, out=output) elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly: + if is_310p(): + # # seq_lens_tensor needs to be transferred to the device for 310P + attn_metadata.seq_lens = \ + attn_metadata.seq_lens.to(device=query.device) torch_npu._npu_paged_attention( query=query, key_cache=self.key_cache, @@ -353,6 +386,14 @@ class AscendAttentionBackendImpl(AttentionImpl): self.scale, None, True) else: # use paged attention + assert attn_metadata is not None + assert attn_metadata.attn_mask is not None + if is_310p(): + # do reformat in case of broadcasted tensors + attn_metadata.attn_mask = \ + torch_npu.npu_format_cast(attn_metadata.attn_mask.contiguous(), ACL_FORMAT_FRACTAL_NZ) + attn_metadata.seq_lens = \ + attn_metadata.seq_lens.to(device=query.device) torch_npu._npu_paged_attention_splitfuse( query=query, key_cache=self.key_cache, @@ -365,6 +406,9 @@ class AscendAttentionBackendImpl(AttentionImpl): num_heads=self.num_heads, scale_value=self.scale, out=output) + + # to make in-place change to the output tensor + ori_output[:, :, :] = output[:num_tokens, :, :] return output.view(num_tokens, self.hidden_size) diff --git a/vllm_ascend/ops/activation.py b/vllm_ascend/ops/activation.py index 13541ee..1c32643 100644 --- a/vllm_ascend/ops/activation.py +++ b/vllm_ascend/ops/activation.py @@ -18,11 +18,16 @@ import torch from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul +from vllm_ascend.utils import is_310p + def silu_and_mul_forward_oot(self, x: torch.Tensor) -> torch.Tensor: import torch_npu - out = torch_npu.npu_swiglu(x) + if is_310p(): + out = torch_npu.npu_swiglu(x.to(torch.float32)).to(torch.float16) + else: + out = torch_npu.npu_swiglu(x) return out diff --git a/vllm_ascend/ops/common_fused_moe.py b/vllm_ascend/ops/common_fused_moe.py index 285b728..3c84f23 100644 --- a/vllm_ascend/ops/common_fused_moe.py +++ b/vllm_ascend/ops/common_fused_moe.py @@ -21,7 +21,9 @@ import torch from vllm.model_executor.layers.fused_moe.layer import \ UnquantizedFusedMoEMethod -from vllm_ascend.ops.fused_moe import fused_experts, select_experts +from vllm_ascend.ops.fused_moe import (fused_experts, fused_experts_310p, + select_experts) +from vllm_ascend.utils import is_310p def forward_oot( @@ -56,6 +58,19 @@ def forward_oot( e_score_correction_bias=e_score_correction_bias, ) + if is_310p(): + assert global_num_experts is not None + return fused_experts_310p( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + top_k=top_k, + global_num_experts=global_num_experts, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input) + return fused_experts( hidden_states=x, w1=layer.w13_weight, diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index c282f7e..c1c865b 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -549,6 +549,95 @@ def fused_experts_with_all2all_buffer( return final_hidden_states +# Currently, fused_experts on 310p only supports PanguProMoE. 
+def fused_experts_310p( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + global_num_experts: int, + expert_map: torch.Tensor = None, + apply_router_weight_on_input: bool = False, +) -> torch.Tensor: + """ + + Args: + hidden_states: Hidden states of shape (num_tokens, hidden_size). + w1: Expert weights1 of shape (num_experts, intermediate_size * 2, hidden_size). + w2: Expert weights2 of shape (num_experts, hidden_size, intermediate_size). + topk_weights: Routing weights of shape (num_tokens, top_k). + topk_ids: Selected expert IDs of shape (num_tokens, top_k). + top_k: Number of experts to select. + expert_map: Expert mapping of shape (num_experts,). + + Returns: + hidden_states: Hidden states after routing. + """ + ep_size = get_ep_group().world_size + local_num_experts = global_num_experts // ep_size + local_num_group = top_k // ep_size + + if apply_router_weight_on_input: + assert (topk_weights.dim() == 2 + ), "`topk_weights` should be in shape (num_tokens, topk)" + _, topk = topk_weights.shape + assert ( + topk == 1 + ), "Only support topk=1 when `apply_router_weight_on_input` is True" + hidden_states = hidden_states * topk_weights.to(hidden_states.dtype) + + bsz, _ = hidden_states.shape + flatten_topk_ids = topk_ids.view(-1) + sorted_topk_ids = torch.argsort(flatten_topk_ids.float()) + sorted_topk_ids = sorted_topk_ids.to(torch.int32) + sorted_hidden_states = hidden_states.index_select( + 0, sorted_topk_ids // local_num_group) + + experts_id = torch.arange(0, + local_num_experts, + dtype=topk_ids.dtype, + device=topk_ids.device) + num_tokens_per_expert = (flatten_topk_ids.unsqueeze(-1) == experts_id).to( + torch.float32).sum(0) + topk_scales = topk_weights.view(-1).index_select( + 0, sorted_topk_ids).unsqueeze(-1) + group_list = num_tokens_per_expert.cumsum(dim=0).to(torch.int64) + + w1 = w1.transpose(1, 2) + gate_up_out = torch_npu.npu_grouped_matmul( + x=[sorted_hidden_states], + weight=[w1], + split_item=2, + group_list_type=0, + group_type=0, + group_list=group_list, + )[0] + + gate_up_out = torch_npu.npu_swiglu(gate_up_out.to(torch.float32)).to( + torch.float16) + gate_up_out *= topk_scales + + w2 = w2.transpose(1, 2) + down_out_list = torch_npu.npu_grouped_matmul( + x=[gate_up_out], + weight=[w2], + split_item=2, + group_list_type=0, + group_type=0, + group_list=group_list, + )[0] + + unsorted_topk_ids = torch.argsort(sorted_topk_ids.float()).to( + torch.int32) + torch.Tensor([0]).to(torch.int32).npu() + unsorted_hidden_states = down_out_list.index_select(0, unsorted_topk_ids) + final_hidden_states = unsorted_hidden_states.reshape( + bsz, top_k // ep_size, -1).sum(1) + + return final_hidden_states + + def fused_experts( hidden_states: torch.Tensor, w1: torch.Tensor, diff --git a/vllm_ascend/ops/layernorm.py b/vllm_ascend/ops/layernorm.py index 8ff4c55..7b839fe 100644 --- a/vllm_ascend/ops/layernorm.py +++ b/vllm_ascend/ops/layernorm.py @@ -20,6 +20,8 @@ from typing import Optional, Tuple, Union import torch from vllm.model_executor.layers.layernorm import RMSNorm +from vllm_ascend.utils import is_310p + def forward_oot( self, @@ -29,8 +31,15 @@ def forward_oot( import torch_npu if residual is not None: - x, _, residual = torch_npu.npu_add_rms_norm(x, residual, self.weight, - self.variance_epsilon) + if is_310p(): + orig_dtype = residual.dtype + x = x + residual.to(x.dtype) + residual = x.to(orig_dtype) + x, _ = torch_npu.npu_rms_norm(x, self.weight, + self.variance_epsilon) + else: + x, 
_, residual = torch_npu.npu_add_rms_norm( + x, residual, self.weight, self.variance_epsilon) return x, residual x, residual = torch_npu.npu_rms_norm(x, self.weight, self.variance_epsilon) diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py index 39a4c1c..2722679 100644 --- a/vllm_ascend/ops/rotary_embedding.py +++ b/vllm_ascend/ops/rotary_embedding.py @@ -22,7 +22,7 @@ import torch from vllm.model_executor.layers.rotary_embedding import ( DeepseekScalingRotaryEmbedding, RotaryEmbedding) -from vllm_ascend.utils import enable_custom_op +from vllm_ascend.utils import enable_custom_op, is_310p def custom_rotary_embedding_enabled(query, neox_style, head_size): @@ -48,7 +48,8 @@ def rope_forward_oot( if is_neox_style_override is not None: neox_style = is_neox_style_override # adopt custom kernel path for rotary_embedding - if custom_rotary_embedding_enabled(query, neox_style, self.head_size): + if custom_rotary_embedding_enabled(query, neox_style, + self.head_size) and not is_310p(): query, key = torch.ops._C.rotary_embedding( positions, query, diff --git a/vllm_ascend/patch/platform/patch_common/patch_distributed.py b/vllm_ascend/patch/platform/patch_common/patch_distributed.py index 86515df..d094886 100644 --- a/vllm_ascend/patch/platform/patch_common/patch_distributed.py +++ b/vllm_ascend/patch/platform/patch_common/patch_distributed.py @@ -17,6 +17,7 @@ # Adapted from vllm/model_executor/models/qwen2_vl.py # This file is a part of the vllm-ascend project. +import torch import vllm import vllm.distributed import vllm.envs as envs @@ -25,6 +26,8 @@ from vllm.config import ParallelConfig from vllm.distributed.utils import \ stateless_init_torch_distributed_process_group +from vllm_ascend.utils import NullHandle, is_310p + def ascend_destroy_model_parallel(): """Set the groups to none and destroy them.""" @@ -81,3 +84,70 @@ def stateless_init_dp_group(self) -> "ProcessGroup": vllm.distributed.parallel_state.destroy_model_parallel = ascend_destroy_model_parallel ParallelConfig.get_next_dp_init_port = parallel_config_get_dp_port ParallelConfig.stateless_init_dp_group = stateless_init_dp_group + + +def communication_adaptation_310p(): + + def broadcast310p(tensor, src, group=None, async_op=False): + rank = torch.distributed.get_rank(group) + world_size = torch.distributed.get_world_size(group) + tensor_list = [torch.empty_like(tensor) for _ in range(world_size)] + tensor_list[rank] = tensor + torch.distributed.all_gather(tensor_list, tensor, group=group) + tensor[...] 
= tensor_list[src] + if async_op: + return NullHandle() + else: + return None + + torch.distributed.broadcast = broadcast310p + torch.distributed.distributed_c10d.broadcast = broadcast310p + + def all_reduce_wrapper_310p(fn): + + def all_reduce( + tensor, + op=torch.distributed.ReduceOp.SUM, + group=None, + async_op=False, + ): + if tensor.dtype != torch.int64: + return fn(tensor, op, group, async_op) + rank = torch.distributed.get_rank(group) + world_size = torch.distributed.get_world_size(group) + tensor_list = [torch.empty_like(tensor) for _ in range(world_size)] + tensor_list[rank] = tensor + torch.distributed.all_gather(tensor_list, tensor, group=group) + if op == torch.distributed.ReduceOp.SUM: + return torch.stack(tensor_list).sum(0) + elif op == torch.distributed.ReduceOp.MAX: + return torch.tensor( + torch.stack(tensor_list).cpu().numpy().max(0), + device=tensor.device, + ) + else: + raise RuntimeError(f"not implement op {op}") + + return all_reduce + + torch.distributed.all_reduce = all_reduce_wrapper_310p( + torch.distributed.all_reduce) + torch.distributed.distributed_c10d.all_reduce = all_reduce_wrapper_310p( + torch.distributed.distributed_c10d.all_reduce) + + def reduce_scatter_310p(output_tensor, input_tensor, group=None): + rank = torch.distributed.get_rank(group) + world_size = torch.distributed.get_world_size(group) + torch.distributed.all_reduce(input_tensor, + torch.distributed.ReduceOp.SUM, + group, + async_op=False) + interval = input_tensor.shape[0] // world_size + output_tensor[:] = input_tensor[rank * interval:(rank + 1) * interval] + + torch.distributed._reduce_scatter_base = reduce_scatter_310p + torch.distributed.distributed_c10d._reduce_scatter_base = reduce_scatter_310p + + +if is_310p(): + communication_adaptation_310p() diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index b9233da..881b732 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -28,7 +28,8 @@ from vllm.logger import logger from vllm.platforms import Platform, PlatformEnum from vllm_ascend.ascend_config import check_ascend_config, init_ascend_config -from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD, update_aclgraph_sizes +from vllm_ascend.utils import (ASCEND_QUATIZATION_METHOD, is_310p, + update_aclgraph_sizes) if TYPE_CHECKING: from vllm.config import ModelConfig, VllmConfig @@ -205,8 +206,9 @@ class NPUPlatform(Platform): cache_config.block_size = 128 if envs.VLLM_USE_V1: - # Activate custom ops for v1. - compilation_config.custom_ops = ["all"] + # Activate custom ops for v1, except on 310P + if not is_310p(): + compilation_config.custom_ops = ["all"] # If ascend_scheduler_config is enabled, # extents original scheduler_config to use AscendScheduler. 
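Note on the 310P NZ layout used throughout the attention changes above: masks and KV caches are padded to multiples of 16 and reshaped from the plain ND layout into the fractal NZ layout before being handed to the fused NPU ops, and the `vllm_ascend/utils.py` diff below adds the helpers (`is_310p`, `aligned_16`, `nd_to_nz_2d`, `nd_to_nz_spec`) that do this. As a rough, shape-only sketch of the ND-to-NZ reshape for a 2-D mask (plain PyTorch, hypothetical sizes, mirroring the `nd_to_nz_2d` helper added below rather than calling any NPU API):

```python
import torch

def round_up_16(x: int) -> int:
    # NZ blocks are 16 elements wide, so every dim is rounded up to a multiple of 16.
    return (x + 15) // 16 * 16

def nd_to_nz_2d_sketch(nd: torch.Tensor) -> torch.Tensor:
    rows, cols = nd.shape
    rows_pad, cols_pad = round_up_16(rows), round_up_16(cols)
    # Zero-pad the columns (last dim) and the rows up to multiples of 16.
    padded = torch.nn.functional.pad(nd, (0, cols_pad - cols, 0, rows_pad - rows))
    # Split the columns into 16-wide blocks, then move the block index ahead of the rows:
    # (rows_pad, cols_pad) -> (1, rows_pad, cols_pad // 16, 16) -> (1, cols_pad // 16, rows_pad, 16)
    return padded.reshape(1, rows_pad, cols_pad // 16, 16).transpose(1, 2).contiguous()

mask = torch.ones(3, 5)                    # hypothetical attention mask
print(nd_to_nz_2d_sketch(mask).shape)      # torch.Size([1, 1, 16, 16])
```

On a real 310P device the reshaped tensor is then tagged with the NZ format via `torch_npu.npu_format_cast(..., ACL_FORMAT_FRACTAL_NZ)`, which is what the metadata builders in `attention.py` / `attention_v1.py` above do.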
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index eeab287..1a59036 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -21,11 +21,12 @@ import atexit import math from contextlib import contextmanager, nullcontext from enum import Enum +from functools import lru_cache from threading import Lock from typing import TYPE_CHECKING, List, Tuple import torch -import torch_npu # noqa: F401 +import torch_npu # noqa: F401 # noqa: F401 import torchair # type: ignore[import] # noqa: F401 from packaging.version import InvalidVersion, Version from torch_npu.npu.streams import Event @@ -57,6 +58,116 @@ ASCEND_QUATIZATION_METHOD = "ascend" CUSTOM_OP_ENABLED = None +SOC_VERSION_INFERENCE_SERIES = ["Ascend310P3"] + +ACL_FORMAT_FRACTAL_ND = 2 +ACL_FORMAT_FRACTAL_NZ = 29 + + +@lru_cache(maxsize=None) +def _get_soc_version(): + """Gets the SOC version and caches it.""" + if not torch.npu.is_available(): + return "" + device_count = torch.npu.device_count() + if device_count <= 0: + return "" + try: + return torch.npu.get_device_name(0) + except Exception: + return "" + + +_SOC_VERSION = _get_soc_version() + + +def is_310p(): + return _SOC_VERSION in SOC_VERSION_INFERENCE_SERIES + + +class NullHandle: + + def __init__(self): + pass + + def wait(self): + pass + + +def _round_up(x: int, align: int): + if align == 0: + return -1 + return (x + align - 1) // align * align + + +def _custom_pad(x, pad_dims): + return torch.nn.functional.pad(x, pad_dims) + + +def _custom_reshape(x, target_shape): + return x.reshape(target_shape) + + +def _custom_transpose(x, dim1, dim2): + return x.transpose(dim1, dim2) + + +def nd_to_nz_2d(in_tensor: torch.Tensor) -> torch.Tensor: + aux_dims = [0, 0, 0, 0] + aux_dims[0] = 1 + aux_dims[1] = _round_up(in_tensor.size(0), 16) + + pad_dims = [0, 0, 0, 0] + pad_dims[3] = _round_up(in_tensor.size(0), 16) - in_tensor.size(0) + + aux_dims[2] = _round_up(in_tensor.size(1), 16) // 16 + aux_dims[3] = 16 + pad_dims[1] = _round_up(in_tensor.size(1), 16) - in_tensor.size(1) + + return _custom_transpose( + _custom_reshape(_custom_pad(in_tensor, pad_dims), aux_dims), 1, + 2).contiguous() + + +def nd_to_nz_spec(mask_tensor: torch.Tensor) -> torch.Tensor: + num_tokens = mask_tensor.shape[0] + max_seq_len = mask_tensor.shape[1] + + tokens_pad = (num_tokens + 15) // 16 * 16 + max_seq_len_pad = (max_seq_len + 15) // 16 * 16 + + mask_tensor_pad = \ + torch.zeros((1, tokens_pad, max_seq_len_pad), dtype=mask_tensor.dtype, device=mask_tensor.device) + mask_tensor_pad[0][:num_tokens, :max_seq_len] = mask_tensor + mask = mask_tensor_pad.reshape( + (1, tokens_pad, max_seq_len_pad // 16, 16)).permute(0, 2, 1, 3) + return mask + + +def aligned_16(tensor: torch.Tensor): + """Aligned tensor for 310P""" + + # Get the size of the current 0th dimension + n = tensor.size(0) + + # Calculate the aligned size + n_aligned = ((n + 15) // 16) * 16 + + # If already aligned, return the original tensor + if n == n_aligned: + return tensor + + # Create a new tensor with shape (n_aligned, H, W) and fill it with zeros + new_tensor = torch.zeros(n_aligned, + *tensor.shape[1:], + dtype=tensor.dtype, + device=tensor.device) + + # Copy the original tensor to the first N positions of the new tensor + new_tensor[:n] = tensor + + return new_tensor + def try_register_lib(lib_name: str, lib_info: str = ""): import importlib diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 2670111..5cf6bb3 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ 
b/vllm_ascend/worker/model_runner_v1.py @@ -74,7 +74,9 @@ from vllm_ascend.attention.attention_v1 import (AscendAttentionState, from vllm_ascend.attention.mla_v1 import CommonAttentionMetadata from vllm_ascend.platform import NPUPlatform from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler -from vllm_ascend.utils import ProfileExecuteDuration, vllm_version_is +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, + ProfileExecuteDuration, is_310p, + vllm_version_is) from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer @@ -1911,6 +1913,8 @@ class NPUModelRunner(LoRAModelRunnerMixin): """ self.kv_cache_config = kv_cache_config import torch_npu + acl_format = ACL_FORMAT_FRACTAL_NZ if is_310p( + ) else ACL_FORMAT_FRACTAL_ND kv_caches: Dict[str, torch.Tensor] = {} self.input_batch = InputBatch( @@ -1968,13 +1972,18 @@ class NPUModelRunner(LoRAModelRunnerMixin): device=self.device) kv_caches[layer_name] = (layer_kv_cache_nope, layer_kv_cache_pe) - torch_npu.npu_format_cast(kv_caches[layer_name][0], 2) - torch_npu.npu_format_cast(kv_caches[layer_name][1], 2) + kv_caches[layer_name] = ( + torch_npu.npu_format_cast(kv_caches[layer_name][0], + acl_format), + torch_npu.npu_format_cast(kv_caches[layer_name][1], + acl_format), + ) else: kv_caches[layer_name] = torch.zeros(kv_cache_shape, dtype=dtype, device=self.device) - torch_npu.npu_format_cast(kv_caches[layer_name], 2) + kv_caches[layer_name] = \ + torch_npu.npu_format_cast(kv_caches[layer_name], acl_format) else: # TODO: add new branches when introducing more types of # KV cache specs. diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py index e78cc3f..bffc6a8 100644 --- a/vllm_ascend/worker/worker.py +++ b/vllm_ascend/worker/worker.py @@ -51,7 +51,8 @@ from vllm_ascend.ascend_config import init_ascend_config from vllm_ascend.device_allocator.camem import CaMemAllocator from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel from vllm_ascend.platform import NPUPlatform -from vllm_ascend.utils import try_register_lib +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, + is_310p, try_register_lib) from vllm_ascend.worker.model_runner import NPUModelRunner from vllm_ascend.worker.pooling_model_runner import NPUPoolingModelRunner @@ -342,17 +343,22 @@ class NPUWorker(LocalOrDistributedWorkerBase): for _ in range(self.parallel_config.pipeline_parallel_size) ] import torch_npu + acl_format = ACL_FORMAT_FRACTAL_NZ if is_310p( + ) else ACL_FORMAT_FRACTAL_ND for ve in range(self.parallel_config.pipeline_parallel_size): num_layers = len(self.cache_engine[ve].gpu_cache) for i in range(num_layers): if torch.is_tensor(self.cache_engine[ve].gpu_cache[i]): - torch_npu.npu_format_cast( - self.cache_engine[ve].gpu_cache[i], 2) + self.cache_engine[ve].gpu_cache[ + i] = torch_npu.npu_format_cast( + self.cache_engine[ve].gpu_cache[i], acl_format) else: - torch_npu.npu_format_cast( - self.cache_engine[ve].gpu_cache[i][0], 2) - torch_npu.npu_format_cast( - self.cache_engine[ve].gpu_cache[i][1], 2) + self.cache_engine[ve].gpu_cache[i][ + 0] = torch_npu.npu_format_cast( + self.cache_engine[ve].gpu_cache[i][0], acl_format) + self.cache_engine[ve].gpu_cache[i][ + 1] = torch_npu.npu_format_cast( + self.cache_engine[ve].gpu_cache[i][1], acl_format) self.gpu_cache = [ self.cache_engine[ve].gpu_cache for ve in range(self.parallel_config.pipeline_parallel_size)