add Dockerfile and readme

vllm-ascend vnpu v1
[Doc] Update version policy to the latest. (#5071)
2026-01-05 09:32:26 +00:00 · 2025-12-26 07:37:35 +00:00 · 2025-12-16 15:24:46 +08:00 · 2025-12-16 15:04:31 +08:00 · 2025-12-16 14:09:52 +08:00 · 2025-12-16 12:47:40 +08:00
373 changed files with 36100 additions and 3752 deletions
--- a/.github.backup/Dockerfile.buildwheel
+++ b/.github.backup/Dockerfile.buildwheel
--- a/.github.backup/ISSUE_TEMPLATE/100-documentation.yml
+++ b/.github.backup/ISSUE_TEMPLATE/100-documentation.yml
--- a/.github.backup/ISSUE_TEMPLATE/110-user-story.yml
+++ b/.github.backup/ISSUE_TEMPLATE/110-user-story.yml
--- a/.github.backup/ISSUE_TEMPLATE/200-installation.yml
+++ b/.github.backup/ISSUE_TEMPLATE/200-installation.yml
--- a/.github.backup/ISSUE_TEMPLATE/300-usage.yml
+++ b/.github.backup/ISSUE_TEMPLATE/300-usage.yml
--- a/.github.backup/ISSUE_TEMPLATE/400-bug-report.yml
+++ b/.github.backup/ISSUE_TEMPLATE/400-bug-report.yml
--- a/.github.backup/ISSUE_TEMPLATE/500-feature-request.yml
+++ b/.github.backup/ISSUE_TEMPLATE/500-feature-request.yml
--- a/.github.backup/ISSUE_TEMPLATE/600-new-model.yml
+++ b/.github.backup/ISSUE_TEMPLATE/600-new-model.yml
--- a/.github.backup/ISSUE_TEMPLATE/700-performance-discussion.yml
+++ b/.github.backup/ISSUE_TEMPLATE/700-performance-discussion.yml
--- a/.github.backup/ISSUE_TEMPLATE/750-RFC.yml
+++ b/.github.backup/ISSUE_TEMPLATE/750-RFC.yml
--- a/.github.backup/ISSUE_TEMPLATE/800-others.yml
+++ b/.github.backup/ISSUE_TEMPLATE/800-others.yml
--- a/.github.backup/ISSUE_TEMPLATE/900-release-checklist.yml
+++ b/.github.backup/ISSUE_TEMPLATE/900-release-checklist.yml
--- a/.github.backup/ISSUE_TEMPLATE/config.yml
+++ b/.github.backup/ISSUE_TEMPLATE/config.yml
--- a/.github.backup/PULL_REQUEST_TEMPLATE.md
+++ b/.github.backup/PULL_REQUEST_TEMPLATE.md
--- a/.github.backup/actionlint.yaml
+++ b/.github.backup/actionlint.yaml
--- a/.github.backup/dependabot.yml
+++ b/.github.backup/dependabot.yml
--- a/.github.backup/format_pr_body.sh
+++ b/.github.backup/format_pr_body.sh
--- a/.github.backup/labeler.yml
+++ b/.github.backup/labeler.yml
--- a/.github.backup/workflows/_accuracy_test.yaml
+++ b/.github.backup/workflows/_accuracy_test.yaml
@@ -30,7 +30,7 @@ jobs:
    runs-on: ${{ inputs.runner }}
    name: ${{ inputs.model_name }} accuracy
    container:
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
      env:
        VLLM_USE_MODELSCOPE: True
        # 1. If version specified (work_dispatch), do specified branch accuracy test
--- a/.github.backup/workflows/_e2e_test.yaml
+++ b/.github.backup/workflows/_e2e_test.yaml
@@ -89,6 +89,7 @@ jobs:
          # the test separately.

          pytest -sv tests/e2e/singlecard/test_aclgraph.py
+          pytest -sv tests/e2e/singlecard/test_aclgraph_mem.py
          pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
          pytest -sv tests/e2e/singlecard/test_bge_model.py
          pytest -sv tests/e2e/singlecard/test_camem.py
@@ -105,8 +106,8 @@ jobs:
          # ------------------------------------ v1 spec decode test ------------------------------------ #
          pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
          pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_torchair_correctness.py
-          # Fix me: OOM error
-          #pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
+          # Fix me: test_eagle_correctness OOM error
+          pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py

          pytest -sv tests/e2e/singlecard/ops/

--- a/.github.backup/workflows/accuracy_test.yaml
+++ b/.github.backup/workflows/accuracy_test.yaml
@@ -68,5 +68,5 @@ jobs:
    with:
      vllm: v0.11.0
      runner:  linux-aarch64-${{ matrix.runner }}
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
      model_name: ${{ matrix.model_name }}
--- a/.github.backup/workflows/format_pr_body.yaml
+++ b/.github.backup/workflows/format_pr_body.yaml
--- a/.github.backup/workflows/image_310p_openeuler.yml
+++ b/.github.backup/workflows/image_310p_openeuler.yml
--- a/.github.backup/workflows/image_310p_ubuntu.yml
+++ b/.github.backup/workflows/image_310p_ubuntu.yml
--- a/.github.backup/workflows/image_a3_openeuler.yml
+++ b/.github.backup/workflows/image_a3_openeuler.yml
--- a/.github.backup/workflows/image_a3_ubuntu.yml
+++ b/.github.backup/workflows/image_a3_ubuntu.yml
--- a/.github.backup/workflows/image_openeuler.yml
+++ b/.github.backup/workflows/image_openeuler.yml
--- a/.github.backup/workflows/image_ubuntu.yml
+++ b/.github.backup/workflows/image_ubuntu.yml
--- a/.github.backup/workflows/label_merge_conflict.yml
+++ b/.github.backup/workflows/label_merge_conflict.yml
--- a/.github.backup/workflows/labeler.yml
+++ b/.github.backup/workflows/labeler.yml
--- a/.github.backup/workflows/matchers/actionlint.json
+++ b/.github.backup/workflows/matchers/actionlint.json
--- a/.github.backup/workflows/matchers/mypy.json
+++ b/.github.backup/workflows/matchers/mypy.json
--- a/.github.backup/workflows/matchers/ruff.json
+++ b/.github.backup/workflows/matchers/ruff.json
--- a/.github.backup/workflows/multi_node_test.yaml
+++ b/.github.backup/workflows/multi_node_test.yaml
@@ -23,7 +23,7 @@ jobs:
    # This is a runner with no NPU for k8s controller
    runs-on: linux-aarch64-a3-0
    container:
-      image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
+      image: m.daocloud.io/quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
      env:
        KUBECONFIG: /tmp/kubeconfig
        KUBECTL: /root/.cache/.kube/kubectl
--- a/.github.backup/workflows/nightly_benchmarks.yaml
+++ b/.github.backup/workflows/nightly_benchmarks.yaml
@@ -56,7 +56,7 @@ jobs:
            vllm_use_v1: 1
      max-parallel: 1
    container:
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
      volumes:
        - /usr/local/dcmi:/usr/local/dcmi
        - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
--- a/.github.backup/workflows/pre-commit.yml
+++ b/.github.backup/workflows/pre-commit.yml
--- a/.github.backup/workflows/release_code.yml
+++ b/.github.backup/workflows/release_code.yml
--- a/.github.backup/workflows/release_whl.yml
+++ b/.github.backup/workflows/release_whl.yml
@@ -57,7 +57,13 @@ jobs:
    - name: Print
      run: |
        lscpu
-        
+
+    - name: Free up disk space
+      uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
+      with:
+        tool-cache: true
+        docker-images: false
+
    - name: Build wheel
      run: |
        ls
--- a/.github.backup/workflows/reminder_comment.yml
+++ b/.github.backup/workflows/reminder_comment.yml
--- a/.github.backup/workflows/vllm_ascend_dist.yaml
+++ b/.github.backup/workflows/vllm_ascend_dist.yaml
@@ -47,7 +47,7 @@ jobs:
    name: vLLM Ascend test
    runs-on: ${{ matrix.os }}
    container:
-      image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
+      image: m.daocloud.io/quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
      env:
        DEBIAN_FRONTEND: noninteractive
    steps:
@@ -97,4 +97,4 @@ jobs:
          VLLM_USE_MODELSCOPE: True
        run: |
          # TODO: enable more tests
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
+          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
--- a/.github.backup/workflows/vllm_ascend_doctest.yaml
+++ b/.github.backup/workflows/vllm_ascend_doctest.yaml
--- a/.github.backup/workflows/vllm_ascend_test.yaml
+++ b/.github.backup/workflows/vllm_ascend_test.yaml
@@ -75,7 +75,7 @@ jobs:
    name: unit test
    # only trigger unit test after lint passed and the change is e2e and ut related.
    if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04-arm
    container:
      image: quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11
      env:
@@ -100,7 +100,7 @@ jobs:
      - name: Install vllm-project/vllm from source
        working-directory: ./vllm-empty
        run: |
-          VLLM_TARGET_DEVICE=empty python3 -m pip install . --extra-index https://download.pytorch.org/whl/cpu/
+          VLLM_TARGET_DEVICE=empty python3 -m pip install .
          python3 -m pip uninstall -y triton

      - name: Checkout vllm-project/vllm-ascend repo
@@ -109,18 +109,18 @@ jobs:
      - name: Install vllm-project/vllm-ascend
        run: |
          export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
-          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
-          python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/
-          python3 -m pip install -v . --extra-index https://download.pytorch.org/whl/cpu/
+          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/arm64-linux/devlib
+          python3 -m pip install -r requirements-dev.txt
+          python3 -m pip install -v .

      - name: Run unit test
        env:
          VLLM_WORKER_MULTIPROC_METHOD: spawn
          TORCH_DEVICE_BACKEND_AUTOLOAD: 0
        run: |
-          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
-          pytest -sv --cov --cov-report=xml:unittests-coverage.xml tests/ut 
-
+          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/arm64-linux/devlib
+          pytest -sv --cov --cov-report=xml:unittests-coverage.xml tests/ut \
+            --ignore tests/ut/attention/test_attention_v1.py
      - name: Upload coverage to Codecov
        # only upload coverage when commits merged
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
@@ -145,5 +145,5 @@ jobs:
    with:
      vllm: ${{ matrix.vllm_version }}
      runner: linux-aarch64-a2
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
      type: light
--- a/.github.backup/workflows/vllm_ascend_test_310p.yaml
+++ b/.github.backup/workflows/vllm_ascend_test_310p.yaml
@@ -58,7 +58,7 @@ jobs:
    runs-on: ${{ matrix.os }}
    container:
      # TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-310p-ubuntu22.04-py3.11
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-310p-ubuntu22.04-py3.11
      env:
        VLLM_LOGGING_LEVEL: ERROR
        VLLM_USE_MODELSCOPE: True
--- a/.github.backup/workflows/vllm_ascend_test_full.yaml
+++ b/.github.backup/workflows/vllm_ascend_test_full.yaml
@@ -76,5 +76,5 @@ jobs:
    with:
      vllm: ${{ matrix.vllm_version }}
      runner: linux-aarch64-a2
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
      type: full
--- a/.github.backup/workflows/vllm_ascend_test_full_vllm_main.yaml
+++ b/.github.backup/workflows/vllm_ascend_test_full_vllm_main.yaml
@@ -41,5 +41,5 @@ jobs:
    with:
      vllm: main
      runner: linux-aarch64-a2
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
      type: full
--- a/.github.backup/workflows/vllm_ascend_test_models.yaml
+++ b/.github.backup/workflows/vllm_ascend_test_models.yaml
@@ -79,7 +79,7 @@ jobs:
    with:
      vllm: v0.11.0
      runner:  linux-aarch64-${{ matrix.runner }}
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
      model_name: ${{ matrix.model_name }}
      upload: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }}

--- a/.github.backup/workflows/vllm_ascend_test_pd.yaml
+++ b/.github.backup/workflows/vllm_ascend_test_pd.yaml
@@ -49,7 +49,7 @@ jobs:
    runs-on: linux-arm64-npu-static-8

    container:
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
      volumes:
        - /usr/local/dcmi:/usr/local/dcmi
        - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
@@ -109,4 +109,4 @@ jobs:
      - name: Run vllm-project/vllm-ascend PD Disaggregation edge test
        run: |
          git config --global --add safe.directory/__w/vllm-ascend/vllm-ascend
-          bash tests/e2e/pd_disaggreate/run_edge_case_test.sh
+          bash tests/e2e/pd_disaggreate/run_edge_case_test.sh
--- a/.github/workflows/_e2e_nightly.yaml
+++ b/.github/workflows/_e2e_nightly.yaml
@@ -1,115 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This file is a part of the vllm-ascend project.
-#
-
-name: 'e2e nightly test'
-
-on:
-  workflow_call:
-    inputs:
-      vllm:
-        required: true
-        type: string
-      runner:
-        required: true
-        type: string
-      image:
-        required: false
-        type: string
-        default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11"
-      tests:
-        required: true
-        type: string
-
-# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
-# declared as "shell: bash -el {0}" on steps that need to be properly activated.
-# It's used to activate ascend-toolkit environment variables.
-defaults:
-  run:
-    shell: bash -el {0}
-
-# only cancel in-progress runs of the same workflow
-# and ignore the lint / 1 card / 4 cards test type
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  e2e-nightly:
-    name: e2e-nightly
-    runs-on: ${{ inputs.runner }}
-    container:
-      image: ${{ inputs.image }}
-      env:
-        VLLM_USE_MODELSCOPE: True
-    steps:
-      - name: Check npu and CANN info
-        run: |
-          npu-smi info
-          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
-
-      - name: Config mirrors
-        run: |
-          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
-          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
-          apt-get update -y
-          apt install git -y
-          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
-
-      - name: Checkout vllm-project/vllm-ascend repo
-        uses: actions/checkout@v4
-
-      - name: Install system dependencies
-        run: |
-          apt-get -y install `cat packages.txt`
-          apt-get -y install gcc g++ cmake libnuma-dev
-
-      - name: Checkout vllm-project/vllm repo
-        uses: actions/checkout@v4
-        with:
-          repository: vllm-project/vllm
-          ref: ${{ inputs.vllm }}
-          path: ./vllm-empty
-
-      - name: Install vllm-project/vllm from source
-        working-directory: ./vllm-empty
-        run: |
-          VLLM_TARGET_DEVICE=empty pip install -e .
-
-      - name: Install vllm-project/vllm-ascend
-        env:
-          PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
-        run: |
-          pip install -r requirements-dev.txt
-          pip install -v -e .
-
-      - name: Checkout aisbench repo and Install aisbench
-        run: |
-          git clone https://gitee.com/aisbench/benchmark.git
-          cd benchmark
-          git checkout v3.0-20250930-master
-          pip3 install -e ./
-          pip3 install -r requirements/api.txt
-          pip3 install -r requirements/extra.txt
-
-      - name: Run vllm-project/vllm-ascend test
-        env:
-          VLLM_WORKER_MULTIPROC_METHOD: spawn
-          VLLM_USE_MODELSCOPE: True
-          VLLM_CI_RUNNER: ${{ inputs.runner }}
-        run: |
-          # TODO: enable more tests
-          pytest -sv ${{ inputs.tests }}
--- a/.github/workflows/vllm_ascend_test_nightly.yaml
+++ b/.github/workflows/vllm_ascend_test_nightly.yaml
@@ -1,86 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This file is a part of the vllm-ascend project.
-#
-
-name: 'ascend test / nightly'
-
-on:
-  schedule:
-      # Run test at 24:00 Beijing time (UTC+8)
-      - cron: "0 16 * * *"
-  workflow_dispatch:
-  pull_request: 
-    branches:
-      - 'main'
-      - '*-dev'
-    paths:
-      - 'tests/e2e/nightly/**'
-      - '.github/workflows/vllm_ascend_test_nightly.yaml'
-
-# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
-# declared as "shell: bash -el {0}" on steps that need to be properly activated.
-# It's used to activate ascend-toolkit environment variables.
-defaults:
-  run:
-    shell: bash -el {0}
-
-# only cancel in-progress runs of the same workflow
-# and ignore the lint / 1 card / 4 cards test type
-concurrency:
-  group: ascend-nightly-${{ github.ref }}
-  #cancel-in-progress: true
-
-jobs:
-  qwen3-32b:
-    strategy:
-      matrix:
-        # should add A3 chip runner when available
-        os: [linux-aarch64-a2-4]
-    # Note (yikun): If CI resource are limited we can split job into two chain jobs
-    # only trigger e2e test after lint passed and the change is e2e related with pull request.
-    uses: ./.github/workflows/_e2e_nightly.yaml
-    with:
-      vllm: v0.11.0
-      runner: ${{ matrix.os }}
-      tests: tests/e2e/nightly/models/test_qwen3_32b.py
-  qwen3-235b-a22b-w8a8-eplb:
-    strategy:
-      matrix:
-        # should add A3 chip runner when available
-        os: [ linux-aarch64-a3-16 ]
-    # Note (yikun): If CI resource are limited we can split job into two chain jobs
-    # only trigger e2e test after lint passed and the change is e2e related with pull request.
-    uses: ./.github/workflows/_e2e_nightly.yaml
-    with:
-      vllm: v0.11.0
-      runner: ${{ matrix.os }}
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
-      tests: tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
-  deepseek-r1-w8a8-eplb:
-    strategy:
-      matrix:
-        # should add A3 chip runner when available
-        os: [ linux-aarch64-a3-16 ]
-    # Note (yikun): If CI resource are limited we can split job into two chain jobs
-    # only trigger e2e test after lint passed and the change is e2e related with pull request.
-    uses: ./.github/workflows/_e2e_nightly.yaml
-    with:
-      vllm: v0.11.0
-      runner: ${{ matrix.os }}
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
-      tests: tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
-
-
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -12,8 +12,8 @@ repos:
    - id: codespell
      args: [
        --toml, pyproject.toml,
-        '--skip', 'tests/e2e/multicard/test_torchair_graph_mode.py,csrc/mla_preprocess/**,tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**,.github/**,typos.toml',
-        '-L', 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,ArchType,AND'
+        '--skip', 'tests/e2e/multicard/test_torchair_graph_mode.py,csrc/**,tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**,.github/**,typos.toml',
+        '-L', 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,ArchType,AND,ND'
      ]
      additional_dependencies:
        - tomli
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,6 +20,13 @@ set(VLLM_ASCEND_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}")

 find_package(Torch REQUIRED)

+run_python(TORCH_VERSION
+  "import torch; print(torch.__version__)" "Failed to locate torch path")
+# check torch version is 2.7.1
+if(NOT ${TORCH_VERSION} VERSION_EQUAL "2.7.1")
+  message(FATAL_ERROR "Expected PyTorch version 2.7.1, but found ${TORCH_VERSION}")
+endif()
+
 set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu")
 set(SOC_VERSION ${SOC_VERSION})
 message(STATUS "Detected SOC version: ${SOC_VERSION}")
@@ -48,15 +55,35 @@ include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
 file(GLOB KERNEL_FILES
 ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernels/*.cpp)

-ascendc_library(vllm_ascend_kernels SHARED
+set(VLLM_ASCEND_CUSTOM_OP
    ${KERNEL_FILES}
    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/mla_preprocess/op_kernel/mla_preprocess_kernel.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp
+)
+
+set(VLLM_ASCEND_CUSTOM_OP_EXCLUDE
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp
+)
+
+if(SOC_VERSION STREQUAL "ASCEND310P3")
+    list(REMOVE_ITEM VLLM_ASCEND_CUSTOM_OP ${VLLM_ASCEND_CUSTOM_OP_EXCLUDE})
+endif()
+
+ascendc_library(vllm_ascend_kernels SHARED
+    ${VLLM_ASCEND_CUSTOM_OP}
 )

 message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}")

-file(GLOB VLLM_ASCEND_SRC
-${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp)
+if(SOC_VERSION STREQUAL "ASCEND310P3")
+    file(GLOB VLLM_ASCEND_SRC
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp)
+else()
+    file(GLOB VLLM_ASCEND_SRC
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/idle_offload/shm_worker.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_host/tiling/tiling_data.cpp)
+endif()

 include_directories(
  ${pybind11_INCLUDE_DIRS}
@@ -66,6 +93,8 @@ include_directories(
  ${ASCEND_HOME_PATH}/include
  ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform
  ${ASCEND_HOME_PATH}/x86_64-linux/include/experiment/platform
+  ${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_host
+  ${CMAKE_CURRENT_SOURCE_DIR}/csrc/idle_offload/include
 )

 set(
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,24 +15,33 @@
 # This file is a part of the vllm-ascend project.
 #

-FROM quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11
+FROM quay.io/ascend/cann:8.3.rc2-910b-ubuntu22.04-py3.11

 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG COMPILE_CUSTOM_KERNELS=1
+ARG MOONCAKE_TAG="v0.3.7.post2"

 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive
 ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}

-RUN apt-get update -y && \
-    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
-    rm -rf /var/cache/apt/* && \
-    rm -rf /var/lib/apt/lists/*
-
 WORKDIR /workspace

 COPY . /vllm-workspace/vllm-ascend/

+# Install Mooncake dependencies
+RUN apt-get update -y && \
+    apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev && \
+    git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
+    cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
+    cd /vllm-workspace/Mooncake && bash mooncake_installer.sh -y && \
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/lib64 && \
+    mkdir -p build && cd build && cmake .. -DUSE_ASCEND_DIRECT=ON && \
+    make -j$(nproc) && make install && \
+    rm -fr /vllm-workspace/Mooncake/build && \
+    rm -rf /var/cache/apt/* && \
+    rm -rf /var/lib/apt/lists/*
+
 RUN pip config set global.index-url ${PIP_INDEX_URL}

 # Install vLLM
@@ -40,7 +49,7 @@ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 ARG VLLM_TAG=v0.11.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
-RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton && \
    python3 -m pip cache purge

@@ -50,11 +59,17 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
    source /usr/local/Ascend/nnal/atb/set_env.sh && \
    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
+    cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \
+    make install && make clean && \
    python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip cache purge

+ENV VLLM_ASCEND_ENABLE_NZ=0 \
+    VLLM_WORKER_MULTIPROC_METHOD=spawn \
+    VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1
+
 # Install modelscope (for fast download) and ray (for multinode)
-RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
+RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
    python3 -m pip cache purge

 CMD ["/bin/bash"]
--- a/Dockerfile.310p
+++ b/Dockerfile.310p
@@ -15,7 +15,7 @@
 # This file is a part of the vllm-ascend project.
 #

-FROM quay.io/ascend/cann:8.2.rc1-310p-ubuntu22.04-py3.11
+FROM quay.io/ascend/cann:8.3.rc2-310p-ubuntu22.04-py3.11

 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG COMPILE_CUSTOM_KERNELS=1
@@ -40,7 +40,7 @@ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 ARG VLLM_TAG=v0.11.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
-RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton && \
    python3 -m pip cache purge

--- a/Dockerfile.310p.openEuler
+++ b/Dockerfile.310p.openEuler
@@ -15,7 +15,7 @@
 # This file is a part of the vllm-ascend project.
 #

-FROM quay.io/ascend/cann:8.2.rc1-310p-openeuler24.03-py3.11
+FROM quay.io/ascend/cann:8.3.rc2-310p-openeuler24.03-py3.11

 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG COMPILE_CUSTOM_KERNELS=1
@@ -38,7 +38,7 @@ ARG VLLM_TAG=v0.11.0

 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
-RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton && \
    python3 -m pip cache purge

--- a/Dockerfile.a3
+++ b/Dockerfile.a3
@@ -15,32 +15,40 @@
 # This file is a part of the vllm-ascend project.
 #

-FROM quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
+FROM quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11

 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG COMPILE_CUSTOM_KERNELS=1
+ARG MOONCAKE_TAG=v0.3.7.post2

+COPY . /vllm-workspace/vllm-ascend/
 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive
 ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}

-RUN apt-get update -y && \
-    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
-    rm -rf /var/cache/apt/* && \
-    rm -rf /var/lib/apt/lists/*
+RUN pip config set global.index-url ${PIP_INDEX_URL}

 WORKDIR /workspace

-COPY . /vllm-workspace/vllm-ascend/
-
-RUN pip config set global.index-url ${PIP_INDEX_URL}
+# Install Mooncake dependencies
+RUN apt-get update -y && \
+    apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev && \
+    git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
+    cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
+    cd /vllm-workspace/Mooncake && bash mooncake_installer.sh -y && \
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/lib64 && \
+    mkdir -p build && cd build && cmake .. -DUSE_ASCEND_DIRECT=ON && \
+    make -j$(nproc) && make install && \
+    rm -fr /vllm-workspace/Mooncake/build && \
+    rm -rf /var/cache/apt/* && \
+    rm -rf /var/lib/apt/lists/*

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 ARG VLLM_TAG=v0.11.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
-RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton && \
    python3 -m pip cache purge

@@ -54,7 +62,7 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
    python3 -m pip cache purge

 # Install modelscope (for fast download) and ray (for multinode)
-RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
+RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
    python3 -m pip cache purge

-CMD ["/bin/bash"]
+CMD ["/bin/bash"]
--- a/Dockerfile.a3.openEuler
+++ b/Dockerfile.a3.openEuler
@@ -15,30 +15,43 @@
 # This file is a part of the vllm-ascend project.
 #

-FROM quay.io/ascend/cann:8.2.rc1-a3-openeuler24.03-py3.11
+FROM quay.io/ascend/cann:8.3.rc2-a3-openeuler24.03-py3.11

 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG COMPILE_CUSTOM_KERNELS=1
+ARG MOONCAKE_TAG="v0.3.7.post2"

 ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}

-RUN yum update -y && \
-    yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
-    rm -rf /var/cache/yum
-
 RUN pip config set global.index-url ${PIP_INDEX_URL}

 WORKDIR /workspace

 COPY . /vllm-workspace/vllm-ascend/

+SHELL ["/bin/bash", "-c"]
+
+RUN yum update -y && \
+    yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
+    git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
+    cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
+    ARCH=$(uname -m) && \
+    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+    export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/devlib:/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/lib64:$LD_LIBRARY_PATH && \
+    export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/${ARCH}-openEuler-linux && \
+    cd /vllm-workspace/Mooncake && \
+    bash mooncake_installer.sh -y && \
+    mkdir -p build && cd build && cmake .. -DUSE_ASCEND_DIRECT=ON && \
+    make -j$(nproc) && make install && \
+    rm -fr /vllm-workspace/Mooncake/build && \
+    rm -rf /var/cache/yum/*
+
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 ARG VLLM_TAG=v0.11.0
-
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
-RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton && \
    python3 -m pip cache purge

@@ -52,7 +65,7 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
    python3 -m pip cache purge

 # Install modelscope (for fast download) and ray (for multinode)
-RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
+RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
    python3 -m pip cache purge

-CMD ["/bin/bash"]
+CMD ["/bin/bash"]
--- a/Dockerfile.backup
+++ b/Dockerfile.backup
@@ -0,0 +1,69 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+FROM quay.io/ascend/cann:8.3.rc2-910b-ubuntu22.04-py3.11
+
+ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
+ARG COMPILE_CUSTOM_KERNELS=1
+ARG MOONCAKE_TAG="v0.3.7.post2"
+
+# Define environments
+ENV DEBIAN_FRONTEND=noninteractive
+ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
+
+WORKDIR /workspace
+
+COPY . /vllm-workspace/vllm-ascend/
+
+# Install build dependencies, then clone, build and install Mooncake (with USE_ASCEND_DIRECT=ON)
+RUN apt-get update -y && \
+    apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev && \
+    git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
+    cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
+    cd /vllm-workspace/Mooncake && bash mooncake_installer.sh -y && \
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/lib64 && \
+    mkdir -p build && cd build && cmake .. -DUSE_ASCEND_DIRECT=ON && \
+    make -j$(nproc) && make install && \
+    rm -fr /vllm-workspace/Mooncake/build && \
+    rm -rf /var/cache/apt/* && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN pip config set global.index-url ${PIP_INDEX_URL}
+
+# Install vLLM
+ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
+ARG VLLM_TAG=v0.11.0
+RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
+# On x86, triton is installed as a vLLM dependency, but it does not work correctly on Ascend, so we uninstall it.
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
+    python3 -m pip uninstall -y triton && \
+    python3 -m pip cache purge
+
+# Install vllm-ascend
+# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
+RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
+    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+    source /usr/local/Ascend/nnal/atb/set_env.sh && \
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
+    python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+    python3 -m pip cache purge
+
+# Install modelscope (for fast download) and ray (for multinode)
+RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
+    python3 -m pip cache purge
+
+CMD ["/bin/bash"]
--- a/Dockerfile.openEuler
+++ b/Dockerfile.openEuler
@@ -15,16 +15,14 @@
 # This file is a part of the vllm-ascend project.
 #

-FROM quay.io/ascend/cann:8.2.rc1-910b-openeuler24.03-py3.11
+FROM quay.io/ascend/cann:8.3.rc2-910b-openeuler24.03-py3.11

 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG COMPILE_CUSTOM_KERNELS=1
+ARG MOONCAKE_TAG="v0.3.7.post2"

 ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}

-RUN yum update -y && \
-    yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
-    rm -rf /var/cache/yum

 RUN pip config set global.index-url ${PIP_INDEX_URL}

@@ -32,13 +30,29 @@ WORKDIR /workspace

 COPY . /vllm-workspace/vllm-ascend/

+SHELL ["/bin/bash", "-c"]
+
+RUN yum update -y && \
+    yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
+    git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
+    cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
+    ARCH=$(uname -m) && \
+    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+    export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/devlib:/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/lib64:$LD_LIBRARY_PATH && \
+    export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/${ARCH}-openEuler-linux && \
+    cd /vllm-workspace/Mooncake && \
+    bash mooncake_installer.sh -y && \
+    mkdir -p build && cd build && cmake .. -DUSE_ASCEND_DIRECT=ON && \
+    make -j$(nproc) && make install && \
+    rm -fr /vllm-workspace/Mooncake/build && \
+    rm -rf /var/cache/yum/*
+
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 ARG VLLM_TAG=v0.11.0
-
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
-RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton && \
    python3 -m pip cache purge

@@ -52,7 +66,7 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
    python3 -m pip cache purge

 # Install modelscope (for fast download) and ray (for multinode)
-RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
+RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
    python3 -m pip cache purge

 CMD ["/bin/bash"]
--- a/README-vllm-ascend.md
+++ b/README-vllm-ascend.md
@@ -0,0 +1,91 @@
+<p align="center">
+  <picture>
+    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm-ascend/main/docs/source/logos/vllm-ascend-logo-text-dark.png">
+    <img alt="vllm-ascend" src="https://raw.githubusercontent.com/vllm-project/vllm-ascend/main/docs/source/logos/vllm-ascend-logo-text-light.png" width=55%>
+  </picture>
+</p>
+
+<h3 align="center">
+vLLM Ascend Plugin
+</h3>
+
+<p align="center">
+| <a href="https://www.hiascend.com/en/"><b>About Ascend</b></a> | <a href="https://vllm-ascend.readthedocs.io/en/latest/"><b>Documentation</b></a> | <a href="https://slack.vllm.ai"><b>#sig-ascend</b></a> | <a href="https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support"><b>Users Forum</b></a> | <a href="https://tinyurl.com/vllm-ascend-meeting"><b>Weekly Meeting</b></a> |
+</p>
+
+<p align="center">
+<a ><b>English</b></a> | <a href="README.zh.md"><b>中文</b></a>
+</p>
+
+---
+*Latest News* 🔥
+- [2025/09] We released the new official version [v0.9.1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.9.1)! Please follow the [official guide](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/tutorials/large_scale_ep.html) to start deploying large-scale Expert Parallelism (EP) on Ascend.
+- [2025/08] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/7n8OYNrCC_I9SJaybHA_-Q) with vLLM and Tencent! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
+- [2025/06] [User stories](https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html) page is now live! It kicks off with LLaMA-Factory/verl/TRL/GPUStack to demonstrate how vLLM Ascend assists Ascend users in enhancing their experience across fine-tuning, evaluation, reinforcement learning (RL), and deployment scenarios.
+- [2025/06] [Contributors](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html) page is now live! All contributions deserve to be recorded; thanks to all contributors.
+- [2025/05] We've released the first official version [v0.7.3](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3)! We collaborated with the vLLM community to publish a blog post sharing our practice: [Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html).
+- [2025/03] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/VtxO9WXa5fC-mKqlxNUJUQ) with vLLM team! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
+- [2025/02] vLLM community officially created [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend) repo for running vLLM seamlessly on the Ascend NPU.
+- [2024/12] We are working with the vLLM community to support [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162).
+---
+## Overview
+
+vLLM Ascend (`vllm-ascend`) is a community maintained hardware plugin for running vLLM seamlessly on the Ascend NPU.
+
+It is the recommended approach for supporting the Ascend backend within the vLLM community. It adheres to the principles outlined in the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162), providing a hardware-pluggable interface that decouples the integration of the Ascend NPU with vLLM.
+
+By using vLLM Ascend plugin, popular open-source models, including Transformer-like, Mixture-of-Expert, Embedding, Multi-modal LLMs can run seamlessly on the Ascend NPU.
+
+## Prerequisites
+
+- Hardware: Atlas 800I A2 Inference series, Atlas A2 Training series, Atlas 800I A3 Inference series, Atlas A3 Training series, Atlas 300I Duo (Experimental)
+- OS: Linux
+- Software:
+  * Python >= 3.9, < 3.12
+  * CANN >= 8.3.rc1 (Ascend HDK version refers to [here](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/releasenote/releasenote_0000.html))
+  * PyTorch == 2.7.1, torch-npu == 2.7.1
+  * vLLM (the same version as vllm-ascend)
+
+## Getting Started
+
+Please use the following recommended versions to get started quickly:
+
+| Version    | Release type | Doc                                  |
+|------------|--------------|--------------------------------------|
+|v0.11.0rc0|Latest release candidate|[QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
+|v0.9.1|Latest stable version|[QuickStart](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html) for more details|
+
+## Contributing
+See [CONTRIBUTING](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html) for more details, which is a step-by-step guide to help you set up development environment, build and test.
+
+We welcome and value any contributions and collaborations:
+- Please let us know if you encounter a bug by [filing an issue](https://github.com/vllm-project/vllm-ascend/issues)
+- Please use [User forum](https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support) for usage questions and help.
+
+## Branch
+
+vllm-ascend has a main branch and dev branches.
+
+- **main**: main branch, corresponds to the vLLM main branch, and is continuously monitored for quality through Ascend CI.
+- **vX.Y.Z-dev**: development branch, created for some new releases of vLLM. For example, `v0.7.3-dev` is the dev branch for the vLLM `v0.7.3` version.
+
+The maintained branches are listed below:
+
+| Branch     | Status       | Note                                 |
+|------------|--------------|--------------------------------------|
+| main       | Maintained   | CI commitment for vLLM main branch and vLLM v0.11.0 tag   |
+| v0.7.1-dev | Unmaintained | Only doc fixes are allowed |
+| v0.7.3-dev | Maintained   | CI commitment for vLLM 0.7.3 version, only bug fix is allowed and no new release tag any more. |
+| v0.9.1-dev | Maintained   | CI commitment for vLLM 0.9.1 version |
+| rfc/feature-name | Maintained | [Feature branches](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html#feature-branches) for collaboration |
+
+Please refer to [Versioning policy](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html) for more details.
+
+## Weekly Meeting
+
+- vLLM Ascend Weekly Meeting: https://tinyurl.com/vllm-ascend-meeting
+- Wednesday, 15:00 - 16:00 (UTC+8, [Convert to your timezone](https://dateful.com/convert/gmt8?t=15))
+
+## License
+
+Apache License 2.0, as found in the [LICENSE](./LICENSE) file.
--- a/README.md
+++ b/README.md
@@ -1,90 +1,50 @@
-<p align="center">
-  <picture>
-    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm-ascend/main/docs/source/logos/vllm-ascend-logo-text-dark.png">
-    <img alt="vllm-ascend" src="https://raw.githubusercontent.com/vllm-project/vllm-ascend/main/docs/source/logos/vllm-ascend-logo-text-light.png" width=55%>
-  </picture>
-</p>
+# vLLM-Ascend Multi-LLM Serving Support

-<h3 align="center">
-vLLM Ascend Plugin
-</h3>
-
-<p align="center">
-| <a href="https://www.hiascend.com/en/"><b>About Ascend</b></a> | <a href="https://vllm-ascend.readthedocs.io/en/latest/"><b>Documentation</b></a> | <a href="https://slack.vllm.ai"><b>#sig-ascend</b></a> | <a href="https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support"><b>Users Forum</b></a> | <a href="https://tinyurl.com/vllm-ascend-meeting"><b>Weekly Meeting</b></a> |
-</p>
-
-<p align="center">
-<a ><b>English</b></a> | <a href="README.zh.md"><b>中文</b></a>
-</p>
-
---
-*Latest News* 🔥
- [2025/09] We released the new official version [v0.9.1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.9.1)! Please follow the [official guide](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/tutorials/large_scale_ep.html) to start deploy large scale Expert Parallelism (EP) on Ascend.
- [2025/08] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/7n8OYNrCC_I9SJaybHA_-Q) with vLLM and Tencent! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
- [2025/06] [User stories](https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html) page is now live! It kicks off with ‌LLaMA-Factory/verl//TRL/GPUStack‌ to demonstrate how ‌vLLM Ascend‌ assists Ascend users in enhancing their experience across fine-tuning, evaluation, reinforcement learning (RL), and deployment scenarios.
- [2025/06] [Contributors](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html) page is now live! All contributions deserve to be recorded, thanks for all contributors.
- [2025/05] We've released first official version [v0.7.3](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3)! We collaborated with the vLLM community to publish a blog post sharing our practice: [Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html).
- [2025/03] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/VtxO9WXa5fC-mKqlxNUJUQ) with vLLM team! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
- [2025/02] vLLM community officially created [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend) repo for running vLLM seamlessly on the Ascend NPU.
- [2024/12] We are working with the vLLM community to support [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162).
---
 ## Overview

-vLLM Ascend (`vllm-ascend`) is a community maintained hardware plugin for running vLLM seamlessly on the Ascend NPU.
+This repository is a modified version of [vLLM-Ascend](https://github.com/vllm-project/vllm-ascend) designed to enable multiple large language models (LLMs) to share one Ascend NPU.

-It is the recommended approach for supporting the Ascend backend within the vLLM community. It adheres to the principles outlined in the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162), providing a hardware-pluggable interface that decouples the integration of the Ascend NPU with vLLM.
+The key feature of this project is efficient memory coordination, enabling multiple vLLM instances to share and dynamically hold the Ascend NPU's physical memory.
+When an instance is idle, its model parameters are offloaded to host memory.
+Upon a new inference request, the model parameters are quickly restored to the NPU’s memory (if they are not already resident), without the need to re-initialize the engine and load the model from scratch. (For Qwen3-8B, an actual restore adds only 0.3s of latency to TTFT.)

-By using vLLM Ascend plugin, popular open-source models, including Transformer-like, Mixture-of-Expert, Embedding, Multi-modal LLMs can run seamlessly on the Ascend NPU.

-## Prerequisites
+## Features

- Hardware: Atlas 800I A2 Inference series, Atlas A2 Training series, Atlas 800I A3 Inference series, Atlas A3 Training series, Atlas 300I Duo (Experimental)
- OS: Linux
- Software:
-  * Python >= 3.9, < 3.12
-  * CANN >= 8.2.rc1 (Ascend HDK version refers to [here](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html))
-  * PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724
-  * vLLM (the same version as vllm-ascend)
+- **Shared NPU Usage**: Multiple vLLM instances can access the same Ascend NPU, allowing for multi-LLM serving of different LLMs.
+- **Fast Memory Restore**: We decouple virtual and physical memory allocation. Physical NPU memory is allocated, exported, and shared with other LLM engines, so an LLM engine can restore quickly without reinitialization or new memory allocation.

-## Getting Started

-Please use the following recommended versions to get started quickly:
+## Installation

-| Version    | Release type | Doc                                  |
-|------------|--------------|--------------------------------------|
-|v0.11.0rc0|Latest release candidate|[QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
-|v0.9.1|Latest stable version|[QuickStart](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html) for more details|
+### Build from Dockerfile

-## Contributing
-See [CONTRIBUTING](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html) for more details, which is a step-by-step guide to help you set up development environment, build and test.
+Clone this repository, then build the image from the repository root:

-We welcome and value any contributions and collaborations:
- Please let us know if you encounter a bug by [filing an issue](https://github.com/vllm-project/vllm-ascend/issues)
- Please use [User forum](https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support) for usage questions and help.
+```bash
+docker build -t vllm-ascend-multi-llm:latest -f ./Dockerfile .
+```

-## Branch
+## Usage

-vllm-ascend has main branch and dev branch.
+> [!NOTE]
+> Some platforms may not allow multiple containers to share the same Ascend NPU. You can try using a privileged container to bypass this restriction: mount all NPUs, then set the environment variable `ASCEND_RT_VISIBLE_DEVICES` to select the target device.

- **main**: main branch，corresponds to the vLLM main branch, and is continuously monitored for quality through Ascend CI.
- **vX.Y.Z-dev**: development branch, created with part of new releases of vLLM. For example, `v0.7.3-dev` is the dev branch for vLLM `v0.7.3` version.
+0. To share an NPU, processes coordinate via shared memory (shm), so every container must run with host IPC enabled (e.g. `docker run --ipc=host`).
+1. Start a daemon process in a standalone container, by running `vllm_vnpu_daemon` installed inside the image.
+2. Start LLM services with this image, following the official usage instructions.

-Below is maintained branches:

-| Branch     | Status       | Note                                 |
-|------------|--------------|--------------------------------------|
-| main       | Maintained   | CI commitment for vLLM main branch and vLLM v0.11.0 tag   |
-| v0.7.1-dev | Unmaintained | Only doc fixed is allowed |
-| v0.7.3-dev | Maintained   | CI commitment for vLLM 0.7.3 version, only bug fix is allowed and no new release tag any more. |
-| v0.9.1-dev | Maintained   | CI commitment for vLLM 0.9.1 version |
-| rfc/feature-name | Maintained | [Feature branches](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html#feature-branches) for collaboration |
+## Limitations

-Please refer to [Versioning policy](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html) for more details.
+- This project currently supports sharing only a single NPU. This is also limited by the fact that HCCL cannot be shared; we haven't figured out how to bypass HCCL. *Help wanted*.
+- The prefix cache will be reset when the LLM is restored, since we simply discard the KV cache when the LLM is offloaded.

-## Weekly Meeting

- vLLM Ascend Weekly Meeting: https://tinyurl.com/vllm-ascend-meeting
- Wednesday, 15:00 - 16:00 (UTC+8, [Convert to your timezone](https://dateful.com/convert/gmt8?t=15))
+## Roadmap
+- [ ] Space-sharing.
+- [ ] ...
+

 ## License

--- a/README.zh.md
+++ b/README.zh.md
@@ -43,8 +43,8 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NP
 - 操作系统：Linux
 - 软件：
  * Python >= 3.9, < 3.12
-  * CANN >= 8.2.rc1 (Ascend HDK 版本参考[这里](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html))
-  * PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724
+  * CANN >= 8.3.rc1 (Ascend HDK 版本参考[这里](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/releasenote/releasenote_0000.html))
+  * PyTorch == 2.7.1, torch-npu == 2.7.1
  * vLLM (与vllm-ascend版本一致)

 ## 开始使用
--- a/csrc/batch_matmul_transpose/op_host/batch_matmul_transpose.h
+++ b/csrc/batch_matmul_transpose/op_host/batch_matmul_transpose.h
@@ -0,0 +1,123 @@
+#include <iostream>
+#include <string>
+#include "acl/acl.h"
+#include "kernel_tiling/kernel_tiling.h"
+#include "tiling/platform/platform_ascendc.h"
+#include "tiling/tiling_data.h"
+#include "common_tiling.h"
+
+
+namespace bmm_trans {
+using namespace pp_matmul;
+
+// Name -> code tables; `inline` (C++17) keeps these header-defined objects from being multiply defined across TUs.
+inline std::unordered_map<c10::string_view, uint16_t> quantModeMap = {  // quant_mode name -> numeric code
+    {"per_channel_symm", 0},
+    {"per_channel_asymm", 1},
+    {"per_token_symm", 2},
+};
+// format_mode name -> numeric code (cast to TensorFormat by the tiling function in this header).
+inline std::unordered_map<c10::string_view, uint16_t> formatModeMap = {
+    {"ND", 0},
+    {"NZ", 1},
+};
+// Supported ATen element types -> TensorDType used when filling MatMulInfo.
+inline std::unordered_map<c10::ScalarType, TensorDType> atType2tensorDType = {
+    {at::ScalarType::BFloat16, TensorDType::TENSOR_DTYPE_BF16},
+    {at::ScalarType::Half, TensorDType::TENSOR_DTYPE_FLOAT16}};
+// Number of tiling slots in the static device buffer; the tiling for opShape.m == m is stored in slot m - 1.
+constexpr uint32_t MAX_CAPTURE_NUM = 1024;
+
+// Resolves mode_opt (or default_mode when it is unset) to its code in mode_map; mode_name prefixes the error text.
+template <typename MapType>
+inline int GetModeVal(const MapType &mode_map, c10::optional<c10::string_view> mode_opt, c10::string_view default_mode,
+                      const char *mode_name)
+{
+    const c10::string_view selected = mode_opt.value_or(default_mode);
+    const auto entry = mode_map.find(selected);
+    // An unsupported mode is a hard error raised through TORCH_CHECK; there is no fallback to the default.
+    TORCH_CHECK(entry != mode_map.end(), mode_name, c10::str(": Unsupported mode value ", selected));
+    return entry->second;
+}
+
+// Computes the pp_matmul tiling for tensor_a x tensor_b and copies it into a device-side tiling slot.
+// tensor_a must be 3-D, indexed here as [m, batch, k]; tensor_b is 3-D ("ND") or 4-D ("NZ"); tensor_a,
+// tensor_b and tensor_c must share one dtype (Half or BFloat16). format_mode defaults to "ND" and
+// quant_mode to "per_channel_symm". Returns {byte tensor over this call's slot, block_dim}. Raises via
+// TORCH_CHECK on dtype/shape/mode errors, when m is outside [1, MAX_CAPTURE_NUM], or when the copy fails.
+inline std::tuple<at::Tensor, uint32_t> batch_matmul_transpose_tiling(const at::Tensor &tensor_a, const at::Tensor &tensor_b, at::Tensor &tensor_c,
+                                     c10::optional<c10::string_view> format_mode,
+                                     c10::optional<c10::string_view> quant_mode)
+{
+    const auto tensorAShape = tensor_a.sizes();
+    const auto tensorBShape = tensor_b.sizes();
+    uint32_t n = 0;
+    uint32_t block_dim = 0;
+    HardwareInfo hwInfo;
+    std::map<c10::ScalarType, float> dTypeMap = {{at::ScalarType::Half, 2.0}, {at::ScalarType::BFloat16, 2.0}};
+
+    const at::ScalarType aType = tensor_a.scalar_type();
+    const at::ScalarType bType = tensor_b.scalar_type();
+    const at::ScalarType cType = tensor_c.scalar_type();
+    TORCH_CHECK(aType == bType && bType == cType, "tensor type is not the same");
+    TORCH_CHECK((aType == at::ScalarType::BFloat16) || (aType == at::ScalarType::Half),
+                "tensor type only support half or bf16");
+
+    TensorFormat formatMode = static_cast<TensorFormat>(GetModeVal(formatModeMap, format_mode, "ND", "format_mode"));
+    MatMul::QuantMode quantMode =
+        static_cast<MatMul::QuantMode>(GetModeVal(quantModeMap, quant_mode, "per_channel_symm", "quant_mode"));
+
+    // tensor_a is indexed as [m, batch, k] below, so the message names the rank requirement being checked.
+    TORCH_CHECK(tensorAShape.size() == 3, "tensor_a shape should be dim3");
+    if (formatMode == TensorFormat::TENSOR_FORMAT_ND) {
+        TORCH_CHECK(tensorBShape.size() == 3, "tensor shape should be dim3 in ND format");
+        TORCH_CHECK(tensorAShape[2] == tensorBShape[1], "tensor shape is wrong");
+        n = static_cast<uint32_t>(tensorBShape[2]);
+    } else {
+        TORCH_CHECK(tensorBShape.size() == 4, "tensor shape should be dim4 in nz format");
+        TORCH_CHECK(tensorAShape[2] == tensorBShape[2], "tensor shape is wrong");
+        n = static_cast<uint32_t>(tensorBShape[1] * tensorBShape[3]);
+    }
+    TORCH_CHECK(tensorAShape[1] == tensorBShape[0], "tensor shape is wrong");
+
+    OpShape opShape = {.batchSize = static_cast<uint32_t>(tensorAShape[1]),
+                       .m = static_cast<uint32_t>(tensorAShape[0]),
+                       .k = static_cast<uint32_t>(tensorAShape[2]),
+                       .n = n};
+    pp_matmul::PpMatmulTilingData matmulTilingData = {
+        .opShape = opShape,
+    };
+    auto dType = atType2tensorDType[aType];
+    MatMulInfo mmInfo = {.batchSize = opShape.batchSize,
+                         .m = opShape.m,
+                         .k = opShape.k,
+                         .n = opShape.n,
+                         .dtypeA = dType,
+                         .dtypeB = dType,
+                         .dtypeC = dType,
+                         .formatB = formatMode,
+                         .mmType = MatMul::MatMulType::MATMUL_EIN_SUM,
+                         .inDtype = dTypeMap[aType],
+                         .outDtype = dTypeMap[cType],
+                         .quantMode = quantMode};
+    GetPpMatmulTiling(mmInfo, hwInfo, block_dim, matmulTilingData);
+    host_utils::PpMatmulTilingCheck(matmulTilingData);
+
+    // Each m in [1, MAX_CAPTURE_NUM] owns slot (m - 1) of the static buffer; reject anything else up front.
+    const int32_t batchIdx = opShape.m - 1;
+    TORCH_CHECK(batchIdx >= 0 && batchIdx < static_cast<int32_t>(MAX_CAPTURE_NUM), "batchIdx is out of range: ", batchIdx);
+    const uint32_t tilingSize = sizeof(pp_matmul::PpMatmulTilingData);
+    // NOTE(review): the buffer is created once, on the device of the first call's tensor_a, and reused by
+    // every later call regardless of which device that call's tensors live on -- confirm single-device use.
+    static auto global_tiling_data = at::empty(
+        {tilingSize * MAX_CAPTURE_NUM}, at::TensorOptions().dtype(at::kByte).device(tensor_a.options().device()));
+    uint8_t *slot = global_tiling_data.data_ptr<uint8_t>() + (tilingSize * batchIdx);
+    // A failed host-to-device copy would leave the slot without this call's tiling, so report it.
+    const aclError copyRet = aclrtMemcpy(slot, tilingSize, &matmulTilingData, tilingSize, ACL_MEMCPY_HOST_TO_DEVICE);
+    TORCH_CHECK(copyRet == ACL_SUCCESS, "aclrtMemcpy of tiling data failed, error code: ", copyRet);
+    at::Tensor tiling_tensor = at::from_blob(slot, tilingSize, at::kByte);
+    return std::make_tuple(tiling_tensor, block_dim);
+}
+
+}
+
--- a/csrc/batch_matmul_transpose/op_host/common.h
+++ b/csrc/batch_matmul_transpose/op_host/common.h
@@ -0,0 +1,57 @@
+
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UTILS_COMMON_H
+#define UTILS_COMMON_H
+
+namespace host_utils {
+// Multiples that alinInt64Count and alinInt32Count round their argument up to.
+constexpr uint32_t BLK_SIZE_ALIN_FOR_INT64 = 4;
+constexpr uint32_t BLK_SIZE_ALIN_FOR_INT32 = 8;
+
+// Rounds count up to the nearest multiple of BLK_SIZE_ALIN_FOR_INT64.
+inline uint64_t alinInt64Count(uint64_t count)
+{
+    const uint64_t blocks = (count + BLK_SIZE_ALIN_FOR_INT64 - 1) / BLK_SIZE_ALIN_FOR_INT64;
+    return blocks * BLK_SIZE_ALIN_FOR_INT64;
+}
+// Rounds count up to the nearest multiple of BLK_SIZE_ALIN_FOR_INT32.
+inline uint64_t alinInt32Count(uint64_t count)
+{
+    const uint64_t blocks = (count + BLK_SIZE_ALIN_FOR_INT32 - 1) / BLK_SIZE_ALIN_FOR_INT32;
+    return blocks * BLK_SIZE_ALIN_FOR_INT32;
+}
+// Computes (dividend + divisor - 1) / divisor. A zero divisor yields UINT32_MAX (converted to T) instead.
+template <typename T>
+inline T CeilDiv(const T dividend, const T divisor)
+{
+    if (divisor != 0) {
+        return (dividend + divisor - 1) / divisor;
+    }
+    return UINT32_MAX;
+}
+// Rounds val up to a multiple of align. Yields 0 when align is 0 or when val + align - 1 compares less than val.
+template <typename T>
+inline T RoundUp(const T val, const T align = 16)
+{
+    if (align != 0 && !(val + align - 1 < val)) {
+        return (val + align - 1) / align * align;
+    }
+    return 0;
+}
+// Rounds val down to a multiple of align. Yields 0 when align is 0.
+template <typename T>
+inline T RoundDown(const T val, const T align = 16)
+{
+    return (align == 0) ? 0 : val / align * align;
+}
+}  // namespace host_utils
+#endif  // UTILS_COMMON_H
--- a/csrc/batch_matmul_transpose/op_host/common_tiling.h
+++ b/csrc/batch_matmul_transpose/op_host/common_tiling.h
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#ifndef COMMMON_TILING_H
+#define COMMMON_TILING_H
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <type_traits>
+#include "common.h"
+#include "tiling/platform/platform_ascendc.h"
+
+namespace host_utils {
+
+// Shared constants for the host-side matmul tiling search.
+
+// Element sizes in bytes. CostFunc scales its L2 footprint estimate by FP16_SIZE;
+// TilingFunc sizes the candidate output tile with FP32_SIZE.
+constexpr uint32_t FP16_SIZE = 2;
+constexpr uint32_t FP32_SIZE = 4;
+// Smallest base-tile edge; the tiling search starts its candidate sizes here.
+constexpr uint32_t BLOCK_SIZE = 16;
+// K-axis rounding granularity used for int8 inputs (see PpMatmulTilingData::End).
+constexpr uint32_t BLOCK_SIZE_INT8_K = 32;
+// Multiplicative step between successive candidate tile sizes in TilingFunc.
+constexpr uint32_t BASE_BLOCK_STEP = 2;
+// Upper bounds applied to tile edges: TilingFunc divides AXES_ALIGN_SIZE by the
+// input element size, and GetN0TilingLimit returns one of the two values.
+constexpr uint32_t AXES_ALIGN_SIZE = 512;
+constexpr uint32_t AXES_ALIGN_SIZE_INT8 = 256;
+// NOTE(review): not referenced in the visible part of this patch; presumably
+// the rank of ND / NZ shape descriptors -- confirm.
+constexpr uint32_t ND_SHAPE_SIZE = 2;
+constexpr uint32_t NZ_SHAPE_SIZE = 4;
+// K-axis rounding granularity applied when the K budget is at least this large
+// (see PpMatmulTilingData::End); the int8 variant is used for int8 inputs.
+constexpr uint32_t CUBE_BLOCK_SIZE = 256;
+constexpr uint32_t CUBE_BLOCK_SIZE_INT8 = 512;
+// Byte budget (256 KiB) shared by the A and B tiles when deriving k0.
+constexpr uint32_t L1AB_PINGPONG_BUFFER_LEN = 262144;
+constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_INT8 = 131072 * 2;  // 256 KB
+constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_FP16 = 131072;      // 128 KB
+constexpr uint32_t L1AB_PINGPONG_BUFFER_LEN_INT8_SPARSE = 160 * 1024;
+// Limit on the fp32 output-tile size applied by IsExceedTilingLimit on ASCEND_910A.
+constexpr uint32_t UB_LIMIT_SIZE_910A = 128 * 1024;
+
+// SoC families the tiling code distinguishes between.
+enum class PlatformType {
+    ASCEND_310P,
+    ASCEND_910A,
+    ASCEND_910B,
+    ASCEND_910C,
+    PLATFORM_INVALID  // placeholder for "unknown / not detected"
+};
+
+// Process-wide snapshot of the platform parameters used by the tiling code.
+//
+// The single instance is built on the first call to Instance() by querying
+// platform_ascendc::PlatformAscendCManager, and is read-only afterwards.
+// The type is neither copyable nor movable.
+//
+// Every field has an in-class default so that a query which does not write its
+// output leaves a well-defined value (0 / PLATFORM_INVALID) instead of an
+// indeterminate one.
+struct PlatformInfo {
+public:
+    // Returns the shared instance, constructing it on first use.
+    static const PlatformInfo &Instance()
+    {
+        static PlatformInfo platformInfo;
+        return platformInfo;
+    }
+
+    PlatformType socType{PlatformType::PLATFORM_INVALID};
+    // Core counts as reported by GetCoreNum / GetCoreNumAic / GetCoreNumAiv.
+    uint32_t coreNum{0};
+    uint32_t coreNumAic{0};
+    uint32_t coreNumAiv{0};
+    // Memory sizes as reported by GetCoreMemSize for the matching CoreMemType.
+    uint64_t ubSize{0};
+    uint64_t l1Size{0};
+    uint64_t l2Size{0};
+    uint64_t l0aSize{0};
+    uint64_t l0bSize{0};
+    uint64_t l0cSize{0};
+
+private:
+    PlatformInfo()
+    {
+        auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance();
+        // TODO Hard coding set to 910_93xx, parse using aclrtGetSocName is better
+        socType = PlatformType::ASCEND_910C;
+        coreNum = ascendcPlatform->GetCoreNum();
+        coreNumAic = ascendcPlatform->GetCoreNumAic();
+        coreNumAiv = ascendcPlatform->GetCoreNumAiv();
+        ascendcPlatform->GetCoreMemSize(platform_ascendc::CoreMemType::UB, ubSize);
+        ascendcPlatform->GetCoreMemSize(platform_ascendc::CoreMemType::L1, l1Size);
+        ascendcPlatform->GetCoreMemSize(platform_ascendc::CoreMemType::L2, l2Size);
+        ascendcPlatform->GetCoreMemSize(platform_ascendc::CoreMemType::L0_A, l0aSize);
+        ascendcPlatform->GetCoreMemSize(platform_ascendc::CoreMemType::L0_B, l0bSize);
+        ascendcPlatform->GetCoreMemSize(platform_ascendc::CoreMemType::L0_C, l0cSize);
+    }
+
+    PlatformInfo(const PlatformInfo &) = delete;
+    PlatformInfo &operator=(const PlatformInfo &) = delete;
+    PlatformInfo(PlatformInfo &&) = delete;
+    PlatformInfo &operator=(PlatformInfo &&) = delete;
+};
+
+// Upper bound for the n0 tile edge.
+//   - compressFlag set: tilingN * BLOCK_SIZE, capped at AXES_ALIGN_SIZE_INT8.
+//   - otherwise: AXES_ALIGN_SIZE on ASCEND_310P / ASCEND_910A, AXES_ALIGN_SIZE_INT8 elsewhere.
+inline __attribute__((always_inline)) uint32_t GetN0TilingLimit(bool compressFlag, uint32_t tilingN,
+                                                                const PlatformType &platformType)
+{
+    if (compressFlag) {
+        const uint32_t requested = tilingN * BLOCK_SIZE;
+        return requested < AXES_ALIGN_SIZE_INT8 ? requested : AXES_ALIGN_SIZE_INT8;
+    }
+    const bool is310pOr910a =
+        platformType == PlatformType::ASCEND_310P || platformType == PlatformType::ASCEND_910A;
+    return is310pOr910a ? AXES_ALIGN_SIZE : AXES_ALIGN_SIZE_INT8;
+}
+
+// Starting value for the n0 tile edge.
+// Without compression this is BLOCK_SIZE. With compression it is tilingN * BLOCK_SIZE,
+// unless that exceeds opShape.n, in which case opShape.n rounded up to 16 is used.
+template <typename OpShareType>
+inline __attribute__((always_inline)) uint32_t GetN0TilingInit(const OpShareType &opShape, bool compressFlag,
+                                                               uint32_t tilingN)
+{
+    const uint32_t rnd = 16;
+    if (!compressFlag) {
+        return BLOCK_SIZE;
+    }
+    if (tilingN * BLOCK_SIZE > opShape.n) {
+        return RoundUp<uint32_t>(opShape.n, rnd);
+    }
+    return tilingN * BLOCK_SIZE;
+}
+
+// Returns true when a candidate tile must be rejected:
+//   - the edge that maps to n0 (axes0 when PRI_FLAG, priAxes0 otherwise) exceeds n0TilingLimit, or
+//   - the platform is ASCEND_910A and basicBlockSize exceeds UB_LIMIT_SIZE_910A.
+template <bool PRI_FLAG>
+inline __attribute__((always_inline)) bool IsExceedTilingLimit(uint32_t axes0, uint32_t priAxes0,
+                                                               uint32_t n0TilingLimit, PlatformType platformType,
+                                                               uint32_t basicBlockSize)
+{
+    const uint32_t n0Candidate = PRI_FLAG ? axes0 : priAxes0;
+    if (n0Candidate > n0TilingLimit) {
+        return true;
+    }
+    return platformType == PlatformType::ASCEND_910A && basicBlockSize > UB_LIMIT_SIZE_910A;
+}
+
+// Stores a candidate tile into opShape.m0 / opShape.n0.
+// With PRI_FLAG the primary edge is m0 and the secondary edge is n0; without it the roles swap.
+template <bool PRI_FLAG, typename OpShareType>
+inline __attribute__((always_inline)) void SetOpShapeAxesInfo(OpShareType &opShape, uint32_t priAxes0, uint32_t axes0)
+{
+    if constexpr (PRI_FLAG) {
+        opShape.m0 = priAxes0;
+        opShape.n0 = axes0;
+    } else {
+        opShape.m0 = axes0;
+        opShape.n0 = priAxes0;
+    }
+}
+
+// Scores the base tile currently stored in shape.m0 / shape.n0; lower is better.
+//
+// The score is 1 / (aCoef * n0) + 1 / (bCoef * m0). aCoef and bCoef are 1 by default and
+// are replaced by l2BandWidth / hbmBandWidth when the estimated bytes of A (resp. B)
+// touched in one wave of cores exceed hwInfor.l2Size.
+//
+// Returns 1 when either loop count is 0; TilingFunc only accepts scores strictly below
+// its initial minimum of 1, so such a candidate is never selected.
+//
+// The footprint estimates are computed in 64 bits: the former 32-bit products
+// `mOnce * k * FP16_SIZE` / `nOnce * k * FP16_SIZE` could wrap for large shapes and
+// make an oversized footprint look as if it fitted into L2.
+template <typename HardwareType, typename OpShapeType>
+inline __attribute__((always_inline)) float CostFunc(const HardwareType &hwInfor, OpShapeType &shape)
+{
+    float aCoef = 1;
+    float bCoef = 1;
+    float bwCoef = static_cast<float>(hwInfor.l2BandWidth) / static_cast<float>(hwInfor.hbmBandWidth);
+    uint32_t mLoop = CeilDiv(shape.m, shape.m0);
+    uint32_t nLoop = CeilDiv(shape.n, shape.n0);
+    if (mLoop == 0 || nLoop == 0) {
+        return 1;
+    }
+    uint32_t coreNeed = shape.batchSize * mLoop * nLoop;
+    uint32_t blockDim = std::min(coreNeed, hwInfor.coreNum);
+    // Extent of M / N covered by one wave of `blockDim` cores.
+    uint32_t mOnce = blockDim < nLoop ? shape.m0 : blockDim / nLoop * shape.m0;
+    uint32_t nOnce = blockDim < nLoop ? hwInfor.coreNum * shape.n0 : shape.n;
+    const uint64_t aBytes = static_cast<uint64_t>(mOnce) * shape.k * FP16_SIZE;
+    const uint64_t bBytes = static_cast<uint64_t>(nOnce) * shape.k * FP16_SIZE;
+    if (aBytes > hwInfor.l2Size) {
+        aCoef = bwCoef;
+    }
+    if (bBytes > hwInfor.l2Size) {
+        bCoef = bwCoef;
+    }
+    return 1 / (aCoef * static_cast<float>(shape.n0)) + 1 / (bCoef * static_cast<float>(shape.m0));
+}
+
+// Searches for the base tile (m0, n0) with the lowest CostFunc score and records it in
+// `tilingParam` through SetBaseOp.
+//
+// PRI_FLAG selects which dimension is the "primary" axis (outer loop): m when true,
+// n when false. Candidate edges start at BLOCK_SIZE (or the value from GetN0TilingInit
+// for the n edge) and are multiplied by BASE_BLOCK_STEP each step.
+//
+// Parameters:
+//   opShape      - problem shape; its m0 / n0 are overwritten with every candidate that
+//                  is scored, so on return they hold the last scored candidate, which
+//                  is not necessarily the best one.
+//   tilingParam  - receives the best candidate via SetBaseOp.
+//   hwInfor      - provides l0cSize and coreNum, and is passed on to CostFunc.
+//   mmInfo       - provides inDtype and isInt8.
+//   compressFlag - forwarded to GetN0TilingInit / GetN0TilingLimit.
+//   tilingN      - forwarded to GetN0TilingInit / GetN0TilingLimit.
+//
+// If no candidate scores strictly below 1, SetBaseOp is never called.
+template <bool PRI_FLAG, typename OpShareType, typename TilingType, typename HardwareType, typename MatMulInfoType>
+void TilingFunc(OpShareType &opShape, TilingType &tilingParam, const HardwareType &hwInfor,
+                const MatMulInfoType &mmInfo, bool compressFlag = false, const uint32_t tilingN = 1)
+{
+    // Only candidates whose cost is strictly below this value are accepted.
+    float costMin = 1;
+    const float CONST_2 = 2.0;
+    const uint32_t ROUND_CONST_16 = 16;
+    // NOTE(review): `log` is the natural logarithm. Combined with pow(2, ...) this looks
+    // like an intended round-up to a power of two, which would need log2 -- confirm the
+    // intent before changing it, since it alters the search bound `axes` below.
+    uint32_t roundBase = static_cast<uint32_t>(
+        pow(2, ceil(log(CeilDiv(PRI_FLAG ? opShape.n : opShape.m, ROUND_CONST_16)))) * ROUND_CONST_16);
+    // Upper search bounds: primary axis rounded up to 16, secondary axis rounded up to roundBase.
+    uint32_t priAxes = RoundUp<uint32_t>(PRI_FLAG ? opShape.m : opShape.n, ROUND_CONST_16);
+    uint32_t axes = RoundUp<uint32_t>(PRI_FLAG ? opShape.n : opShape.m, roundBase);
+    // Largest allowed tile edge; halved for int8 on ASCEND_310P / ASCEND_910A.
+    float axes0Max = static_cast<float>(AXES_ALIGN_SIZE) / mmInfo.inDtype;
+    auto platformType = PlatformInfo::Instance().socType;
+    if (mmInfo.isInt8 && (platformType == PlatformType::ASCEND_310P || platformType == PlatformType::ASCEND_910A)) {
+        axes0Max /= CONST_2;
+    }
+
+    uint32_t n0TilingInit = GetN0TilingInit(opShape, compressFlag, tilingN);
+    uint32_t n0TilingLimit = GetN0TilingLimit(compressFlag, tilingN, platformType);
+    // Whichever loop variable maps to n0 starts from n0TilingInit; the other starts from BLOCK_SIZE.
+    uint32_t priAxes0Init = PRI_FLAG ? BLOCK_SIZE : n0TilingInit;
+    uint32_t axes0Init = PRI_FLAG ? n0TilingInit : BLOCK_SIZE;
+    for (uint32_t priAxes0 = priAxes0Init; priAxes0 <= priAxes && priAxes0 <= axes0Max; priAxes0 *= BASE_BLOCK_STEP) {
+        for (uint32_t axes0 = axes0Init; axes0 <= axes && axes0 <= axes0Max; axes0 *= BASE_BLOCK_STEP) {
+            // Size of the fp32 output tile; it must not exceed hwInfor.l0cSize.
+            uint32_t basicBlockSize = priAxes0 * axes0 * FP32_SIZE;
+            if (basicBlockSize > hwInfor.l0cSize) {
+                continue;
+            }
+            if (mmInfo.isInt8 &&
+                IsExceedTilingLimit<PRI_FLAG>(axes0, priAxes0, n0TilingLimit, platformType, basicBlockSize)) {
+                continue;
+            }
+            // CostFunc reads m0 / n0 from opShape, so the candidate is written there first.
+            SetOpShapeAxesInfo<PRI_FLAG>(opShape, priAxes0, axes0);
+            float cost = CostFunc<HardwareType, OpShareType>(hwInfor, opShape);
+            if (cost >= costMin) {
+                continue;
+            }
+            costMin = cost;
+            // PpMatmulTilingData::SetBaseOp takes the extra mmInfo argument; other tiling types do not.
+            if constexpr (std::is_same<TilingType, pp_matmul::PpMatmulTilingData>::value) {
+                tilingParam.SetBaseOp(hwInfor.coreNum, opShape.m0, opShape.n0, mmInfo);
+            } else {
+                tilingParam.SetBaseOp(hwInfor.coreNum, opShape.m0, opShape.n0);
+            }
+        }
+    }
+}
+
+// Chooses the swizzle direction and swizzle count for the given tiling and stores both
+// in `tilingData` (swizzlDirect, swizzlCount). Returns the chosen direction:
+// 1 = "Nz", 0 = "Zn".
+//
+// For every count i in [1, blockDim], with c = ceil(blockDim / i), the direction test
+// `i * n0 + m < m0 * c + n` picks the branch, and the matching cost
+// (n0 * i + m0 * c for Nz, m0 * i + n0 * c for Zn) is compared against the running minimum,
+// which starts at m * k + k * n. The Nz branch accepts ties (<=); the Zn branch does not (<).
+//
+// NOTE(review): the returned direction is the one evaluated for the last i (i == blockDim),
+// not necessarily the one that produced the minimum cost -- confirm this is intended.
+template <typename PpTilingDataType>
+uint32_t Swizzl(PpTilingDataType &tilingData)
+{
+    const float m0 = tilingData.opShape.m0;
+    const float n0 = tilingData.opShape.n0;
+    const float m = tilingData.opShape.m;
+    const float k = tilingData.opShape.k;
+    const float n = tilingData.opShape.n;
+
+    uint32_t direction = 0;
+    uint32_t bestCount = 1;
+    float bestCost = m * k + k * n;
+
+    for (uint32_t i = 1; i <= tilingData.blockDim; ++i) {
+        const int c = static_cast<int32_t>((tilingData.blockDim + i - 1) / i);
+        // B0 + A < A0 + B
+        const bool useNz = i * n0 + m < m0 * c + n;
+        direction = useNz ? 1 : 0;
+        const float cost = useNz ? (n0 * i + m0 * c) : (m0 * i + n0 * c);
+        const bool accepted = useNz ? (cost <= bestCost) : (cost < bestCost);
+        if (accepted) {
+            bestCost = cost;
+            bestCount = i;
+        }
+    }
+
+    tilingData.swizzlDirect = direction;
+    tilingData.swizzlCount = bestCount;
+    return direction;
+}
+
+// Sanity-checks a finished tiling: every base tile edge (m0, k0, n0), every loop count
+// (mLoop, kLoop, nLoop) and blockDim must be greater than zero. Each condition is
+// enforced with TORCH_CHECK, whose message names the offending field.
+template <typename PpTilingDataType>
+inline __attribute__((always_inline)) void PpMatmulTilingCheck(const PpTilingDataType &tilingData)
+{
+    TORCH_CHECK(tilingData.opShape.m0 > 0, "m0 is invalid");
+    TORCH_CHECK(tilingData.opShape.k0 > 0, "k0 is invalid");
+    TORCH_CHECK(tilingData.opShape.n0 > 0, "n0 is invalid");
+    TORCH_CHECK(tilingData.mLoop > 0, "mLoop is invalid");
+    TORCH_CHECK(tilingData.kLoop > 0, "kLoop is invalid");
+    TORCH_CHECK(tilingData.nLoop > 0, "nLoop is invalid");
+    // The message previously said "nLoop", which pointed at the wrong field.
+    TORCH_CHECK(tilingData.blockDim > 0, "blockDim is invalid");
+}
+}  // namespace host_utils
+#endif
--- a/csrc/batch_matmul_transpose/op_host/tiling/tiling_data.cpp
+++ b/csrc/batch_matmul_transpose/op_host/tiling/tiling_data.cpp
@@ -0,0 +1,155 @@
+#include <map>
+#include "tiling_data.h"
+#include "common.h"
+#include "common_tiling.h"
+
+namespace pp_matmul {
+
+// Bytes subtracted from the L1 A/B budget for int8 inputs when deriving k0 (see End()).
+constexpr uint32_t L1_DESCALE_BUFFER_LEN_MAX = 6144;
+// Named literals used by the tiling arithmetic in this file.
+constexpr uint32_t CONST_3 = 3;
+constexpr uint32_t CONST_4 = 4;
+constexpr uint32_t CONST_16 = 16;
+constexpr uint32_t CONST_32 = 32;
+constexpr uint32_t CONST_256 = 256;
+constexpr uint32_t CONST_512 = 512;
+
+// Codes packed into the 3-bit dtype fields of the tiling key (see SetTilingKey).
+const std::map<TensorDType, uint32_t> G_DTYPE_MAP = {{TensorDType::TENSOR_DTYPE_FLOAT16, 1u},
+                                                     {TensorDType::TENSOR_DTYPE_BF16, 2u}};
+// Codes packed into the 1-bit format fields of the tiling key (see SetTilingKey).
+const std::map<TensorFormat, uint32_t> G_FORMAT_MAP = {{TensorFormat::TENSOR_FORMAT_ND, 0u},
+                                                       {TensorFormat::TENSOR_FORMAT_NZ, 1u}};
+// Short aliases for the nested MatMul enums.
+using MmType = MatMul::MatMulType;
+using QmType = MatMul::QuantMode;
+// Brings CeilDiv / RoundUp / RoundDown, PlatformInfo and the shared tiling constants into scope.
+using namespace host_utils;
+
+// True for int8 inputs whose output is BF16, or whose output is FLOAT16 with
+// per-token symmetric quantisation. Always false for non-int8 inputs.
+bool IsI8Bf16Kernel(const MatMulInfo &mmInfo)
+{
+    if (!mmInfo.isInt8) {
+        return false;
+    }
+    if (mmInfo.dtypeC == TensorDType::TENSOR_DTYPE_BF16) {
+        return true;
+    }
+    return mmInfo.dtypeC == TensorDType::TENSOR_DTYPE_FLOAT16 && mmInfo.quantMode == QmType::PER_TOKEN_SYMM;
+}
+
+// Fills the hardware description from the PlatformInfo singleton.
+// coreNum is taken from the AIC core count; the two bandwidth fields are fixed
+// relative weights rather than measured values.
+HardwareInfo::HardwareInfo()
+{
+    const PlatformInfo &info = PlatformInfo::Instance();
+
+    coreNum = info.coreNumAic;
+
+    // PlatformInfo stores sizes as 64-bit values; they are narrowed to the 32-bit fields here.
+    l2Size = info.l2Size;
+    l1Size = info.l1Size;
+    l0aSize = info.l0aSize;
+    l0bSize = info.l0bSize;
+    l0cSize = info.l0cSize;
+
+    hbmBandWidth = 1;
+    l2BandWidth = 5;  // 5x faster than hbm.
+}
+
+// Stores the problem dimensions (batch size, m, k, n) in opShape.
+// The base-tile fields m0 / k0 / n0 are left untouched.
+void PpMatmulTilingData::SetBaseShape(uint32_t batchSize, uint32_t m, uint32_t k, uint32_t n)
+{
+    opShape.batchSize = batchSize;
+    opShape.m = m;
+    opShape.k = k;
+    opShape.n = n;
+}
+
+// Records the base tile (mBase, nBase) and derives mLoop, nLoop, coreLoop and blockDim.
+//
+// Parameters:
+//   coreNum - number of cores available; blockDim is capped at this value.
+//   mBase   - base tile edge along m (stored as opShape.m0).
+//   nBase   - base tile edge along n (stored as opShape.n0).
+//   mmInfo  - provides transB, isInt8 and mmType for the single-m-tile adjustment below.
+//
+// NOTE(review): `coreLoop % coreNum` divides by coreNum, so coreNum must be non-zero.
+void PpMatmulTilingData::SetBaseOp(uint32_t coreNum, uint32_t mBase, uint32_t nBase, const MatMulInfo &mmInfo)
+{
+    opShape.m0 = mBase;
+    opShape.n0 = nBase;
+    mLoop = CeilDiv(opShape.m, opShape.m0);
+    nLoop = CeilDiv(opShape.n, opShape.n0);
+    coreLoop = opShape.batchSize * mLoop * nLoop;
+
+    // Single m tile with transposed B, where the last wave of tiles would occupy fewer
+    // than three quarters of the cores: try to re-split n so tiles spread over the cores.
+    if (mLoop == 1 && mmInfo.transB && coreLoop % coreNum < coreNum / CONST_4 * CONST_3) {
+        // Shrink m0 to m rounded up to 16.
+        mBase = RoundUp<uint32_t>(opShape.m, CONST_16);
+        opShape.m0 = mBase;
+        // Largest n0 whose fp32 output tile (mBase x n0) fits in l0cSize; capped at 256
+        // for int8 inputs and for the with-bias variant.
+        uint32_t maxN0 = PlatformInfo::Instance().l0cSize / (mBase * sizeof(float));
+        if (mmInfo.isInt8 || mmInfo.mmType == MmType::MATMUL_WITH_BIAS) {
+            maxN0 = maxN0 < CONST_256 ? maxN0 : CONST_256;
+        }
+        // x: columns per core; y: tiles needed per core at maxN0; the new nBase is x
+        // split into y parts and rounded up to 16.
+        uint32_t x = CeilDiv(opShape.n, coreNum);
+        uint32_t y = CeilDiv(x, maxN0);
+        nBase = RoundUp<uint32_t>(CeilDiv(x, y), CONST_16);
+        uint32_t rqdL0CSize = mBase * nBase * sizeof(float);
+        // Adopt the new n0 only if the output tile is smaller than l0cSize and the A/B tiles
+        // for a 256-deep K slice of 2-byte elements stay below the L1 A/B budget.
+        if (rqdL0CSize < PlatformInfo::Instance().l0cSize &&
+            (mBase + nBase) * CONST_256 * sizeof(uint16_t) < L1AB_PINGPONG_BUFFER_LEN) {
+            opShape.n0 = nBase;
+            nLoop = CeilDiv(opShape.n, opShape.n0);
+            // mLoop is 1 in this branch, so it is omitted from the product.
+            coreLoop = opShape.batchSize * nLoop;
+        }
+    }
+    blockDim = std::min(coreLoop, coreNum);
+}
+
+// Packs the kernel-selection key into `tilingKey`, most significant field first.
+//
+// Wide layout (MATMUL_ACCUM_ATOMIC, MATMUL_WITH_BIAS, MATMUL_EIN_SUM, MATMUL_DEQUANT, or
+// any case where IsI8Bf16Kernel() is true):
+//   SwizzleDir[1] TransA[1] TransB[1] DtypeA[3] DtypeB[3] DtypeC[3] FormatA[1] FormatB[1] FormatC[1] WithBias[1]
+// Compact layout (everything else):
+//   SwizzleDir TransA[1] TransB[1] IsInt8[1] WithBias[1] EnSplitK[1]
+//
+// The dtype / format lookups use std::map::at and therefore throw std::out_of_range for
+// values that are missing from G_DTYPE_MAP / G_FORMAT_MAP.
+void PpMatmulTilingData::SetTilingKey(const MatMulInfo &mmInfo, uint32_t swizzleDirect, uint32_t enSplitK)
+{
+    // Shifts the key left by `width` bits and adds `value` into the freed low bits.
+    auto append = [this](uint32_t width, uint32_t value) { tilingKey = (tilingKey << width) + value; };
+
+    const bool wideKey = mmInfo.mmType == MmType::MATMUL_ACCUM_ATOMIC ||
+                         mmInfo.mmType == MmType::MATMUL_WITH_BIAS || mmInfo.mmType == MmType::MATMUL_EIN_SUM ||
+                         mmInfo.mmType == MmType::MATMUL_DEQUANT || IsI8Bf16Kernel(mmInfo);
+
+    // Both layouts start with the swizzle direction and the two transpose flags.
+    tilingKey = swizzleDirect;
+    append(1, static_cast<uint32_t>(mmInfo.transA));
+    append(1, static_cast<uint32_t>(mmInfo.transB));
+
+    if (!wideKey) {
+        append(1, static_cast<uint32_t>(mmInfo.isInt8));
+        append(1, static_cast<uint32_t>(mmInfo.biasFlag));
+        append(1, enSplitK);
+        return;
+    }
+
+    append(3, G_DTYPE_MAP.at(mmInfo.dtypeA));  // 3bit for dtypeA.
+    append(3, G_DTYPE_MAP.at(mmInfo.dtypeB));  // 3bit for dtypeB.
+    append(3, G_DTYPE_MAP.at(mmInfo.dtypeC));  // 3bit for dtypeC.
+    append(1, G_FORMAT_MAP.at(mmInfo.formatA));
+    append(1, G_FORMAT_MAP.at(mmInfo.formatB));
+    append(1, G_FORMAT_MAP.at(mmInfo.formatC));
+    append(1, static_cast<uint32_t>(mmInfo.biasFlag));
+}
+
+// Finalises the tiling: derives the K tile size (opShape.k0) and kLoop from the L1 A/B
+// budget and the already chosen m0 / n0, and returns blockDim.
+//
+// k0 is the number of K elements for which one (m0 + n0)-wide slice of A and B fits in the
+// budget. The budget is L1AB_PINGPONG_BUFFER_LEN minus either the descale buffer (int8
+// inputs) or, for MATMUL_WITH_BIAS, n0 floats. k0 is then rounded down to kBlockSize when
+// it is below cubeBlockSize, to cubeBlockSize otherwise, and additionally to a multiple of
+// 512 once it exceeds 512.
+//
+// When the per-K footprint (shapeSum * inDtype) is not positive, k0Max falls back to
+// L1AB_PINGPONG_BUFFER_LEN on every path. Previously only the default path had this guard;
+// the MATMUL_WITH_BIAS path divided by zero and converted the resulting infinity to
+// uint32_t, which is undefined behaviour.
+uint32_t PpMatmulTilingData::End(const MatMulInfo &mmInfo)
+{
+    uint32_t cubeBlockSize = mmInfo.isInt8 ? CUBE_BLOCK_SIZE_INT8 : CUBE_BLOCK_SIZE;
+    uint32_t kBlockSize = mmInfo.isInt8 ? BLOCK_SIZE_INT8_K : BLOCK_SIZE;
+    uint32_t scaleBlockSize = mmInfo.isInt8 ? L1_DESCALE_BUFFER_LEN_MAX : 0;
+    uint32_t shapeSum = opShape.m0 + opShape.n0;
+    // For int8 inputs with transA set or transB clear, each edge is padded to a multiple of 32.
+    if (mmInfo.isInt8 && (mmInfo.transA || !mmInfo.transB)) {
+        shapeSum = RoundUp<uint32_t>(opShape.m0, CONST_32) + RoundUp<uint32_t>(opShape.n0, CONST_32);
+    }
+
+    // Bytes of L1 consumed per unit of K by the A and B tiles together.
+    const float bytesPerK = shapeSum * mmInfo.inDtype;
+    uint32_t k0Max = L1AB_PINGPONG_BUFFER_LEN;
+    if (bytesPerK > 0) {
+        uint32_t l1AbSize = L1AB_PINGPONG_BUFFER_LEN - scaleBlockSize;
+        if (mmInfo.mmType == MatMul::MatMulType::MATMUL_WITH_BIAS) {
+            // The bias variant reserves n0 floats instead of the descale buffer.
+            l1AbSize = L1AB_PINGPONG_BUFFER_LEN - opShape.n0 * sizeof(float);
+        }
+        k0Max = static_cast<uint32_t>(static_cast<float>(l1AbSize) / bytesPerK);
+    }
+
+    opShape.k0 =
+        k0Max < cubeBlockSize ? RoundDown<uint32_t>(k0Max, kBlockSize) : RoundDown<uint32_t>(k0Max, cubeBlockSize);
+    if (opShape.k0 > CONST_512) {
+        opShape.k0 = RoundDown<uint32_t>(opShape.k0, CONST_512);
+    }
+    kLoop = CeilDiv(opShape.k, opShape.k0);
+    return blockDim;
+}
+
+// Computes the complete tiling for one matmul problem.
+//
+// Steps: copy the problem shape into `tilingData`, run the base-tile search with the
+// larger of m / n as the primary axis, choose the swizzle, derive k0 / kLoop, and finally
+// build the tiling key with the chosen swizzle direction.
+//
+// Outputs: `tilingData` is filled in place and `blockDim` receives the value returned by
+// PpMatmulTilingData::End().
+void GetPpMatmulTiling(const MatMulInfo &mmInfo, const HardwareInfo &hwInfo, uint32_t &blockDim,
+                       PpMatmulTilingData &tilingData)
+{
+    OpShape problem;
+    problem.batchSize = mmInfo.batchSize;
+    problem.m = mmInfo.m;
+    problem.n = mmInfo.n;
+    problem.k = mmInfo.k;
+
+    tilingData.opShape = problem;
+    tilingData.quantMode = static_cast<uint32_t>(mmInfo.quantMode);
+    tilingData.SetTilingKey(mmInfo, 0, 0);  // init tilingkey with transA transB.
+
+    // m is the primary axis unless it is strictly smaller than n.
+    const bool mIsPrimary = !(problem.m < problem.n);
+    if (mIsPrimary) {
+        TilingFunc<true, OpShape, PpMatmulTilingData, HardwareInfo, MatMulInfo>(problem, tilingData, hwInfo, mmInfo);
+    } else {
+        TilingFunc<false, OpShape, PpMatmulTilingData, HardwareInfo, MatMulInfo>(problem, tilingData, hwInfo, mmInfo);
+    }
+
+    const uint32_t swizzleDirection = Swizzl<PpMatmulTilingData>(tilingData);
+    blockDim = tilingData.End(mmInfo);
+    tilingData.SetTilingKey(mmInfo, swizzleDirection, 0);
+}
+}  // namespace pp_matmul
--- a/csrc/batch_matmul_transpose/op_host/tiling/tiling_data.h
+++ b/csrc/batch_matmul_transpose/op_host/tiling/tiling_data.h
@@ -0,0 +1,90 @@
+#ifndef PP_MATMUL_TILING_DATA
+#define PP_MATMUL_TILING_DATA
+#include <cstdint>
+
+namespace pp_matmul {
+// Scope holder for the matmul variant and quantisation-mode enumerations.
+struct MatMul {
+    // Which matmul variant is being tiled.
+    enum class MatMulType : uint32_t {
+        MATMUL_DEFAULT = 0,       // C = op(A) * op(B)
+        MATMUL_DEQUANT = 1,       //
+        MATMUL_ACCUM_ATOMIC = 2,  // C += op(A) * op(B)
+        MATMUL_WITH_BIAS = 3,     // C = op(A) * op(B) + Bias, where Bias is a vector.
+        MATMUL_EIN_SUM = 4
+    };
+    // Quantisation scheme.
+    enum class QuantMode : uint32_t {
+        PER_CHANNEL_SYMM = 0,
+        PER_CHANNEL_ASYMM = 1,
+        PER_TOKEN_SYMM = 2
+    };
+};
+
+// Element types the tiling key can encode.
+enum class TensorDType : uint32_t {
+    TENSOR_DTYPE_FLOAT16 = 0,
+    TENSOR_DTYPE_BF16 = 1
+};
+
+// Tensor layouts the tiling key can encode.
+enum class TensorFormat : uint32_t {
+    TENSOR_FORMAT_ND = 0,
+    TENSOR_FORMAT_NZ = 1
+};
+
+// Description of one matmul problem as consumed by the host-side tiling code.
+// All fields are value-initialised, so a default-constructed object describes an
+// fp16, ND-format, non-transposed, bias-free problem of size zero.
+struct MatMulInfo {
+    // Problem size.
+    uint32_t batchSize{0};
+    uint32_t m{0};  // actual input m
+    uint32_t k{0};  // actual input k
+    uint32_t n{0};  // actual input n
+
+    // Element types of A, B and C.
+    TensorDType dtypeA{TensorDType::TENSOR_DTYPE_FLOAT16};
+    TensorDType dtypeB{TensorDType::TENSOR_DTYPE_FLOAT16};
+    TensorDType dtypeC{TensorDType::TENSOR_DTYPE_FLOAT16};
+
+    // Layouts of A, B and C.
+    TensorFormat formatA{TensorFormat::TENSOR_FORMAT_ND};
+    TensorFormat formatB{TensorFormat::TENSOR_FORMAT_ND};
+    TensorFormat formatC{TensorFormat::TENSOR_FORMAT_ND};
+
+    MatMul::MatMulType mmType{MatMul::MatMulType::MATMUL_DEFAULT};
+
+    // Flags.
+    bool transA{false};    // false: 0, true: 1
+    bool transB{false};    // false: 0, true: 1
+    bool biasFlag{false};  // false: 0, true: 1
+    bool isInt8{false};    // false: 0, true: 1
+
+    // The tiling code uses inDtype as a per-element size factor (it divides and multiplies
+    // byte budgets by it), so it must be set to a positive value before tiling.
+    float inDtype{0.0f};
+    float outDtype{0.0f};
+
+    MatMul::QuantMode quantMode{MatMul::QuantMode::PER_CHANNEL_SYMM};
+};
+
+// Problem dimensions (batchSize, m, k, n) together with the chosen base tile (m0, k0, n0).
+// Every field defaults to 0.
+struct OpShape {
+    uint32_t batchSize = 0;
+    uint32_t m = 0;
+    uint32_t k = 0;
+    uint32_t n = 0;
+    uint32_t m0 = 0;
+    uint32_t k0 = 0;
+    uint32_t n0 = 0;
+};
+
+// Hardware parameters consumed by the tiling cost model.
+// The constructor (defined in tiling_data.cpp) fills the fields from the platform query.
+struct HardwareInfo {
+    uint32_t coreNum = 0;
+    // Memory sizes.
+    uint32_t l2Size = 0;
+    uint32_t l1Size = 0;
+    uint32_t l0aSize = 0;
+    uint32_t l0bSize = 0;
+    uint32_t l0cSize = 0;
+    // Relative bandwidth weights; only their ratio is used by the cost model.
+    uint32_t hbmBandWidth = 0;
+    uint32_t l2BandWidth = 0;
+
+    HardwareInfo();
+};
+
+// Tiling result handed to the kernel. The struct is packed to 1-byte alignment and the
+// kernel reinterprets a raw byte buffer as this type, so the field order and types form a
+// binary contract: do not reorder, insert or resize fields without updating the kernel.
+#pragma pack(push, 1)
+struct PpMatmulTilingData {
+    OpShape opShape{};        // problem dimensions and chosen base tile
+    uint32_t mLoop{1};        // number of tiles along m
+    uint32_t kLoop{1};        // number of tiles along k
+    uint32_t nLoop{1};        // number of tiles along n
+    uint32_t coreLoop{1};     // total number of output tiles across all batches
+    uint32_t swizzlCount{1};  // swizzle count chosen by Swizzl()
+    uint32_t tilingKey{0};    // packed kernel-selection key, see SetTilingKey()
+    uint32_t blockDim{1};     // min(coreLoop, coreNum), see SetBaseOp()
+    uint32_t swizzlDirect{0}; // swizzle direction chosen by Swizzl(): 0 = Zn, 1 = Nz
+    uint32_t splitk{0};
+    uint32_t enShuffleK{0};
+    uint32_t quantMode{0};    // MatMul::QuantMode stored as its underlying integer
+
+    // Stores batchSize / m / k / n in opShape.
+    void SetBaseShape(uint32_t batchSize, uint32_t m, uint32_t k, uint32_t n);
+    // Stores the base tile and derives mLoop / nLoop / coreLoop / blockDim.
+    void SetBaseOp(uint32_t coreNum, uint32_t mBase, uint32_t nBase, const MatMulInfo &mmInfo);
+    // Packs the kernel-selection key into tilingKey.
+    void SetTilingKey(const MatMulInfo &mmInfo, uint32_t swizzleDirect, uint32_t enSplitK);
+    // Derives opShape.k0 and kLoop; returns blockDim.
+    uint32_t End(const MatMulInfo &mmInfo);
+};
+#pragma pack(pop)
+
+// Computes the tiling for the problem described by `mmInfo` on the hardware described by
+// `hwInfo`. Fills `tilingData` in place and writes the number of blocks to launch to `blockDim`.
+void GetPpMatmulTiling(const MatMulInfo &mmInfo, const HardwareInfo &hwInfo, uint32_t &blockDim,
+                       PpMatmulTilingData &tilingData);
+}  // namespace pp_matmul
+#endif
--- a/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp
+++ b/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp
@@ -0,0 +1,825 @@
+// Adapted from
+//   https://gitee.com/ascend/ascend-transformer-boost
+//
+// Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+// This file is a part of the CANN Open Software.
+// Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+//
+
+#define __aicore__ [aicore]
+#include "kernel_operator.h"
+#include "../op_host/tiling/tiling_data.h"
+#include "../../mla_preprocess/op_kernel/kernel/common.h"
+#include "../../mla_preprocess/op_kernel/kernel/hardware.h"
+#include "../../mla_preprocess/op_kernel/kernel/mma.h"
+#include "../../mla_preprocess/op_kernel/kernel/utils.h"
+#include "../../mla_preprocess/op_kernel/kernel/iterator.h"
+#include "../../kernels/math_utils.h"
+
+// Offsets between the ping and pong halves of the L0 and L1 buffers. The kernel uses them
+// as indices into LocalTensor<InDtype>.
+// NOTE(review): the unit is presumably elements rather than bytes -- confirm.
+constexpr uint32_t L0_PINGPONG_BUFFER_LEN = 16384;
+constexpr uint32_t L1_PINGPONG_BUFFER_LEN = 131072;
+// Named literals in 32-bit and 64-bit flavours, used for rounding and alignment.
+constexpr uint32_t CONST_16 = 16;
+constexpr uint32_t CONST_256 = 256;
+constexpr uint64_t ND2NZ_STRIDE_LIMIT = 65536;
+constexpr uint64_t BLOCK_SIZE_16 = 16;
+constexpr uint64_t CONST_16UL = 16;
+constexpr uint64_t CONST_256UL = 256;
+
+// A triple of (m, k, n) values; the kernel uses it both for tile counts and for tile indices.
+// All components default to 0.
+struct MatCoord {
+    uint64_t m = 0;
+    uint64_t k = 0;
+    uint64_t n = 0;
+};
+
+using namespace device_utils;
+
+template <uint32_t SwizzleDirect, bool TA, bool TB, typename InDtype = half, typename OutDtype = half,
+          DataFormat FormatB = DataFormat::ND>
+class PpMatmulEinSum
+{
+    // Shorthand for the data-movement and compute helpers, all instantiated for
+    // ArchType::ASCEND_V220 and the class's InDtype / OutDtype.
+    // NOTE(review): the per-alias notes below are inferred from the helper names
+    // (gm_to_l1, l1_to_l0_a, ...); confirm against the helper definitions.
+    using LocalTensor = AscendC::LocalTensor<InDtype>;
+    // Global memory -> L1 copy, parameterised on source and destination data format.
+    template <DataFormat srcFormat = DataFormat::ND, DataFormat dstFormat = DataFormat::ND>
+    using CopyGmToCbuf = gm_to_l1<ArchType::ASCEND_V220, InDtype, srcFormat, dstFormat>;
+    // L1 -> L0A / L0B loads; the transpose flags TA / TB are forwarded.
+    using LoadCbufToCa = l1_to_l0_a<ArchType::ASCEND_V220, InDtype, TA, DataFormat::ZN, DataFormat::ZZ>;
+    using LoadCbufToCb = l1_to_l0_b<ArchType::ASCEND_V220, InDtype, TB, DataFormat::ZN, DataFormat::NZ>;
+    // Matrix multiply-accumulate with a float accumulator type.
+    using Mad = mmad<ArchType::ASCEND_V220, InDtype, InDtype, float, TA>;
+    // L0C -> global memory copy producing ND output of OutDtype from float.
+    using CopyCcToGm = l0c_to_gm<ArchType::ASCEND_V220, DataFormat::ND, OutDtype, float>;
+
+public:
+    // Default constructor; does nothing. All state is set up by Init().
+    __aicore__ explicit PpMatmulEinSum(){};
+
+    // Binds the global-memory operands and loads the tiling parameters.
+    //
+    // Parameters:
+    //   a, b        - global-memory input matrices, reinterpreted as InDtype.
+    //   c           - global-memory output matrix, reinterpreted as OutDtype.
+    //   tiling_data - global-memory buffer reinterpreted as pp_matmul::PpMatmulTilingData.
+    __aicore__ __force_inline__ void Init(__gm__ uint8_t *__restrict__ a, __gm__ uint8_t *__restrict__ b,
+                                          __gm__ uint8_t *__restrict__ c, __gm__ uint8_t *__restrict__ tiling_data)
+    {
+        gm_a.SetGlobalBuffer(reinterpret_cast<__gm__ InDtype *>(a));
+        gm_b.SetGlobalBuffer(reinterpret_cast<__gm__ InDtype *>(b));
+        gm_c.SetGlobalBuffer(reinterpret_cast<__gm__ OutDtype *>(c));
+        auto gm_tiling_data = reinterpret_cast<__gm__ pp_matmul::PpMatmulTilingData *>(tiling_data);
+
+        // Problem shape, base tile and loop counts as computed by the host tiling code.
+        batch_size = gm_tiling_data->opShape.batchSize;
+        m = gm_tiling_data->opShape.m;
+        k = gm_tiling_data->opShape.k;
+        n = gm_tiling_data->opShape.n;
+        m0 = gm_tiling_data->opShape.m0;
+        k0 = gm_tiling_data->opShape.k0;
+        n0 = gm_tiling_data->opShape.n0;
+        tdim.m = gm_tiling_data->mLoop;
+        tdim.k = gm_tiling_data->kLoop;
+        tdim.n = gm_tiling_data->nLoop;
+        core_loop = gm_tiling_data->coreLoop;
+        swizzle_cnt = gm_tiling_data->swizzlCount;
+        en_shuffle_k = gm_tiling_data->enShuffleK;
+
+        AsdopsBuffer<ArchType::ASCEND_V220> buf;
+        // The A tile buffer starts at offset 0; the B tile buffer starts right after one
+        // m0 x k0 tile of InDtype, with that size rounded up to a multiple of 256.
+        // NOTE(review): the offset argument of GetBuffer is presumably in bytes -- confirm.
+        l1_base_a = buf.template GetBuffer<BufferType::ASCEND_CB, InDtype>(0);
+        l1_base_b = buf.template GetBuffer<BufferType::ASCEND_CB, InDtype>(
+            RoundUp<uint64_t>(m0 * k0 * sizeof(InDtype), CONST_256UL));
+        l0a_base = buf.template GetBuffer<BufferType::ASCEND_L0A, InDtype>(0);
+        l0b_base = buf.template GetBuffer<BufferType::ASCEND_L0B, InDtype>(0);
+        num_core = AscendC::GetBlockNum();
+        core_idx = AscendC::GetBlockIdx();
+        // Start on the "ping" half of the double buffers.
+        ping_flag = 1;
+    }
+
+    // Maps a linear tile index to the (m, n) tile coordinate inside its batch.
+    //
+    // `index` is first reduced modulo tdim.m * tdim.n. The tile grid is then walked in bands
+    // of `swizzle_cnt` rows (SwizzleDirect == 0, "Zn") or `swizzle_cnt` columns
+    // (SwizzleDirect == 1, "Nz"); the last band may be narrower. In every odd-numbered band
+    // the traversal along the other axis is reversed. Only tidx.m and tidx.n are written;
+    // tidx.k is left unchanged.
+    __aicore__ __force_inline__ void GetBlockIdx(uint64_t index, MatCoord &tidx)
+    {
+        uint64_t in_batch_idx = index % (tdim.m * tdim.n);
+        if constexpr (SwizzleDirect == 0) {  // Zn
+            // Bands are groups of swizzle_cnt tile rows spanning all tdim.n columns.
+            uint64_t tile_block_loop = (tdim.m + swizzle_cnt - 1) / swizzle_cnt;
+            uint64_t tile_block_idx = in_batch_idx / (swizzle_cnt * tdim.n);
+            uint64_t in_tile_block_idx = in_batch_idx % (swizzle_cnt * tdim.n);
+
+            // Number of rows in this band; the last band takes whatever rows remain.
+            uint64_t n_row = swizzle_cnt;
+            if (tile_block_idx == tile_block_loop - 1) {
+                n_row = tdim.m - swizzle_cnt * tile_block_idx;
+            }
+            tidx.m = tile_block_idx * swizzle_cnt + in_tile_block_idx % n_row;
+            tidx.n = in_tile_block_idx / n_row;
+            // Odd bands run through the columns in reverse order.
+            if (tile_block_idx % 2 != 0) {
+                tidx.n = tdim.n - tidx.n - 1;
+            }
+        } else if constexpr (SwizzleDirect == 1) {  // Nz
+            // Bands are groups of swizzle_cnt tile columns spanning all tdim.m rows.
+            uint64_t tile_block_loop = (tdim.n + swizzle_cnt - 1) / swizzle_cnt;
+            uint64_t tile_block_idx = in_batch_idx / (swizzle_cnt * tdim.m);
+            uint64_t in_tile_block_idx = in_batch_idx % (swizzle_cnt * tdim.m);
+
+            // Number of columns in this band; the last band takes whatever columns remain.
+            uint64_t n_col = swizzle_cnt;
+            if (tile_block_idx == tile_block_loop - 1) {
+                n_col = tdim.n - swizzle_cnt * tile_block_idx;
+            }
+            tidx.m = in_tile_block_idx / n_col;
+            tidx.n = tile_block_idx * swizzle_cnt + in_tile_block_idx % n_col;
+            // Odd bands run through the rows in reverse order.
+            if (tile_block_idx % 2 != 0) {
+                tidx.m = tdim.m - tidx.m - 1;
+            }
+        }
+    }
+
+    __aicore__ __force_inline__ void Process()
+    {
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID3);
+        set_flag(PIPE_FIX, PIPE_M, EVENT_ID0);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+
+        for (uint64_t loop_idx = core_idx; loop_idx < core_loop; loop_idx += num_core) {
+            uint64_t batch_idx = loop_idx / tdim.n / tdim.m;
+            MatCoord tidx{0};
+            GetBlockIdx(loop_idx, tidx);
+            uint64_t offset_a = 0, offset_b = 0, offset_a_next = 0, offset_b_next = 0;
+            uint64_t offset_c = tidx.m * m0 * batch_size * n + batch_idx * n + tidx.n * n0;
+            uint64_t m_actual = (tidx.m == (tdim.m - 1)) ? (m - tidx.m * m0) : m0;
+            uint64_t n_actual = (tidx.n == (tdim.n - 1)) ? (n - tidx.n * n0) : n0;
+            uint64_t m_round = RoundUp<uint64_t, CONST_16UL>(m_actual);
+            uint64_t n_round = RoundUp<uint64_t, CONST_16UL>(n_actual);
+            uint64_t mn_max = m_round > n_round ? m_round : n_round;
+            uint64_t k_part_len = L0_PINGPONG_BUFFER_LEN / mn_max / CONST_16 * CONST_16;
+            uint64_t shuffle_k = en_shuffle_k ? (core_idx % tdim.k) : 0;
+            if (TA) {
+                offset_a = shuffle_k * k0 * m * batch_size + batch_idx * m + tidx.m * m0;
+            } else {
+                offset_a = tidx.m * m0 * batch_size * k + batch_idx * k + shuffle_k * k0;
+            }
+
+            if (TB) {
+                if constexpr (FormatB != DataFormat::NZ) {
+                    offset_b = batch_idx * k * n + tidx.n * n0 * k + shuffle_k * k0;
+                } else {
+                    offset_b = batch_idx * RoundUp<uint64_t, CONST_16UL>(k) * RoundUp<uint64_t, CONST_16UL>(n) +
+                               tidx.n * n0 * BLOCK_SIZE_16 + shuffle_k * k0 * RoundUp<uint64_t, CONST_16UL>(n);
+                }
+            } else {
+                if constexpr (FormatB != DataFormat::NZ) {
+                    offset_b = batch_idx * k * n + shuffle_k * k0 * n + tidx.n * n0;
+                } else {
+                    offset_b = batch_idx * RoundUp<uint64_t, CONST_16UL>(k) * RoundUp<uint64_t, CONST_16UL>(n) +
+                               shuffle_k * k0 * BLOCK_SIZE_16 + tidx.n * n0 * RoundUp<uint64_t, CONST_16UL>(k);
+                }
+            }
+
+            uint64_t k_actual = (shuffle_k == tdim.k - 1) ? k - shuffle_k * k0 : k0;
+            uint64_t k_round = (k_actual + CONST_16 - 1) / CONST_16 * CONST_16;
+
+            LocalTensor l1_buf_a = ping_flag ? l1_base_a : l1_base_a[L1_PINGPONG_BUFFER_LEN];
+            LocalTensor l1_buf_b = ping_flag ? l1_base_b : l1_base_b[L1_PINGPONG_BUFFER_LEN];
+            LocalTensor l0a_buf = ping_flag ? l0a_base : l0a_base[L0_PINGPONG_BUFFER_LEN];
+            LocalTensor l0b_buf = ping_flag ? l0b_base : l0b_base[L0_PINGPONG_BUFFER_LEN];
+            event_t event_id = ping_flag ? EVENT_ID0 : EVENT_ID1;
+
+            if (loop_idx == core_idx) {
+                WAIT_FLAG(MTE1, MTE2, event_id);
+                // *** load matrix A to L1
+                if ((m == 1) || (m_actual == 1 && !TA)) {
+                    CopyGmToCbuf<DataFormat::ND, DataFormat::ND>(l1_buf_a,        // dst
+                                                                 gm_a[offset_a],  // src
+                                                                 1,               // nTileActual
+                                                                 16,              // nTileCeil
+                                                                 1,               // nVal
+                                                                 k_actual,        // kTileActual
+                                                                 k_round,         // kTileCeil
+                                                                 k);              // dVal
+                } else {
+                    if (TA) {
+                        CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_a,         // dst
+                                                                     gm_a[offset_a],   // src
+                                                                     k_actual,         // nTileActual
+                                                                     k_round,          // nTileCeil
+                                                                     k,                // nVal
+                                                                     m_actual,         // dTileActual
+                                                                     m_round,          // dTileCeil
+                                                                     m * batch_size);  // dVal
+                    } else {
+                        CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_a,         // dst
+                                                                     gm_a[offset_a],   // src
+                                                                     m_actual,         // nTileActual
+                                                                     m_round,          // nTileCeil
+                                                                     m,                // nVal
+                                                                     k_actual,         // dTileActual
+                                                                     k_round,          // dTileCeil
+                                                                     k * batch_size);  // dVal
+                    }
+                }
+                SET_FLAG(MTE2, MTE1, event_id);
+                // *** load matrix B to L1
+                wait_flag(PIPE_MTE1, PIPE_MTE2, event_id + 2);
+                if constexpr (FormatB != DataFormat::NZ) {
+                    if (TB) {
+                        CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_b,        // dst
+                                                                     gm_b[offset_b],  // src
+                                                                     n_actual,        // nTileActual
+                                                                     n_round,         // nTileCeil
+                                                                     n,               // nVal
+                                                                     k_actual,        // dTileActual
+                                                                     k_round,         // dTileCeil
+                                                                     k);              // dVal
+                    } else {
+                        CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_b,        // dst
+                                                                     gm_b[offset_b],  // src
+                                                                     k_actual,        // nTileActual
+                                                                     k_round,         // nTileCeil
+                                                                     k,               // nVal
+                                                                     n_actual,        // dTileActual
+                                                                     n_round,         // dTileCeil
+                                                                     n);              // dVal
+                    }
+                } else {
+                    if (TB) {
+                        CopyGmToCbuf<DataFormat::NZ, DataFormat::NZ>(l1_buf_b,                           // dst
+                                                                     gm_b[offset_b],                     // src
+                                                                     n_actual,                           // nTileActual
+                                                                     n_round,                            // nTileCeil
+                                                                     RoundUp<uint64_t, CONST_16UL>(n),   // nVal
+                                                                     k_actual,                           // dTileActual
+                                                                     k_round,                            // dTileCeil
+                                                                     RoundUp<uint64_t, CONST_16UL>(k));  // dVal
+                    } else {
+                        CopyGmToCbuf<DataFormat::NZ, DataFormat::NZ>(l1_buf_b,                           // dst
+                                                                     gm_b[offset_b],                     // src
+                                                                     k_actual,                           // nTileActual
+                                                                     k_round,                            // nTileCeil
+                                                                     RoundUp<uint64_t, CONST_16UL>(k),   // nVal
+                                                                     n_actual,                           // dTileActual
+                                                                     n_round,                            // dTileCeil
+                                                                     RoundUp<uint64_t, CONST_16UL>(n));  // dVal
+                    }
+                }
+                SET_FLAG(MTE2, MTE1, event_id + 2);
+            }
+
+            for (tidx.k = 0; tidx.k < tdim.k; ++tidx.k) {
+                shuffle_k = en_shuffle_k ? (tidx.k + core_idx) % tdim.k : tidx.k;
+                uint64_t k_actual = (shuffle_k == (tdim.k - 1)) ? (k - shuffle_k * k0) : k0;
+                uint64_t k_round = (k_actual + CONST_16 - 1) / CONST_16 * CONST_16;
+                fdim.k = (k_actual + k_part_len - 1) / k_part_len;
+
+                LocalTensor l1_buf_a = ping_flag ? l1_base_a : l1_base_a[L1_PINGPONG_BUFFER_LEN];
+                LocalTensor l1_buf_b = ping_flag ? l1_base_b : l1_base_b[L1_PINGPONG_BUFFER_LEN];
+                auto event_id = ping_flag ? EVENT_ID0 : EVENT_ID1;
+
+                if (tidx.k < tdim.k - 1) {
+                    uint64_t shuffle_k_next = en_shuffle_k ? (core_idx + tidx.k + 1) % tdim.k : (tidx.k + 1);
+                    if (TA) {
+                        offset_a_next = shuffle_k_next * k0 * m * batch_size + batch_idx * m + tidx.m * m0;
+                    } else {
+                        offset_a_next = tidx.m * m0 * batch_size * k + batch_idx * k + shuffle_k_next * k0;
+                    }
+
+                    if (TB) {
+                        if constexpr (FormatB != DataFormat::NZ) {
+                            offset_b_next = batch_idx * k * n + tidx.n * n0 * k + shuffle_k_next * k0;
+                        } else {
+                            offset_b_next =
+                                batch_idx * RoundUp<uint64_t, CONST_16UL>(k) * RoundUp<uint64_t, CONST_16UL>(n) +
+                                tidx.n * n0 * BLOCK_SIZE_16 + shuffle_k_next * k0 * RoundUp<uint64_t, CONST_16UL>(n);
+                        }
+                    } else {
+                        if constexpr (FormatB != DataFormat::NZ) {
+                            offset_b_next = batch_idx * k * n + shuffle_k_next * k0 * n + tidx.n * n0;
+                        } else {
+                            offset_b_next =
+                                batch_idx * RoundUp<uint64_t, CONST_16UL>(k) * RoundUp<uint64_t, CONST_16UL>(n) +
+                                shuffle_k_next * k0 * BLOCK_SIZE_16 + tidx.n * n0 * RoundUp<uint64_t, CONST_16UL>(k);
+                        }
+                    }
+
+                    uint64_t k_actual_next = (shuffle_k_next == (tdim.k - 1)) ? (k - shuffle_k_next * k0) : k0;
+                    uint64_t k_round_next = (k_actual_next + CONST_16 - 1) / CONST_16 * CONST_16;
+
+                    LocalTensor l1_buf_a_next = (1 - ping_flag) ? l1_base_a : l1_base_a[L1_PINGPONG_BUFFER_LEN];
+                    LocalTensor l1_buf_b_next = (1 - ping_flag) ? l1_base_b : l1_base_b[L1_PINGPONG_BUFFER_LEN];
+                    event_t event_id_next = (1 - ping_flag) ? EVENT_ID0 : EVENT_ID1;
+
+                    WAIT_FLAG(MTE1, MTE2, event_id_next);
+                    // *** load matrix A to L1
+                    if ((m == 1) || (m_actual == 1 && !TA)) {
+                        CopyGmToCbuf<DataFormat::ND, DataFormat::ND>(l1_buf_a_next,        // dst
+                                                                     gm_a[offset_a_next],  // src
+                                                                     m_actual,             // nTileActual
+                                                                     m_round,              // nTileCeil
+                                                                     m,                    // nVal
+                                                                     k_actual_next,        // kTileActual
+                                                                     k_round_next,         // kTileCeil
+                                                                     k);                   // dVal
+                    } else {
+                        if (TA) {
+                            CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_a_next,        // dst
+                                                                         gm_a[offset_a_next],  // src
+                                                                         k_actual_next,        // nTileActual
+                                                                         k_round_next,         // nTileCeil
+                                                                         k,                    // nVal
+                                                                         m_actual,             // dTileActual
+                                                                         m_round,              // dTileCeil
+                                                                         m * batch_size);      // dVal
+                        } else {
+                            CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_a_next,        // dst
+                                                                         gm_a[offset_a_next],  // src
+                                                                         m_actual,             // nTileActual
+                                                                         m_round,              // nTileCeil
+                                                                         m,                    // nVal
+                                                                         k_actual_next,        // dTileActual
+                                                                         k_round_next,         // dTileCeil
+                                                                         k * batch_size);      // dVal
+                        }
+                    }
+                    SET_FLAG(MTE2, MTE1, event_id_next);
+
+                    // *** load matrix B to L1
+                    wait_flag(PIPE_MTE1, PIPE_MTE2, event_id_next + 2);
+                    if constexpr (FormatB != DataFormat::NZ) {
+                        if (TB) {
+                            CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_b_next,        // dst
+                                                                         gm_b[offset_b_next],  // src
+                                                                         n_actual,             // nTileActual
+                                                                         n_round,              // nTileCeil
+                                                                         n,                    // nVal
+                                                                         k_actual_next,        // dTileActual
+                                                                         k_round_next,         // dTileCeil
+                                                                         k);                   // dVal
+                        } else {
+                            CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_b_next,        // dst
+                                                                         gm_b[offset_b_next],  // src
+                                                                         k_actual_next,        // nTileActual
+                                                                         k_round_next,         // nTileCeil
+                                                                         k,                    // nVal
+                                                                         n_actual,             // dTileActual
+                                                                         n_round,              // dTileCeil
+                                                                         n);                   // dVal
+                        }
+                    } else {
+                        if (TB) {
+                            CopyGmToCbuf<DataFormat::NZ, DataFormat::NZ>(l1_buf_b_next,        // dst
+                                                                         gm_b[offset_b_next],  // src
+                                                                         n_actual,             // nTileActual
+                                                                         n_round,              // nTileCeil
+                                                                         RoundUp<uint64_t, CONST_16UL>(n),  // nVal
+                                                                         k_actual_next,  // dTileActual
+                                                                         k_round_next,   // dTileCeil
+                                                                         RoundUp<uint64_t, CONST_16UL>(k));  // dVal
+                        } else {
+                            CopyGmToCbuf<DataFormat::NZ, DataFormat::NZ>(l1_buf_b_next,        // dst
+                                                                         gm_b[offset_b_next],  // src
+                                                                         k_actual_next,        // nTileActual
+                                                                         k_round_next,         // nTileCeil
+                                                                         RoundUp<uint64_t, CONST_16UL>(k),  // nVal
+                                                                         n_actual,  // dTileActual
+                                                                         n_round,   // dTileCeil
+                                                                         RoundUp<uint64_t, CONST_16UL>(n));  // dVal
+                        }
+                    }
+                    SET_FLAG(MTE2, MTE1, event_id_next + 2);
+                }
+
+                if (tidx.k == tdim.k - 1 && loop_idx + num_core < core_loop) {
+                    uint64_t b_idx_next = (loop_idx + num_core) / tdim.n / tdim.m;
+                    MatCoord tidx{0};
+                    GetBlockIdx(loop_idx + num_core, tidx);
+                    uint64_t shuffle_k_next = en_shuffle_k ? (core_idx % tdim.k) : 0;
+                    uint64_t m_actual_next = (tidx.m == (tdim.m - 1)) ? (m - tidx.m * m0) : m0;
+                    uint64_t n_actual_next = (tidx.n == (tdim.n - 1)) ? (n - tidx.n * n0) : n0;
+                    uint64_t m_round_next = (m_actual_next + CONST_16 - 1) / CONST_16 * CONST_16;
+                    uint64_t n_round_next = (n_actual_next + CONST_16 - 1) / CONST_16 * CONST_16;
+                    uint64_t k_actual_next = (shuffle_k_next == (tdim.k - 1)) ? (k - shuffle_k_next * k0) : k0;
+                    uint64_t k_round_next = (k_actual_next + CONST_16 - 1) / CONST_16 * CONST_16;
+                    if (TA) {
+                        offset_a_next = shuffle_k_next * k0 * m * batch_size + b_idx_next * m + tidx.m * m0;
+                    } else {
+                        offset_a_next = tidx.m * m0 * batch_size * k + b_idx_next * k + shuffle_k_next * k0;
+                    }
+
+                    if (TB) {
+                        if constexpr (FormatB != DataFormat::NZ) {
+                            offset_b_next = b_idx_next * k * n + tidx.n * n0 * k + shuffle_k_next * k0;
+                        } else {
+                            offset_b_next =
+                                b_idx_next * RoundUp<uint64_t, CONST_16UL>(k) * RoundUp<uint64_t, CONST_16UL>(n) +
+                                tidx.n * n0 * BLOCK_SIZE_16 + shuffle_k_next * k0 * RoundUp<uint64_t, CONST_16UL>(n);
+                        }
+                    } else {
+                        if constexpr (FormatB != DataFormat::NZ) {
+                            offset_b_next = b_idx_next * k * n + shuffle_k_next * k0 * n + tidx.n * n0;
+                        } else {
+                            offset_b_next =
+                                b_idx_next * RoundUp<uint64_t, CONST_16UL>(k) * RoundUp<uint64_t, CONST_16UL>(n) +
+                                shuffle_k_next * k0 * BLOCK_SIZE_16 + tidx.n * n0 * RoundUp<uint64_t, CONST_16UL>(k);
+                        }
+                    }
+
+                    LocalTensor l1_buf_a_next = (1 - ping_flag) ? l1_base_a : l1_base_a[L1_PINGPONG_BUFFER_LEN];
+                    LocalTensor l1_buf_b_next = (1 - ping_flag) ? l1_base_b : l1_base_b[L1_PINGPONG_BUFFER_LEN];
+                    event_t event_id_next = (1 - ping_flag) ? EVENT_ID0 : EVENT_ID1;
+
+                    WAIT_FLAG(MTE1, MTE2, event_id_next);
+                    // *** load matrix A to L1
+                    if (m == 1 || m_actual_next == 1 && !TA) {
+                        CopyGmToCbuf<DataFormat::ND, DataFormat::ND>(l1_buf_a_next,        // dst
+                                                                     gm_a[offset_a_next],  // src
+                                                                     m_actual_next,        // nTileActual
+                                                                     m_round_next,         // nTileCeil
+                                                                     m,                    // nVal
+                                                                     k_actual_next,        // kTileActual
+                                                                     k_round_next,         // kTileCeil
+                                                                     k);                   // dVal
+                    } else {
+                        if (TA) {
+                            CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_a_next,        // dst
+                                                                         gm_a[offset_a_next],  // src
+                                                                         k_actual_next,        // nTileActual
+                                                                         k_round_next,         // nTileCeil
+                                                                         k,                    // nVal
+                                                                         m_actual_next,        // dTileActual
+                                                                         m_round_next,         // dTileCeil
+                                                                         m * batch_size);      // dVal
+                        } else {
+                            CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_a_next,        // dst
+                                                                         gm_a[offset_a_next],  // src
+                                                                         m_actual_next,        // nTileActual
+                                                                         m_round_next,         // nTileCeil
+                                                                         m,                    // nVal
+                                                                         k_actual_next,        // dTileActual
+                                                                         k_round_next,         // dTileCeil
+                                                                         k * batch_size);      // dVal
+                        }
+                    }
+                    SET_FLAG(MTE2, MTE1, event_id_next);
+
+                    // *** load matrix B to L1
+                    wait_flag(PIPE_MTE1, PIPE_MTE2, event_id_next + 2);
+                    if constexpr (FormatB != DataFormat::NZ) {
+                        if (TB) {
+                            CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_b_next,        // dst
+                                                                         gm_b[offset_b_next],  // src
+                                                                         n_actual_next,        // nTileActual
+                                                                         n_round_next,         // nTileCeil
+                                                                         n,                    // nVal
+                                                                         k_actual_next,        // dTileActual
+                                                                         k_round_next,         // dTileCeil
+                                                                         k);                   // dVal
+                        } else {
+                            CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_b_next,        // dst
+                                                                         gm_b[offset_b_next],  // src
+                                                                         k_actual_next,        // nTileActual
+                                                                         k_round_next,         // nTileCeil
+                                                                         k,                    // nVal
+                                                                         n_actual_next,        // dTileActual
+                                                                         n_round_next,         // dTileCeil
+                                                                         n);                   // dVal
+                        }
+                    } else {
+                        if (TB) {
+                            CopyGmToCbuf<DataFormat::NZ, DataFormat::NZ>(l1_buf_b_next,        // dst
+                                                                         gm_b[offset_b_next],  // src
+                                                                         n_actual_next,        // nTileActual
+                                                                         n_round_next,         // nTileCeil
+                                                                         RoundUp<uint64_t, CONST_16UL>(n),  // nVal
+                                                                         k_actual_next,  // dTileActual
+                                                                         k_round_next,   // dTileCeil
+                                                                         RoundUp<uint64_t, CONST_16UL>(k));  // dVal
+                        } else {
+                            CopyGmToCbuf<DataFormat::NZ, DataFormat::NZ>(l1_buf_b_next,        // dst
+                                                                         gm_b[offset_b_next],  // src
+                                                                         k_actual_next,        // nTileActual
+                                                                         k_round_next,         // nTileCeil
+                                                                         RoundUp<uint64_t, CONST_16UL>(k),  // nVal
+                                                                         n_actual_next,  // dTileActual
+                                                                         n_round_next,   // dTileCeil
+                                                                         RoundUp<uint64_t, CONST_16UL>(n));  // dVal
+                        }
+                    }
+                    SET_FLAG(MTE2, MTE1, event_id_next + 2);
+                }
+
+                MatCoord fidx{0};
+                for (fidx.k = 0; fidx.k < fdim.k; ++fidx.k) {
+                    uint32_t k0_round = (fidx.k < fdim.k - 1) ? k_part_len : k_round - fidx.k * k_part_len;
+                    uint32_t k0_actual = (fidx.k < fdim.k - 1) ? k_part_len : k_actual - fidx.k * k_part_len;
+
+                    auto mte1_mad_ping_flag = 1 - fidx.k % 2;
+                    auto mte1_mad_event_id = mte1_mad_ping_flag ? EVENT_ID0 : EVENT_ID1;
+                    auto l0a_buf = l0a_base[(fidx.k % 2) * L0_PINGPONG_BUFFER_LEN];
+                    auto l0b_buf = l0b_base[(fidx.k % 2) * L0_PINGPONG_BUFFER_LEN];
+
+                    // *** load matrix A from L1 to L0A
+                    if (fidx.k == 0) {
+                        WAIT_FLAG(MTE2, MTE1, event_id);
+                    }
+                    WAIT_FLAG(M, MTE1, mte1_mad_event_id);
+                    if ((m == 1) || (m_actual == 1 && !TA)) {
+                        l1_to_l0_a<ArchType::ASCEND_V220, InDtype, false, DataFormat::VECTOR, DataFormat::VECTOR>(
+                            l0a_buf,                        // dst
+                            l1_buf_a[fidx.k * k_part_len],  // src
+                            0,                              // mTileCeil
+                            CeilDiv<CONST_256>(k0_round),   // kPartCeil
+                            0,                              // mSrcStride
+                            1,                              // kSrcStride
+                            0,                              // mDstStride
+                            0);                             // kDstStride
+                    } else {
+                        if (TA) {
+                            LoadCbufToCa(l0a_buf,                                   // l0Tensor
+                                         l1_buf_a[fidx.k * k_part_len * CONST_16],  // l1Tensor
+                                         m_round,                                   // mTileCeil
+                                         k0_round,                                  // kPartCeil
+                                         k_round / CONST_16,                        // mSrcStride
+                                         1,                                         // kSrcStride
+                                         k0_round / CONST_16,                       // mDstStride
+                                         1);                                        // kDstStride
+                        } else {
+                            LoadCbufToCa(l0a_buf,                                  // l0Tensor
+                                         l1_buf_a[fidx.k * k_part_len * m_round],  // l1Tensor
+                                         m_round,                                  // mTileCeil
+                                         k0_round,                                 // kPartCeil
+                                         1,                                        // mSrcStride
+                                         m_round / CONST_16,                       // kSrcStride
+                                         k0_round / CONST_16,                      // mDstStride
+                                         1);                                       // kDstStride
+                        }
+                    }
+                    if (fidx.k == fdim.k - 1) {
+                        SET_FLAG(MTE1, MTE2, event_id);
+                    }
+
+                    // *** load matrix B from L1 to L0B
+                    if (fidx.k == 0) {
+                        WAIT_FLAG(MTE2, MTE1, event_id + 2);
+                    }
+                    if (TB) {
+                        LoadCbufToCb(l0b_buf,                                  // l0Tensor
+                                     l1_buf_b[fidx.k * k_part_len * n_round],  // l1Tensor
+                                     n_round,                                  // nTileCeil
+                                     k0_round,                                 // kPartCeil
+                                     1,                                        // nSrcStride
+                                     n_round / CONST_16,                       // kSrcStride
+                                     1,                                        // nDstStride
+                                     k0_round / CONST_16);                     // kDstStride
+                    } else {
+                        LoadCbufToCb(l0b_buf,                                   // l0Tensor
+                                     l1_buf_b[fidx.k * k_part_len * CONST_16],  // l1Tensor
+                                     n_round,                                   // nTileCeil
+                                     k0_round,                                  // kPartCeil
+                                     k_round / CONST_16,                        // nSrcStride
+                                     1,                                         // kSrcStride
+                                     1,                                         // nDstStride
+                                     n_round / CONST_16);                       // kDstStride
+                    }
+                    if (fidx.k == fdim.k - 1) {
+                        SET_FLAG(MTE1, MTE2, event_id + 2);
+                    }
+
+                    SET_FLAG(MTE1, M, mte1_mad_event_id);
+                    WAIT_FLAG(MTE1, M, mte1_mad_event_id);
+
+                    bool init_c = (tidx.k == 0 && fidx.k == 0);
+                    if (init_c) {
+                        WAIT_FLAG(FIX, M, EVENT_ID0);
+                    }
+
+                    if (m != 1 && m_actual == 1 && TA) {
+                        Mad(l0c_buf,    // c
+                            l0a_buf,    // a
+                            l0b_buf,    // b
+                            CONST_16,   // mTileActual
+                            n_actual,   // nTileActual
+                            k0_actual,  // kTileActual
+                            init_c);    // initC
+                    } else {
+                        Mad(l0c_buf,    // c
+                            l0a_buf,    // a
+                            l0b_buf,    // b
+                            m_actual,   // mTileActual
+                            n_actual,   // nTileActual
+                            k0_actual,  // kTileActual
+                            init_c);    // initC
+                    }
+
+                    PIPE_BARRIER(M);
+                    SET_FLAG(M, MTE1, mte1_mad_event_id);
+                }
+
+                ping_flag = 1 - ping_flag;
+            }
+
+            SET_FLAG(M, FIX, EVENT_ID0);
+            WAIT_FLAG(M, FIX, EVENT_ID0);
+
+            // copy from L0C to gm
+            CopyCcToGm(gm_c[offset_c],   // dst
+                       l0c_buf,          // src
+                       m_actual,         // mTileActual
+                       n_actual,         // nTileActual
+                       m_round,          // mTileCeil
+                       n * batch_size);  // nActual
+            SET_FLAG(FIX, M, EVENT_ID0);
+        }
+
+        WAIT_FLAG(M, MTE1, EVENT_ID0);
+        WAIT_FLAG(M, MTE1, EVENT_ID1);
+        WAIT_FLAG(MTE1, MTE2, EVENT_ID0);
+        WAIT_FLAG(MTE1, MTE2, EVENT_ID1);
+        WAIT_FLAG(MTE1, MTE2, EVENT_ID2);
+        WAIT_FLAG(MTE1, MTE2, EVENT_ID3);
+        WAIT_FLAG(FIX, M, EVENT_ID0);
+        PIPE_BARRIER(ALL);
+    }
+
+private:
+    AscendC::GlobalTensor<InDtype> gm_a;      // matrix A in GM; gm_a[offset] is the src of the "load matrix A to L1" copies
+    AscendC::GlobalTensor<InDtype> gm_b;      // matrix B in GM; gm_b[offset] is the src of the "load matrix B to L1" copies
+    AscendC::GlobalTensor<OutDtype> gm_c;     // result C in GM; gm_c[offset_c] is the dst of CopyCcToGm
+    AscendC::LocalTensor<InDtype> l1_base_a;  // L1 base for A; second ping-pong half is at [L1_PINGPONG_BUFFER_LEN]
+    AscendC::LocalTensor<InDtype> l1_base_b;  // L1 base for B; second ping-pong half is at [L1_PINGPONG_BUFFER_LEN]
+    AscendC::LocalTensor<InDtype> l0a_base;   // L0A base; half picked by (fidx.k % 2) * L0_PINGPONG_BUFFER_LEN
+    AscendC::LocalTensor<InDtype> l0b_base;   // L0B base; half picked by (fidx.k % 2) * L0_PINGPONG_BUFFER_LEN
+    AscendC::LocalTensor<float> l0c_buf;      // float accumulator: Mad writes it, CopyCcToGm reads it
+
+    uint32_t num_core{0};     // step added to loop_idx to find this core's next block (presumably the core count)
+    uint32_t batch_size{0};   // factor in the dVal argument of the A loads and the nActual argument of the C store
+    uint32_t m{0};            // full M extent; the last M tile covers m - tidx.m * m0
+    uint32_t k{0};            // full K extent; the last K tile covers k - shuffle_k * k0
+    uint32_t n{0};            // full N extent; the last N tile covers n - tidx.n * n0
+    uint32_t m0{0};           // tile length along M for every tile except the last
+    uint32_t k0{0};           // tile length along K for every tile except the last
+    uint32_t n0{0};           // tile length along N for every tile except the last
+    MatCoord tdim{0};         // tile counts per dimension (tidx.m/.n/.k are compared against tdim.m/.n/.k)
+    MatCoord fdim{0};         // fdim.k is recomputed per K tile as ceil(k_actual / k_part_len): L1 -> L0 sub-steps
+    uint32_t core_loop{0};    // exclusive upper bound for loop_idx when prefetching the next block
+    uint32_t swizzle_cnt{1};  // NOTE(review): not referenced in the visible code; presumably used by GetBlockIdx - confirm
+    uint32_t core_idx{0};     // rotates the K-tile visiting order: (tidx.k + core_idx) % tdim.k when en_shuffle_k is set
+    uint32_t en_shuffle_k{0};  // non-zero enables the per-core K-tile rotation; zero visits K tiles in order
+    uint32_t ping_flag{0};    // selects the current L1 half and event id; flipped (1 - ping_flag) after each K tile
+};
+
+// Kernel entry point: picks one PpMatmulEinSum specialization from the tiling
+// key stored in gm_tiling_data and runs Init() + Process() on it.
+//
+// gm_a, gm_b, gm_c  - global-memory addresses forwarded unchanged to Init().
+// gm_tiling_data    - global-memory address of a pp_matmul::PpMatmulTilingData;
+//                     only its tilingKey field is read here, the rest is
+//                     consumed by Init().
+//
+// Layout of masked_key (tilingKey >> 2), as implied by the per-instance
+// comments below and by the case labels (most significant bit first):
+//   [13] swizzleDir | [12] transA | [11] transB | [10:8] DtypeA | [7:5] DtypeB |
+//   [4:2] DtypeC | [1] DataFormatA | [0] DataFormatB
+// Dtype code 001 selects half and 010 selects __bf16; DataFormatB 0 selects
+// DataFormat::ND and 1 selects DataFormat::NZ.
+// NOTE(review): the meaning of the two low tilingKey bits that are shifted out
+// is not visible in this file - confirm against the host-side tiling code.
 extern "C" __global__ __aicore__ void batch_matmul_transpose(GM_ADDR gm_a, GM_ADDR gm_b, GM_ADDR gm_c,
                                                              GM_ADDR gm_tiling_data)
 {
     KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIC_ONLY);
+    // All sixteen specializations are declared up front; exactly one of them
+    // (or none, for an unknown key) is used by the switch below.
     PpMatmulEinSum<0, false, false, half, half, DataFormat::ND>
         einsum_0_n_fp16_nd;  // swizzleDir[0] transA[0] transB[0] DtypeA[001] DtypeB[001] DtypeC[001] DataFormatA[0]
                              // DataFormatB[0]
     PpMatmulEinSum<1, false, false, half, half, DataFormat::ND>
         einsum_1_n_fp16_nd;  // swizzleDir[1] transA[0] transB[0] DtypeA[001] DtypeB[001] DtypeC[001] DataFormatA[0]
                              // DataFormatB[0]
     PpMatmulEinSum<0, false, true, half, half, DataFormat::ND>
         einsum_0_t_fp16_nd;  // swizzleDir[0] transA[0] transB[1] DtypeA[001] DtypeB[001] DtypeC[001] DataFormatA[0]
                              // DataFormatB[0]
     PpMatmulEinSum<1, false, true, half, half, DataFormat::ND>
         einsum_1_t_fp16_nd;  // swizzleDir[1] transA[0] transB[1] DtypeA[001] DtypeB[001] DtypeC[001] DataFormatA[0]
                              // DataFormatB[0]
     PpMatmulEinSum<0, false, false, __bf16, __bf16, DataFormat::ND>
         einsum_0_n_bf16_nd;  // swizzleDir[0] transA[0] transB[0] DtypeA[010] DtypeB[010] DtypeC[010] DataFormatA[0]
                              // DataFormatB[0]
     PpMatmulEinSum<1, false, false, __bf16, __bf16, DataFormat::ND>
         einsum_1_n_bf16_nd;  // swizzleDir[1] transA[0] transB[0] DtypeA[010] DtypeB[010] DtypeC[010] DataFormatA[0]
                              // DataFormatB[0]
     PpMatmulEinSum<0, false, true, __bf16, __bf16, DataFormat::ND>
         einsum_0_t_bf16_nd;  // swizzleDir[0] transA[0] transB[1] DtypeA[010] DtypeB[010] DtypeC[010] DataFormatA[0]
                              // DataFormatB[0]
     PpMatmulEinSum<1, false, true, __bf16, __bf16, DataFormat::ND>
         einsum_1_t_bf16_nd;  // swizzleDir[1] transA[0] transB[1] DtypeA[010] DtypeB[010] DtypeC[010] DataFormatA[0]
                              // DataFormatB[0]
 
     PpMatmulEinSum<0, false, false, half, half, DataFormat::NZ>
         einsum_0_n_fp16_nz;  // swizzleDir[0] transA[0] transB[0] DtypeA[001] DtypeB[001] DtypeC[001] DataFormatA[0]
                              // DataFormatB[1]
     PpMatmulEinSum<1, false, false, half, half, DataFormat::NZ>
         einsum_1_n_fp16_nz;  // swizzleDir[1] transA[0] transB[0] DtypeA[001] DtypeB[001] DtypeC[001] DataFormatA[0]
                              // DataFormatB[1]
     PpMatmulEinSum<0, false, true, half, half, DataFormat::NZ>
         einsum_0_t_fp16_nz;  // swizzleDir[0] transA[0] transB[1] DtypeA[001] DtypeB[001] DtypeC[001] DataFormatA[0]
                              // DataFormatB[1]
     PpMatmulEinSum<1, false, true, half, half, DataFormat::NZ>
         einsum_1_t_fp16_nz;  // swizzleDir[1] transA[0] transB[1] DtypeA[001] DtypeB[001] DtypeC[001] DataFormatA[0]
                              // DataFormatB[1]
     PpMatmulEinSum<0, false, false, __bf16, __bf16, DataFormat::NZ>
         einsum_0_n_bf16_nz;  // swizzleDir[0] transA[0] transB[0] DtypeA[010] DtypeB[010] DtypeC[010] DataFormatA[0]
                              // DataFormatB[1]
     PpMatmulEinSum<1, false, false, __bf16, __bf16, DataFormat::NZ>
         einsum_1_n_bf16_nz;  // swizzleDir[1] transA[0] transB[0] DtypeA[010] DtypeB[010] DtypeC[010] DataFormatA[0]
                              // DataFormatB[1]
     PpMatmulEinSum<0, false, true, __bf16, __bf16, DataFormat::NZ>
         einsum_0_t_bf16_nz;  // swizzleDir[0] transA[0] transB[1] DtypeA[010] DtypeB[010] DtypeC[010] DataFormatA[0]
                              // DataFormatB[1]
     PpMatmulEinSum<1, false, true, __bf16, __bf16, DataFormat::NZ>
         einsum_1_t_bf16_nz;  // swizzleDir[1] transA[0] transB[1] DtypeA[010] DtypeB[010] DtypeC[010] DataFormatA[0]
                              // DataFormatB[1]
 
+    // Reset padding value, ND parameters and atomic mode before running -
+    // presumably to a known default state; TODO confirm against the AscendC docs.
     SetPadding<uint64_t>((uint64_t)0);
     SetNdpara(1, 0, 0);
     SetAtomicnone();
 
     // get tiling args
     auto tiling_data = reinterpret_cast<__gm__ pp_matmul::PpMatmulTilingData *>(gm_tiling_data);
     uint32_t masked_key = tiling_data->tilingKey >> 2;
 
+    // Each pair of labels differs only in bit 12 (transA): both values dispatch
+    // to the same specialization, which is always instantiated with
+    // transA == false.
+    // NOTE(review): confirm that ignoring the transA bit is intentional.
     switch (masked_key) {
+        // ---- DataFormat::ND (bit 0 clear) ----
         case 0b00000100100100:
         case 0b01000100100100:
             einsum_0_n_fp16_nd.Init(gm_a, gm_b, gm_c, gm_tiling_data);
             einsum_0_n_fp16_nd.Process();
             break;
         case 0b00100100100100:
         case 0b01100100100100:
             einsum_0_t_fp16_nd.Init(gm_a, gm_b, gm_c, gm_tiling_data);
             einsum_0_t_fp16_nd.Process();
             break;
         case 0b10000100100100:
         case 0b11000100100100:
             einsum_1_n_fp16_nd.Init(gm_a, gm_b, gm_c, gm_tiling_data);
             einsum_1_n_fp16_nd.Process();
             break;
         case 0b10100100100100:
         case 0b11100100100100:
             einsum_1_t_fp16_nd.Init(gm_a, gm_b, gm_c, gm_tiling_data);
             einsum_1_t_fp16_nd.Process();
             break;
         case 0b00001001001000:
         case 0b01001001001000:
             einsum_0_n_bf16_nd.Init(gm_a, gm_b, gm_c, gm_tiling_data);
             einsum_0_n_bf16_nd.Process();
             break;
         case 0b00101001001000:
         case 0b01101001001000:
             einsum_0_t_bf16_nd.Init(gm_a, gm_b, gm_c, gm_tiling_data);
             einsum_0_t_bf16_nd.Process();
             break;
         case 0b10001001001000:
         case 0b11001001001000:
             einsum_1_n_bf16_nd.Init(gm_a, gm_b, gm_c, gm_tiling_data);
             einsum_1_n_bf16_nd.Process();
             break;
         case 0b10101001001000:
         case 0b11101001001000:
             einsum_1_t_bf16_nd.Init(gm_a, gm_b, gm_c, gm_tiling_data);
             einsum_1_t_bf16_nd.Process();
             break;
 
+        // ---- DataFormat::NZ (bit 0 set) ----
         case 0b00000100100101:
         case 0b01000100100101:
             einsum_0_n_fp16_nz.Init(gm_a, gm_b, gm_c, gm_tiling_data);
             einsum_0_n_fp16_nz.Process();
             break;
         case 0b00100100100101:
         case 0b01100100100101:
             einsum_0_t_fp16_nz.Init(gm_a, gm_b, gm_c, gm_tiling_data);
             einsum_0_t_fp16_nz.Process();
             break;
         case 0b10000100100101:
         case 0b11000100100101:
             einsum_1_n_fp16_nz.Init(gm_a, gm_b, gm_c, gm_tiling_data);
             einsum_1_n_fp16_nz.Process();
             break;
         case 0b10100100100101:
         case 0b11100100100101:
             einsum_1_t_fp16_nz.Init(gm_a, gm_b, gm_c, gm_tiling_data);
             einsum_1_t_fp16_nz.Process();
             break;
         case 0b00001001001001:
         case 0b01001001001001:
             einsum_0_n_bf16_nz.Init(gm_a, gm_b, gm_c, gm_tiling_data);
             einsum_0_n_bf16_nz.Process();
             break;
         case 0b00101001001001:
         case 0b01101001001001:
             einsum_0_t_bf16_nz.Init(gm_a, gm_b, gm_c, gm_tiling_data);
             einsum_0_t_bf16_nz.Process();
             break;
         case 0b10001001001001:
         case 0b11001001001001:
             einsum_1_n_bf16_nz.Init(gm_a, gm_b, gm_c, gm_tiling_data);
             einsum_1_n_bf16_nz.Process();
             break;
         case 0b10101001001001:
         case 0b11101001001001:
             einsum_1_t_bf16_nz.Init(gm_a, gm_b, gm_c, gm_tiling_data);
             einsum_1_t_bf16_nz.Process();
             break;
+        // Unknown key: no specialization runs, so nothing is done with gm_c
+        // and no error is reported.
         default:
             break;
     }
 }
+
+
+namespace vllm_ascend {
+
+// Host-side launcher for the batch_matmul_transpose kernel.
+//
+// stream          - stream handle passed as the third launch parameter.
+// gm_a/gm_b/gm_c  - addresses forwarded unchanged to the kernel.
+// gm_tiling_data  - address of the tiling data forwarded to the kernel.
+// block_dim       - first launch parameter (number of blocks to launch).
+extern void batch_matmul_transpose_impl(void* stream, void* gm_a, void* gm_b, void* gm_c,
+                                        void* gm_tiling_data, const uint32_t block_dim)
+{
+    batch_matmul_transpose<<<block_dim, nullptr, stream>>>(gm_a, gm_b, gm_c, gm_tiling_data);
+}
+
+}  // namespace vllm_ascend
--- a/csrc/camem_allocator.cpp
+++ b/csrc/camem_allocator.cpp
@@ -15,6 +15,11 @@
 */

 #include <iostream>
+#include <stdexcept>
+#include <string>
+#include <atomic>
+
+#include "idle_offload/shm_worker.h"

 extern "C" {

@@ -24,6 +29,13 @@ extern "C" {
 #include <sys/types.h>
 #include "acl/acl.h"

+// idle offload
+// State of the shared device-memory pool used by the *_offload entry points.
+// Set once init_module_offload has started initializing the pool.
 static std::atomic<bool> g_initialized(false);
+// Base virtual address of the pool (reserved and mapped by init_module_offload).
 static void *g_d_mem = nullptr;
+// Pool size in bytes, filled in by ShmWorker::register_worker.
 static size_t g_size = 0;
+// Byte offset into the pool that my_malloc_offload advances on every
+// allocation; it is never decreased by my_free_offload.
 static std::atomic_uint_fast64_t g_allocated_offset(0);
+// Worker object created by init_module_offload; nullptr until then.
 ShmWorker *shm_worker = nullptr;
+
 // Global references to Python callables
 // NOTE: this is borrowed reference, so we don't need to DECREF them.
 // This brings the limitation that the allocator needs to be singleton.
@@ -49,7 +61,7 @@ void create_and_map(unsigned long long device, ssize_t size, void* d_mem,
  ensure_context(device);
  // Define memory allocation properties
  aclrtPhysicalMemProp prop = {};
-  prop.handleType = ACL_MEM_HANDLE_TYPE_NONE ;
+  prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
  prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
  prop.memAttr = ACL_HBM_MEM_HUGE;
  prop.location.id = device;
@@ -59,15 +71,21 @@ void create_and_map(unsigned long long device, ssize_t size, void* d_mem,
  // Allocate memory using aclrtMallocPhysical
  aclError error_code = aclrtMallocPhysical(p_memHandle, size, &prop, 0);
  if (error_code != 0) {
-    std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
-            << __LINE__ << std::endl;  
-    return;
+    if (error_code == ACL_ERROR_RT_MEMORY_ALLOCATION) {
+      throw std::runtime_error("aclrtMallocPhysical failed with acl error code: " + 
+                              std::to_string(error_code) + "(OOM: Out of Memory, allocation failed) " + 
+                              __FILE__ + ":" + std::to_string(__LINE__));
+    } else {
+      throw std::runtime_error("aclrtMallocPhysical failed with acl error code: " +
+                              std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
+    }
  }
+
+  // Map memory
  error_code = aclrtMapMem(d_mem, size, 0, *p_memHandle, 0);
  if (error_code != 0) {
-    std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
-            << __LINE__ << std::endl;  
-    return;
+    throw std::runtime_error("aclrtMapMem failed with acl error code: " +
+                            std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
  }
 }

@@ -79,15 +97,13 @@ void unmap_and_release(unsigned long long device, ssize_t size,
  ensure_context(device);
  aclError error_code = aclrtUnmapMem(d_mem);
  if (error_code != 0) {
-    std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
-            << __LINE__ << std::endl;  
-    return;
+    throw std::runtime_error("aclrtUnmapMem failed with acl error code: " +
+                            std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
  }
  error_code = aclrtFreePhysical(*p_memHandle);
  if (error_code != 0) {
-    std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
-            << __LINE__ << std::endl;  
-    return;
+    throw std::runtime_error("aclrtFreePhysical failed with acl error code: " +
+                            std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
  }
 }

@@ -139,25 +155,29 @@ __attribute__ ((visibility("default"))) void* my_malloc(ssize_t size, int device
                                   ACL_RT_MEM_ALLOC_GRANULARITY_MINIMUM,
                                   &granularity);
  if (error_code != 0) {
-    std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
-            << __LINE__ << std::endl;  
-    return nullptr;
+    throw std::runtime_error("aclrtMemGetAllocationGranularity failed with acl error code: " +
+                            std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
  }
  size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;
  void *d_mem;
  error_code = aclrtReserveMemAddress(&d_mem, alignedSize, 0, nullptr, 0);
  if (error_code != 0) {
-    std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
-                << __LINE__ << std::endl;  
-    return nullptr;
+    if (error_code == ACL_ERROR_RT_MEMORY_ALLOCATION) {
+      throw std::runtime_error("aclrtReserveMemAddress failed with acl error code: " + 
+                              std::to_string(error_code) + "(OOM: Out of Memory, allocation failed) " + 
+                              __FILE__ + ":" + std::to_string(__LINE__));
+    } else {
+      throw std::runtime_error("aclrtReserveMemAddress failed with acl error code: " +
+                              std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
+    }
  }
  // allocate the aclrtDrvMemHandle
  aclrtDrvMemHandle* p_memHandle =
      (aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle));

  if (!g_python_malloc_callback) {
-    std::cerr << "ERROR: g_python_malloc_callback not set.\n";
-    return nullptr;
+    throw std::runtime_error("my_malloc ERROR: g_python_malloc_callback not set." +
+                            std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
  }

  // Acquire GIL (not in stable ABI officially, but often works)
@@ -189,8 +209,8 @@ __attribute__ ((visibility("default"))) void* my_malloc(ssize_t size, int device
 __attribute__ ((visibility("default"))) void my_free(void* ptr, ssize_t size, int device, aclrtStream stream) {
  // get memory handle from the pointer
  if (!g_python_free_callback) {
-    std::cerr << "ERROR: g_python_free_callback not set.\n";
-    return;
+    throw std::runtime_error("aclrtDrvMemHandle ERROR: g_python_malloc_callback not set." +
+                            std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
  }

  // Acquire GIL (not in stable ABI officially, but often works)
@@ -232,13 +252,150 @@ __attribute__ ((visibility("default"))) void my_free(void* ptr, ssize_t size, in
  // free address and the handle
  aclError error_code = aclrtReleaseMemAddress(d_mem);
  if (error_code != 0) {
-    std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
-        << __LINE__ << std::endl;  
-    return;
+    throw std::runtime_error("aclrtReleaseMemAddress failed with acl error code: " +
+                            std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
  }
  free(p_memHandle);
 }

+// Allocator hook for idle-offload mode: hands out device memory by bumping an
+// offset into the pool rooted at g_d_mem instead of reserving and mapping
+// fresh physical memory for every request.
+//
+// size   - requested bytes; rounded up to the ACL allocation granularity.
+// device - device id used for the context and the allocation properties.
+// stream - unused.
+//
+// Returns the device address, or nullptr when the Python malloc callback
+// fails (the Python error is printed and the reservation is rolled back when
+// possible).
+// Throws std::runtime_error when the callback is not registered, the pool is
+// not initialized or is exhausted, or an ACL / host allocation call fails.
 __attribute__((visibility("default"))) void *
 my_malloc_offload(ssize_t size, int device, aclrtStream stream) {
   ensure_context(device);
 
+  // Validate global state first so that a misconfigured module fails before
+  // any pool space or host memory has been consumed.
+  if (!g_python_malloc_callback) {
+    throw std::runtime_error("my_malloc ERROR: g_python_malloc_callback not set." +
+                            std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
+  }
+  if (g_d_mem == nullptr) {
+    throw std::runtime_error(
+        "my_malloc ERROR: reserved pool is not initialized." +
+        std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
+  }
+
+  // Allocation properties; here they are only used to query the granularity.
+  aclrtPhysicalMemProp prop = {};
+  prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
+  prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
+  prop.memAttr = ACL_HBM_MEM_HUGE;
+  prop.location.id = device;
+  prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
+  prop.reserve = 0;
+
+  size_t granularity;
+  aclError error_code = aclrtMemGetAllocationGranularity(&prop,
+                                   ACL_RT_MEM_ALLOC_GRANULARITY_MINIMUM,
+                                   &granularity);
+  if (error_code != 0) {
+    throw std::runtime_error("aclrtMemGetAllocationGranularity failed with acl error code: " +
+                            std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
+  }
+  size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;
+
+  // allocate the aclrtDrvMemHandle
+  // It is never filled in here (nothing is mapped per allocation); its address
+  // is only passed to the Python callback, like in my_malloc.
+  aclrtDrvMemHandle* p_memHandle =
+      (aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle));
+  if (!p_memHandle) {
+    throw std::runtime_error(
+        "my_malloc ERROR: failed to allocate aclrtDrvMemHandle." +
+        std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
+  }
+
+  // Reserve [alloc_offset, alloc_offset + alignedSize) from the pool.
+  // A compare-exchange loop is used instead of fetch_add so that a request
+  // that does not fit leaves the offset untouched; the subtraction form of the
+  // bounds check cannot overflow.
+  auto alloc_offset = g_allocated_offset.load();
+  do {
+    if (alloc_offset > g_size || alignedSize > g_size - alloc_offset) {
+      free(p_memHandle);
+      throw std::runtime_error(
+          "my_malloc ERROR: Out of memory in the reserved pool." +
+          std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
+    }
+  } while (!g_allocated_offset.compare_exchange_weak(
+      alloc_offset, alloc_offset + alignedSize));
+  void *d_mem = (void *)((char *)g_d_mem + alloc_offset);
+
+  // Acquire GIL (not in stable ABI officially, but often works)
+  PyGILState_STATE gstate = PyGILState_Ensure();
+
+  PyObject* py_result = nullptr;
+  PyObject* arg_tuple = create_tuple_from_c_integers(
+      (unsigned long long)device, (unsigned long long)alignedSize,
+      (unsigned long long)d_mem, (unsigned long long)p_memHandle);
+  if (arg_tuple) {
+    // Call g_python_malloc_callback
+    py_result =
+        PyObject_CallFunctionObjArgs(g_python_malloc_callback, arg_tuple, NULL);
+    Py_DECREF(arg_tuple);
+  }
+
+  const bool callback_ok = (py_result != nullptr);
+  if (callback_ok) {
+    // Only success/failure of the callback matters; drop the result reference.
+    Py_DECREF(py_result);
+  } else {
+    PyErr_Print();
+  }
+  PyGILState_Release(gstate);
+
+  if (!callback_ok) {
+    // Python never learned about this allocation: release the handle and give
+    // the range back if no other allocation was made in the meantime.
+    free(p_memHandle);
+    auto reserved_end = alloc_offset + alignedSize;
+    g_allocated_offset.compare_exchange_strong(reserved_end, alloc_offset);
+    return nullptr;
+  }
+
+  // No create_and_map step: the whole pool is already mapped.
+  return d_mem;
+}
+
+// Allocator hook for idle-offload mode: notifies the Python free callback that
+// `ptr` is being released. No device memory is unmapped or released here and
+// the pool offset is not moved back.
+//
+// ptr    - device address previously returned by my_malloc_offload.
+// size, device, stream - unused.
+//
+// Throws std::runtime_error when the free callback is not registered. Errors
+// raised by, or malformed results from, the Python callback are printed and
+// cleared, because this function has no Python caller to propagate them to.
 __attribute__((visibility("default"))) void
 my_free_offload(void *ptr, ssize_t size, int device, aclrtStream stream) {
+  if (!g_python_free_callback) {
+    throw std::runtime_error("my_free_offload ERROR: g_python_free_callback not set." +
+                            std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
+  }
+
+  // Acquire GIL (not in stable ABI officially, but often works)
+  PyGILState_STATE gstate = PyGILState_Ensure();
+
+  // get memory handle from the pointer
+  PyObject* py_ptr =
+      PyLong_FromUnsignedLongLong(reinterpret_cast<unsigned long long>(ptr));
+  PyObject* py_result = nullptr;
+  if (py_ptr) {
+    py_result =
+        PyObject_CallFunctionObjArgs(g_python_free_callback, py_ptr, NULL);
+    Py_DECREF(py_ptr);
+  }
+
+  if (!py_result) {
+    // Either building py_ptr or the callback itself failed.
+    PyErr_Print();
+  } else if (!PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) {
+    PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
+    PyErr_Print();
+  } else {
+    // Unpack the tuple into four C integers. The values are not used in
+    // offload mode; parsing only validates what the callback returned.
+    unsigned long long recv_device, recv_size;
+    unsigned long long recv_d_mem, recv_p_memHandle;
+    if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size,
+                          &recv_d_mem, &recv_p_memHandle)) {
+      // PyArg_ParseTuple sets an error if it fails
+      PyErr_Print();
+    }
+  }
+
+  // Every path ends here, so the result reference is dropped and the GIL is
+  // released exactly once.
+  Py_XDECREF(py_result);
+  PyGILState_Release(gstate);
+
+  // Nothing else to do: memory stays mapped inside the shared pool.
+  // NOTE(review): the aclrtDrvMemHandle malloc'd by my_malloc_offload is not
+  // freed here (my_free does free its handle) - confirm who owns it in
+  // offload mode.
+}
+
 // ---------------------------------------------------------------------------
 // Python extension boilerplate:

@@ -313,6 +470,116 @@ static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
  Py_RETURN_NONE;
 }

+
+// Python: init_module_offload(malloc_callback, free_callback) -> None
+//
+// Stores the two Python callables and, on the first successful call, sets up
+// the shared pool: creates the ShmWorker, registers this process to obtain a
+// shareable handle and the pool size, imports the handle, reserves a virtual
+// address range of that size and maps the imported memory into it.
+//
+// Raises TypeError if either argument is not callable and RuntimeError if any
+// initialization step fails. After a failure the partially initialized state
+// is undone so the call can be retried.
+static PyObject* py_init_module_offload(PyObject* self, PyObject* args) {
+  PyObject* malloc_callback = nullptr;
+  PyObject* free_callback = nullptr;
+
+  if (!PyArg_ParseTuple(args, "OO", &malloc_callback, &free_callback)) {
+    return nullptr;
+  }
+
+  if (!PyCallable_Check(malloc_callback) || !PyCallable_Check(free_callback)) {
+    PyErr_SetString(PyExc_TypeError, "Both arguments must be callables");
+    return nullptr;
+  }
+
+  // Save the Python callables
+  // This module does not handle GC of these objects, so they must be kept alive
+  // outside of this module.
+  g_python_malloc_callback = malloc_callback;
+  g_python_free_callback = free_callback;
+
+  // init idle
+  if (g_initialized.load()) {
+    printf("Module already initialized.\n");
+    Py_RETURN_NONE;
+  }
+
+  // Reserved virtual range; published to g_d_mem only after mapping succeeds.
+  void *pool_addr = nullptr;
+
+  // A C++ exception must not propagate into the interpreter, so every failure
+  // is caught below and converted into a Python RuntimeError.
+  try {
+    shm_worker = new ShmWorker();
+
+    // get pid
+    int32_t pid;
+    aclError error_code = aclrtDeviceGetBareTgid(&pid);
+    if (error_code != 0) {
+      throw std::runtime_error(
+          "aclrtDeviceGetBareTgid failed with acl error code: " +
+          std::to_string(error_code) + " " + __FILE__ + ":" +
+          std::to_string(__LINE__));
+    }
+    uint64_t shareable_handle;
+    shm_worker->register_worker(pid, &shareable_handle, &g_size);
+
+    // import shareable handle
+    // NOTE(review): the device id is hard-coded to 0 - confirm this is correct
+    // when the process uses another device.
+    uint32_t device = 0;
+    aclrtDrvMemHandle memHandle;
+    error_code =
+        aclrtMemImportFromShareableHandle(shareable_handle, device, &memHandle);
+    if (error_code != 0) {
+      throw std::runtime_error(
+          "aclrtMemImportFromShareableHandle failed with acl error code: " +
+          std::to_string(error_code) + " " + __FILE__ + ":" +
+          std::to_string(__LINE__));
+    }
+
+    // reserve virtual address
+    error_code = aclrtReserveMemAddress(&pool_addr, g_size, 0, nullptr, 0);
+    if (error_code != 0) {
+      pool_addr = nullptr;  // nothing was reserved, so nothing to release
+      throw std::runtime_error(
+          "aclrtReserveMemAddress failed with acl error code: " +
+          std::to_string(error_code) + " " + __FILE__ + ":" +
+          std::to_string(__LINE__));
+    }
+    // map
+    error_code = aclrtMapMem(pool_addr, g_size, 0, memHandle, 0);
+    if (error_code != 0) {
+      throw std::runtime_error("aclrtMapMem failed with acl error code: " +
+                               std::to_string(error_code) + " " + __FILE__ + ":" +
+                               std::to_string(__LINE__));
+    }
+  } catch (const std::exception &e) {
+    // Undo the partial initialization so that a later call can retry cleanly.
+    // NOTE(review): an already imported memHandle is not released here.
+    if (pool_addr != nullptr) {
+      aclrtReleaseMemAddress(pool_addr);
+    }
+    delete shm_worker;
+    shm_worker = nullptr;
+    g_size = 0;
+    PyErr_SetString(PyExc_RuntimeError, e.what());
+    return nullptr;
+  }
+
+  // Publish the pool and mark the module initialized only now that every
+  // step has succeeded.
+  g_d_mem = pool_addr;
+  g_initialized.store(true);
+
+  Py_RETURN_NONE;
+}
+
+// Python: python_unmap_and_release_offload(...) -> None
+// Intentionally a no-op: the arguments are ignored and None is returned.
+static PyObject *python_unmap_and_release_offload(PyObject *self,
+                                                  PyObject *args) {
+  (void)self;
+  (void)args;
+  Py_RETURN_NONE;
+}
+
+// Python: python_create_and_map_offload(...) -> None
+// Intentionally a no-op: the arguments are ignored and None is returned.
+static PyObject *python_create_and_map_offload(PyObject *self, PyObject *args) {
+  (void)self;
+  (void)args;
+  Py_RETURN_NONE;
+}
+
+// Python: python_get_mem_info_offload() -> (free_bytes, total_bytes)
+//
+// total_bytes is the pool size (g_size); free_bytes is what remains after the
+// current allocation offset, clamped to 0 if the offset has run past the end
+// of the pool. Returns nullptr with a Python exception set if the result
+// tuple cannot be built.
+static PyObject* python_get_mem_info_offload(PyObject* self, PyObject* args) {
+  const size_t allocated_bytes = g_allocated_offset.load();
+  const size_t free_mem =
+      (allocated_bytes >= g_size) ? 0 : g_size - allocated_bytes;
+  // Py_BuildValue checks every intermediate allocation, so a failed integer
+  // conversion can no longer leave a NULL slot inside the returned tuple.
+  return Py_BuildValue("(KK)", (unsigned long long)free_mem,
+                       (unsigned long long)g_size);
+}
+
+// Python: python_lock_gpu_offload() -> bool
+//
+// Calls ShmWorker::lock_gpu() and returns its result as a Python bool
+// (presumably whether this process was also the previous lock holder -
+// confirm against ShmWorker).
+// Raises RuntimeError if init_module_offload has not created the worker yet.
+static PyObject* python_lock_gpu_offload(PyObject* self, PyObject* args) {
+  if (shm_worker == nullptr) {
+    PyErr_SetString(PyExc_RuntimeError,
+                    "python_lock_gpu_offload: init_module_offload has not been called");
+    return nullptr;
+  }
+  bool prev_is_self = shm_worker->lock_gpu();
+  return PyBool_FromLong(prev_is_self);
+}
+
+// Python: python_unlock_gpu_offload() -> None
+//
+// Calls ShmWorker::unlock_gpu().
+// Raises RuntimeError if init_module_offload has not created the worker yet.
+static PyObject* python_unlock_gpu_offload(PyObject* self, PyObject* args) {
+  if (shm_worker == nullptr) {
+    PyErr_SetString(PyExc_RuntimeError,
+                    "python_unlock_gpu_offload: init_module_offload has not been called");
+    return nullptr;
+  }
+  shm_worker->unlock_gpu();
+  Py_RETURN_NONE;
+}
+
 static PyMethodDef module_methods[] = {
    {"init_module", (PyCFunction)py_init_module, METH_VARARGS,
     "Initialize module with python_malloc and python_free callables."},
@@ -320,7 +587,21 @@ static PyMethodDef module_methods[] = {
     "Create and map memory on the device."},
    {"python_unmap_and_release", (PyCFunction)python_unmap_and_release,
     METH_VARARGS, "Unmap and release memory on the device."},
-    {NULL, NULL, 0, NULL}  // sentinel
+    {"init_module_offload", (PyCFunction)py_init_module_offload, METH_VARARGS,
+     "Initialize module with python_malloc and python_free callables."},
+    {"python_create_and_map_offload",
+     (PyCFunction)python_create_and_map_offload, METH_VARARGS,
+     "Create and map memory on the device."},
+    {"python_unmap_and_release_offload",
+     (PyCFunction)python_unmap_and_release_offload, METH_VARARGS,
+     "Unmap and release memory on the device."},
+    {"python_get_mem_info_offload", (PyCFunction)python_get_mem_info_offload,
+     METH_NOARGS, "Get mem info in the reserved pool."},
+    {"python_lock_gpu_offload", (PyCFunction)python_lock_gpu_offload,
+     METH_NOARGS, "Lock GPU."},
+    {"python_unlock_gpu_offload", (PyCFunction)python_unlock_gpu_offload,
+     METH_NOARGS, "Unlock GPU."},
+    {NULL, NULL, 0, NULL} // sentinel
 };

 static struct PyModuleDef camem_allocator_module = {
--- a/csrc/idle_offload/.gitignore
+++ b/csrc/idle_offload/.gitignore
@@ -0,0 +1 @@
+vllm_vnpu_daemon
--- a/csrc/idle_offload/Makefile
+++ b/csrc/idle_offload/Makefile
@@ -0,0 +1,30 @@
+# Builds and installs the vllm_vnpu_daemon executable.
+#   make            - build $(TARGET) in this directory
+#   make install    - copy it to $(DESTDIR)$(BINDIR) (default /usr/local/bin)
+#   make uninstall  - remove the installed binary
+#   make clean      - remove the locally built binary
 CXX := g++
 TARGET := vllm_vnpu_daemon
 SRCS := offload_daemon.cpp shm_manager.cpp
 
+# Location of the Ascend toolkit that provides the ACL headers and libascendcl.
 ASCEND_HOME := /usr/local/Ascend/ascend-toolkit/latest
+# -Iinclude picks up the headers under ./include (e.g. the bundled spdlog).
 INCLUDES := -I$(ASCEND_HOME)/include -Iinclude
 LIBS := -L$(ASCEND_HOME)/lib64 -lascendcl
 
 CXXFLAGS := $(INCLUDES)
 LDFLAGS := $(LIBS)
 
+# Install locations; both can be overridden from the command line or environment.
 PREFIX ?= /usr/local
 BINDIR ?= $(PREFIX)/bin
 
 .PHONY: all clean install uninstall
 
 all: $(TARGET)
 
+# Compile and link all sources in a single step. Only $(SRCS) are listed as
+# prerequisites, so a change to a header alone does not trigger a rebuild.
 $(TARGET): $(SRCS)
 	$(CXX) -o $@ $^ $(CXXFLAGS) $(LDFLAGS)
 
 install: $(TARGET)
 	install -d $(DESTDIR)$(BINDIR)
 	install -m 0755 $(TARGET) $(DESTDIR)$(BINDIR)/$(TARGET)
 
 uninstall:
 	rm -f $(DESTDIR)$(BINDIR)/$(TARGET)
 
 clean:
 	rm -f $(TARGET)
--- a/csrc/idle_offload/include/spdlog/async.h
+++ b/csrc/idle_offload/include/spdlog/async.h
@@ -0,0 +1,99 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+//
+// Async logging using global thread pool
+// All loggers created here share same global thread pool.
+// Each log message is pushed to a queue along with a shared pointer to the
+// logger.
+// If a logger is deleted while it still has pending messages in the queue, its
+// actual destruction is deferred
+// until all its messages are processed by the thread pool.
+// This is because each message in the queue holds a shared_ptr to the
+// originating logger.
+
+#include <spdlog/async_logger.h>
+#include <spdlog/details/registry.h>
+#include <spdlog/details/thread_pool.h>
+
+#include <functional>
+#include <memory>
+#include <mutex>
+
 namespace spdlog {
 
 namespace details {
+// Queue size used when create() below has to create the global thread pool.
 static const size_t default_async_q_size = 8192;
 }
 
 // async logger factory - creates async loggers backed with thread pool.
 // if a global thread pool doesn't already exist, create it with default queue
 // size of 8192 items and single thread.
+// OverflowPolicy is passed to every async_logger this factory creates.
 template <async_overflow_policy OverflowPolicy = async_overflow_policy::block>
 struct async_factory_impl {
+    // Creates a sink of type Sink from `args`, wraps it in an async_logger
+    // named `logger_name` that uses the registry's thread pool, passes the
+    // logger to registry::initialize_logger() and returns it.
     template <typename Sink, typename... SinkArgs>
     static std::shared_ptr<async_logger> create(std::string logger_name, SinkArgs &&...args) {
         auto &registry_inst = details::registry::instance();
 
         // create global thread pool if not already exists..
 
+        // The registry's thread-pool mutex is held until this function
+        // returns, covering both the get-or-create step and the logger setup.
         auto &mutex = registry_inst.tp_mutex();
         std::lock_guard<std::recursive_mutex> tp_lock(mutex);
         auto tp = registry_inst.get_tp();
         if (tp == nullptr) {
             tp = std::make_shared<details::thread_pool>(details::default_async_q_size, 1U);
             registry_inst.set_tp(tp);
         }
 
         auto sink = std::make_shared<Sink>(std::forward<SinkArgs>(args)...);
         auto new_logger = std::make_shared<async_logger>(std::move(logger_name), std::move(sink),
                                                          std::move(tp), OverflowPolicy);
         registry_inst.initialize_logger(new_logger);
         return new_logger;
     }
 };
 
+// Factory using async_overflow_policy::block.
 using async_factory = async_factory_impl<async_overflow_policy::block>;
+// Factory using async_overflow_policy::overrun_oldest.
 using async_factory_nonblock = async_factory_impl<async_overflow_policy::overrun_oldest>;
 
+// Convenience wrapper around async_factory::create<Sink>().
 template <typename Sink, typename... SinkArgs>
 inline std::shared_ptr<spdlog::logger> create_async(std::string logger_name,
                                                     SinkArgs &&...sink_args) {
     return async_factory::create<Sink>(std::move(logger_name),
                                        std::forward<SinkArgs>(sink_args)...);
 }
 
+// Convenience wrapper around async_factory_nonblock::create<Sink>().
 template <typename Sink, typename... SinkArgs>
 inline std::shared_ptr<spdlog::logger> create_async_nb(std::string logger_name,
                                                        SinkArgs &&...sink_args) {
     return async_factory_nonblock::create<Sink>(std::move(logger_name),
                                                 std::forward<SinkArgs>(sink_args)...);
 }
 
 // set global thread pool.
+// Creates a new thread_pool with the given queue size, thread count and
+// per-thread start/stop callbacks and stores it in the registry.
 inline void init_thread_pool(size_t q_size,
                              size_t thread_count,
                              std::function<void()> on_thread_start,
                              std::function<void()> on_thread_stop) {
     auto tp = std::make_shared<details::thread_pool>(q_size, thread_count, on_thread_start,
                                                      on_thread_stop);
     details::registry::instance().set_tp(std::move(tp));
 }
 
+// Overload with an empty on_thread_stop callback.
 inline void init_thread_pool(size_t q_size,
                              size_t thread_count,
                              std::function<void()> on_thread_start) {
     init_thread_pool(q_size, thread_count, on_thread_start, [] {});
 }
 
+// Overload with empty on_thread_start and on_thread_stop callbacks.
 inline void init_thread_pool(size_t q_size, size_t thread_count) {
     init_thread_pool(q_size, thread_count, [] {}, [] {});
 }
 
 // get the global thread pool.
+// Returns whatever the registry currently holds, which is nullptr if no pool
+// has been set.
 inline std::shared_ptr<spdlog::details::thread_pool> thread_pool() {
     return details::registry::instance().get_tp();
 }
 }  // namespace spdlog
--- a/csrc/idle_offload/include/spdlog/async_logger-inl.h
+++ b/csrc/idle_offload/include/spdlog/async_logger-inl.h
@@ -0,0 +1,84 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#ifndef SPDLOG_HEADER_ONLY
+#include <spdlog/async_logger.h>
+#endif
+
+#include <spdlog/details/thread_pool.h>
+#include <spdlog/sinks/sink.h>
+
+#include <memory>
+#include <string>
+
+// Constructs an async logger from an initializer list of sinks by delegating
+// to the iterator-range constructor.
+SPDLOG_INLINE spdlog::async_logger::async_logger(std::string logger_name,
+                                                 sinks_init_list sinks_list,
+                                                 std::weak_ptr<details::thread_pool> tp,
+                                                 async_overflow_policy overflow_policy)
+    : async_logger(std::move(logger_name), sinks_list.begin(), sinks_list.end(), std::move(tp),
+                   overflow_policy) {}
+
+// Constructs an async logger with a single sink by delegating to the
+// initializer-list constructor.
+SPDLOG_INLINE spdlog::async_logger::async_logger(std::string logger_name,
+                                                 sink_ptr single_sink,
+                                                 std::weak_ptr<details::thread_pool> tp,
+                                                 async_overflow_policy overflow_policy)
+    : async_logger(std::move(logger_name),
+                   {std::move(single_sink)},
+                   std::move(tp),
+                   overflow_policy) {}
+
+// send the log message to the thread pool
+SPDLOG_INLINE void spdlog::async_logger::sink_it_(const details::log_msg &msg){
+    SPDLOG_TRY{if (auto pool_ptr = thread_pool_.lock()){
+        pool_ptr -> post_log(shared_from_this(), msg, overflow_policy_);
+}
+else {
+    throw_spdlog_ex("async log: thread pool doesn't exist anymore");
+}
+}
+SPDLOG_LOGGER_CATCH(msg.source)
+}
+
+// send flush request to the thread pool
+SPDLOG_INLINE void spdlog::async_logger::flush_(){
+    SPDLOG_TRY{if (auto pool_ptr = thread_pool_.lock()){
+        pool_ptr -> post_flush(shared_from_this(), overflow_policy_);
+}
+else {
+    throw_spdlog_ex("async flush: thread pool doesn't exist anymore");
+}
+}
+SPDLOG_LOGGER_CATCH(source_loc())
+}
+
+//
+// backend functions - called from the thread pool to do the actual job
+//
+// Writes the message to every sink that accepts its level, then flushes all
+// sinks if the message requires a flush.
+SPDLOG_INLINE void spdlog::async_logger::backend_sink_it_(const details::log_msg &msg) {
+    for (auto &target : sinks_) {
+        if (!target->should_log(msg.level)) {
+            continue;
+        }
+        SPDLOG_TRY { target->log(msg); }
+        SPDLOG_LOGGER_CATCH(msg.source)
+    }
+
+    if (!should_flush_(msg)) {
+        return;
+    }
+    backend_flush_();
+}
+
+// Flushes every sink; a failure in one sink is reported through the logger's
+// catch handler.
+SPDLOG_INLINE void spdlog::async_logger::backend_flush_() {
+    for (auto it = sinks_.begin(); it != sinks_.end(); ++it) {
+        SPDLOG_TRY { (*it)->flush(); }
+        SPDLOG_LOGGER_CATCH(source_loc())
+    }
+}
+
+// Returns a copy of this logger that differs only in its name.
+SPDLOG_INLINE std::shared_ptr<spdlog::logger> spdlog::async_logger::clone(std::string new_name) {
+    std::shared_ptr<spdlog::async_logger> copy = std::make_shared<spdlog::async_logger>(*this);
+    copy->name_ = std::move(new_name);
+    return copy;
+}
--- a/csrc/idle_offload/include/spdlog/async_logger.h
+++ b/csrc/idle_offload/include/spdlog/async_logger.h
@@ -0,0 +1,74 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+// Fast asynchronous logger.
+// Uses pre allocated queue.
+// Creates a single back thread to pop messages from the queue and log them.
+//
+// Upon each log write the logger:
+//    1. Checks if its log level is enough to log the message
+//    2. Push a new copy of the message to a queue (or block the caller until
+//    space is available in the queue)
+// Upon destruction, logs all remaining messages in the queue before
+// destructing.
+
+#include <spdlog/logger.h>
+
+namespace spdlog {
+
+// Async overflow policy: what to do when the queue is full. The constructors default to block.
+enum class async_overflow_policy {
+    block,           // Block until message can be enqueued
+    overrun_oldest,  // Discard oldest message in the queue if full when trying to
+                     // add new item.
+    discard_new      // Discard new message if the queue is full when trying to add new item.
+};
+
+namespace details {
+class thread_pool;
+}
+
+class SPDLOG_API async_logger final : public std::enable_shared_from_this<async_logger>,
+                                      public logger {
+    friend class details::thread_pool;  // lets thread_pool reach the non-public members below
+
+public:
+    template <typename It>  // begin/end are forwarded unchanged to the logger base constructor
+    async_logger(std::string logger_name,
+                 It begin,
+                 It end,
+                 std::weak_ptr<details::thread_pool> tp,
+                 async_overflow_policy overflow_policy = async_overflow_policy::block)
+        : logger(std::move(logger_name), begin, end),
+          thread_pool_(std::move(tp)),
+          overflow_policy_(overflow_policy) {}
+
+    async_logger(std::string logger_name,  // overload taking an initializer list of sinks
+                 sinks_init_list sinks_list,
+                 std::weak_ptr<details::thread_pool> tp,
+                 async_overflow_policy overflow_policy = async_overflow_policy::block);
+
+    async_logger(std::string logger_name,  // overload taking exactly one sink
+                 sink_ptr single_sink,
+                 std::weak_ptr<details::thread_pool> tp,
+                 async_overflow_policy overflow_policy = async_overflow_policy::block);
+
+    std::shared_ptr<logger> clone(std::string new_name) override;
+
+protected:
+    void sink_it_(const details::log_msg &msg) override;  // posts msg to the thread pool
+    void flush_() override;                               // posts a flush request to the pool
+    void backend_sink_it_(const details::log_msg &incoming_log_msg);  // writes msg to the sinks
+    void backend_flush_();                                            // flushes the sinks
+
+private:
+    std::weak_ptr<details::thread_pool> thread_pool_;  // non-owning; lock()ed on each use
+    async_overflow_policy overflow_policy_;            // handed to the pool with every post
+};
+}  // namespace spdlog
+
+#ifdef SPDLOG_HEADER_ONLY
+#include "async_logger-inl.h"
+#endif
--- a/csrc/idle_offload/include/spdlog/cfg/argv.h
+++ b/csrc/idle_offload/include/spdlog/cfg/argv.h
@@ -0,0 +1,40 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+#include <spdlog/cfg/helpers.h>
+#include <spdlog/details/registry.h>
+
+//
+// Init log levels using each argv entry that starts with "SPDLOG_LEVEL="
+//
+// set all loggers to debug level:
+// example.exe "SPDLOG_LEVEL=debug"
+
+// set logger1 to trace level
+// example.exe "SPDLOG_LEVEL=logger1=trace"
+
+// turn off all logging except for logger1 and logger2:
+// example.exe "SPDLOG_LEVEL=off,logger1=debug,logger2=info"
+
+namespace spdlog {
+namespace cfg {
+
+// scan argv[1..argc-1] for entries beginning with "SPDLOG_LEVEL=" and apply each one's levels
+inline void load_argv_levels(int argc, const char **argv) {
+    const std::string spdlog_level_prefix = "SPDLOG_LEVEL=";
+    for (int i = 1; i < argc; i++) {
+        const std::string arg = argv[i];
+        // anchored check: compare() looks only at the leading prefix-sized slice of arg
+        if (arg.compare(0, spdlog_level_prefix.size(), spdlog_level_prefix) == 0) {
+            helpers::load_levels(arg.substr(spdlog_level_prefix.size()));
+        }
+    }
+}
+
+inline void load_argv_levels(int argc, char **argv) {
+    load_argv_levels(argc, const_cast<const char **>(argv));  // forwards to the const overload
+}
+
+}  // namespace cfg
+}  // namespace spdlog
--- a/csrc/idle_offload/include/spdlog/cfg/env.h
+++ b/csrc/idle_offload/include/spdlog/cfg/env.h
@@ -0,0 +1,36 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+#include <spdlog/cfg/helpers.h>
+#include <spdlog/details/os.h>
+#include <spdlog/details/registry.h>
+
+//
+// Init levels and patterns from env variables SPDLOG_LEVEL
+// Inspired from Rust's "env_logger" crate (https://crates.io/crates/env_logger).
+// Note - unrecognized level names are ignored
+//
+// Examples:
+//
+// set global level to debug:
+// export SPDLOG_LEVEL=debug
+//
+// turn off all logging except for logger1:
+// export SPDLOG_LEVEL="off,logger1=debug"
+//
+
+// turn off all logging except for logger1 and logger2:
+// export SPDLOG_LEVEL="off,logger1=debug,logger2=info"
+
+namespace spdlog {
+namespace cfg {
+inline void load_env_levels(const char* var = "SPDLOG_LEVEL") {
+    // read the variable once; an empty result leaves all levels untouched
+    const auto levels_spec = details::os::getenv(var);
+    if (levels_spec.empty()) return;
+    helpers::load_levels(levels_spec);
+}
+
+}  // namespace cfg
+}  // namespace spdlog
--- a/csrc/idle_offload/include/spdlog/cfg/helpers-inl.h
+++ b/csrc/idle_offload/include/spdlog/cfg/helpers-inl.h
@@ -0,0 +1,106 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#ifndef SPDLOG_HEADER_ONLY
+#include <spdlog/cfg/helpers.h>
+#endif
+
+#include <spdlog/details/os.h>
+#include <spdlog/details/registry.h>
+
+#include <algorithm>
+#include <sstream>
+#include <string>
+#include <utility>
+
+namespace spdlog {
+namespace cfg {
+namespace helpers {
+
+// lowercase the ASCII letters 'A'..'Z' in place; returns the same string
+inline std::string &to_lower_(std::string &str) {
+    for (char &ch : str) {
+        if (ch >= 'A' && ch <= 'Z') ch = static_cast<char>(ch + ('a' - 'A'));
+    }
+    return str;
+}
+
+// strip leading and trailing " \n\r\t" in place; returns the same string
+inline std::string &trim_(std::string &str) {
+    const char *const blanks = " \n\r\t";
+    const auto first = str.find_first_not_of(blanks);
+    if (first == std::string::npos) return str.erase(0);  // empty, or nothing but blanks
+    return str.erase(str.find_last_not_of(blanks) + 1).erase(0, first);
+}
+
+// return (name,value) trimmed pair from the given "name = value" string.
+// return empty string on missing parts
+// "key=val" => ("key", "val")
+// " key  =  val " => ("key", "val")
+// "key=" => ("key", "")
+// "val" => ("", "val")
+
+inline std::pair<std::string, std::string> extract_kv_(char sep, const std::string &str) {
+    const auto sep_pos = str.find(sep);
+    const bool has_sep = sep_pos != std::string::npos;
+
+    // key: text before the first separator, or empty when there is none
+    std::string key = has_sep ? str.substr(0, sep_pos) : std::string{};
+    // value: text after the first separator, or the whole input when there is none
+    std::string value = has_sep ? str.substr(sep_pos + 1) : str;
+
+    return std::make_pair(trim_(key), trim_(value));
+}
+
+// return vector of key/value pairs from a sequence of "K1=V1,K2=V2,.."
+// "a=AAA,b=BBB,c=CCC,.." => {("a","AAA"),("b","BBB"),("c", "CCC"),...}
+inline std::unordered_map<std::string, std::string> extract_key_vals_(const std::string &str) {
+    std::unordered_map<std::string, std::string> pairs;
+    std::istringstream items(str);
+
+    // split on ','; empty items are skipped and a repeated key keeps its last value
+    for (std::string item; std::getline(items, item, ',');) {
+        if (!item.empty()) {
+            const auto kv = extract_kv_('=', item);
+            pairs[kv.first] = kv.second;
+        }
+    }
+    return pairs;
+}
+
+SPDLOG_INLINE void load_levels(const std::string &input) {
+    // nothing to do for empty input; inputs of 32768 bytes or more are rejected outright
+    if (input.empty() || input.size() >= 32768) {
+        return;
+    }
+
+    std::unordered_map<std::string, level::level_enum> per_logger;
+    level::level_enum default_level = level::info;
+    bool has_default = false;
+
+    auto parsed = extract_key_vals_(input);
+    for (auto &entry : parsed) {
+        const std::string &spec = to_lower_(entry.second);  // lowercased in place
+        const auto parsed_level = level::from_str(spec);
+        // from_str() reports unknown names as level::off, so only a literal "off" counts
+        const bool recognized = parsed_level != level::off || spec == "off";
+        if (!recognized) {
+            continue;
+        }
+        if (entry.first.empty()) {
+            // a bare level with no logger name applies globally
+            has_default = true;
+            default_level = parsed_level;
+        } else {
+            per_logger[entry.first] = parsed_level;
+        }
+    }
+    details::registry::instance().set_levels(std::move(per_logger),
+                                             has_default ? &default_level : nullptr);
+}
+
+}  // namespace helpers
+}  // namespace cfg
+}  // namespace spdlog
--- a/csrc/idle_offload/include/spdlog/cfg/helpers.h
+++ b/csrc/idle_offload/include/spdlog/cfg/helpers.h
@@ -0,0 +1,29 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#include <spdlog/common.h>
+#include <unordered_map>
+
+namespace spdlog {
+namespace cfg {
+namespace helpers {
+//
+// Init levels from given string
+//
+// Examples:
+//
+// set global level to debug: "debug"
+// turn off all logging except for logger1: "off,logger1=debug"
+// turn off all logging except for logger1 and logger2: "off,logger1=debug,logger2=info"
+//
+SPDLOG_API void load_levels(const std::string &txt);
+}  // namespace helpers
+
+}  // namespace cfg
+}  // namespace spdlog
+
+#ifdef SPDLOG_HEADER_ONLY
+#include "helpers-inl.h"
+#endif  // SPDLOG_HEADER_ONLY
--- a/csrc/idle_offload/include/spdlog/common-inl.h
+++ b/csrc/idle_offload/include/spdlog/common-inl.h
@@ -0,0 +1,68 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#ifndef SPDLOG_HEADER_ONLY
+#include <spdlog/common.h>
+#endif
+
+#include <algorithm>
+#include <iterator>
+
+namespace spdlog {
+namespace level {
+
+#if __cplusplus >= 201703L
+constexpr
+#endif
+    static string_view_t level_string_views[] SPDLOG_LEVEL_NAMES;  // indexed by level_enum
+
+static const char *short_level_names[] SPDLOG_SHORT_LEVEL_NAMES;  // indexed by level_enum
+
+SPDLOG_INLINE const string_view_t &to_string_view(spdlog::level::level_enum l) SPDLOG_NOEXCEPT {
+    return level_string_views[l];  // unchecked index: l must be a valid table position
+}
+
+SPDLOG_INLINE const char *to_short_c_str(spdlog::level::level_enum l) SPDLOG_NOEXCEPT {
+    return short_level_names[l];  // unchecked index: l must be a valid table position
+}
+
+SPDLOG_INLINE spdlog::level::level_enum from_str(const std::string &name) SPDLOG_NOEXCEPT {
+    const auto first = std::begin(level_string_views);
+    const auto last = std::end(level_string_views);
+    const auto match = std::find(first, last, name);
+    if (match != last) {
+        // the position in the table is the numeric value of the level
+        return static_cast<level::level_enum>(std::distance(first, match));
+    }
+    // not a full level name: accept the short forms "warn" and "err",
+    // and report everything else as off
+    if (name == "warn") return level::warn;
+    if (name == "err") return level::err;
+    return level::off;
+}
+}  // namespace level
+
+SPDLOG_INLINE spdlog_ex::spdlog_ex(std::string msg)
+    : msg_(std::move(msg)) {}  // msg is taken by value and moved into msg_
+
+SPDLOG_INLINE spdlog_ex::spdlog_ex(const std::string &msg, int last_errno) {
+#ifdef SPDLOG_USE_STD_FORMAT  // std path: the text comes from std::system_error::what()
+    msg_ = std::system_error(std::error_code(last_errno, std::generic_category()), msg).what();
+#else  // fmt path: format into a buffer, then copy the buffer out as a std::string
+    memory_buf_t outbuf;
+    fmt::format_system_error(outbuf, last_errno, msg.c_str());
+    msg_ = fmt::to_string(outbuf);
+#endif
+}
+
+SPDLOG_INLINE const char *spdlog_ex::what() const SPDLOG_NOEXCEPT { return msg_.c_str(); }  // points into msg_
+
+SPDLOG_INLINE void throw_spdlog_ex(const std::string &msg, int last_errno) {
+    SPDLOG_THROW(spdlog_ex(msg, last_errno));  // throws, or prints and aborts without exceptions
+}
+// message-only overload
+SPDLOG_INLINE void throw_spdlog_ex(std::string msg) { SPDLOG_THROW(spdlog_ex(std::move(msg))); }
+
+}  // namespace spdlog
--- a/csrc/idle_offload/include/spdlog/common.h
+++ b/csrc/idle_offload/include/spdlog/common.h
@@ -0,0 +1,406 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#include <spdlog/details/null_mutex.h>
+#include <spdlog/tweakme.h>
+
+#include <atomic>
+#include <chrono>
+#include <cstdio>
+#include <exception>
+#include <functional>
+#include <initializer_list>
+#include <memory>
+#include <string>
+#include <type_traits>
+
+#ifdef SPDLOG_USE_STD_FORMAT
+#include <version>
+#if __cpp_lib_format >= 202207L
+#include <format>
+#else
+#include <string_view>
+#endif
+#endif
+
+#ifdef SPDLOG_COMPILED_LIB
+#undef SPDLOG_HEADER_ONLY
+#if defined(SPDLOG_SHARED_LIB)
+#if defined(_WIN32)
+#ifdef spdlog_EXPORTS
+#define SPDLOG_API __declspec(dllexport)
+#else  // !spdlog_EXPORTS
+#define SPDLOG_API __declspec(dllimport)
+#endif
+#else  // !defined(_WIN32)
+#define SPDLOG_API __attribute__((visibility("default")))
+#endif
+#else  // !defined(SPDLOG_SHARED_LIB)
+#define SPDLOG_API
+#endif
+#define SPDLOG_INLINE
+#else  // !defined(SPDLOG_COMPILED_LIB)
+#define SPDLOG_API
+#define SPDLOG_HEADER_ONLY
+#define SPDLOG_INLINE inline
+#endif  // #ifdef SPDLOG_COMPILED_LIB
+
+#include <spdlog/fmt/fmt.h>
+
+#if !defined(SPDLOG_USE_STD_FORMAT) && \
+    FMT_VERSION >= 80000  // backward compatibility with fmt versions older than 8
+#define SPDLOG_FMT_RUNTIME(format_string) fmt::runtime(format_string)
+#define SPDLOG_FMT_STRING(format_string) FMT_STRING(format_string)
+#if defined(SPDLOG_WCHAR_FILENAMES) || defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT)
+#include <spdlog/fmt/xchar.h>
+#endif
+#else
+#define SPDLOG_FMT_RUNTIME(format_string) format_string
+#define SPDLOG_FMT_STRING(format_string) format_string
+#endif
+
+// visual studio up to 2013 does not support noexcept nor constexpr
+#if defined(_MSC_VER) && (_MSC_VER < 1900)
+#define SPDLOG_NOEXCEPT _NOEXCEPT
+#define SPDLOG_CONSTEXPR
+#else
+#define SPDLOG_NOEXCEPT noexcept
+#define SPDLOG_CONSTEXPR constexpr
+#endif
+
+// If building with std::format, can just use constexpr, otherwise if building with fmt
+// SPDLOG_CONSTEXPR_FUNC needs to be set the same as FMT_CONSTEXPR to avoid situations where
+// a constexpr function in spdlog could end up calling a non-constexpr function in fmt
+// depending on the compiler
+// If fmt determines it can't use constexpr, we should inline the function instead
+#ifdef SPDLOG_USE_STD_FORMAT
+#define SPDLOG_CONSTEXPR_FUNC constexpr
+#else  // Being built with fmt
+#if FMT_USE_CONSTEXPR
+#define SPDLOG_CONSTEXPR_FUNC FMT_CONSTEXPR
+#else
+#define SPDLOG_CONSTEXPR_FUNC inline
+#endif
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#define SPDLOG_DEPRECATED __attribute__((deprecated))
+#elif defined(_MSC_VER)
+#define SPDLOG_DEPRECATED __declspec(deprecated)
+#else
+#define SPDLOG_DEPRECATED
+#endif
+
+// disable thread local on msvc 2013
+#ifndef SPDLOG_NO_TLS
+#if (defined(_MSC_VER) && (_MSC_VER < 1900)) || defined(__cplusplus_winrt)
+#define SPDLOG_NO_TLS 1
+#endif
+#endif
+
+#ifndef SPDLOG_FUNCTION
+#define SPDLOG_FUNCTION static_cast<const char *>(__FUNCTION__)
+#endif
+
+#ifdef SPDLOG_NO_EXCEPTIONS
+#define SPDLOG_TRY
+#define SPDLOG_THROW(ex)                               \
+    do {                                               \
+        printf("spdlog fatal error: %s\n", ex.what()); \
+        std::abort();                                  \
+    } while (0)
+#define SPDLOG_CATCH_STD
+#else
+#define SPDLOG_TRY try
+#define SPDLOG_THROW(ex) throw(ex)
+#define SPDLOG_CATCH_STD             \
+    catch (const std::exception &) { \
+    }
+#endif
+
+namespace spdlog {
+
+class formatter;
+
+namespace sinks {
+class sink;
+}
+
+#if defined(_WIN32) && defined(SPDLOG_WCHAR_FILENAMES)
+using filename_t = std::wstring;
+// allow macro expansion to occur in SPDLOG_FILENAME_T
+#define SPDLOG_FILENAME_T_INNER(s) L##s
+#define SPDLOG_FILENAME_T(s) SPDLOG_FILENAME_T_INNER(s)
+#else
+using filename_t = std::string;
+#define SPDLOG_FILENAME_T(s) s
+#endif
+
+using log_clock = std::chrono::system_clock;
+using sink_ptr = std::shared_ptr<sinks::sink>;
+using sinks_init_list = std::initializer_list<sink_ptr>;
+using err_handler = std::function<void(const std::string &err_msg)>;
+#ifdef SPDLOG_USE_STD_FORMAT
+namespace fmt_lib = std;
+
+using string_view_t = std::string_view;
+using memory_buf_t = std::string;
+
+template <typename... Args>
+#if __cpp_lib_format >= 202207L
+using format_string_t = std::format_string<Args...>;
+#else
+using format_string_t = std::string_view;
+#endif
+
+template <class T, class Char = char>
+struct is_convertible_to_basic_format_string
+    : std::integral_constant<bool, std::is_convertible<T, std::basic_string_view<Char>>::value> {};
+
+#if defined(SPDLOG_WCHAR_FILENAMES) || defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT)
+using wstring_view_t = std::wstring_view;
+using wmemory_buf_t = std::wstring;
+
+template <typename... Args>
+#if __cpp_lib_format >= 202207L
+using wformat_string_t = std::wformat_string<Args...>;
+#else
+using wformat_string_t = std::wstring_view;
+#endif
+#endif
+#define SPDLOG_BUF_TO_STRING(x) x
+#else  // use fmt lib instead of std::format
+namespace fmt_lib = fmt;
+
+using string_view_t = fmt::basic_string_view<char>;
+using memory_buf_t = fmt::basic_memory_buffer<char, 250>;
+
+template <typename... Args>
+using format_string_t = fmt::format_string<Args...>;
+
+template <class T>
+using remove_cvref_t = typename std::remove_cv<typename std::remove_reference<T>::type>::type;
+
+template <typename Char>
+#if FMT_VERSION >= 90101
+using fmt_runtime_string = fmt::runtime_format_string<Char>;
+#else
+using fmt_runtime_string = fmt::basic_runtime<Char>;
+#endif
+
+// clang doesn't like SFINAE disabled constructor in std::is_convertible<> so have to repeat the
+// condition from basic_format_string here, in addition, fmt::basic_runtime<Char> is only
+// convertible to basic_format_string<Char> but not basic_string_view<Char>
+template <class T, class Char = char>
+struct is_convertible_to_basic_format_string
+    : std::integral_constant<bool,
+                             std::is_convertible<T, fmt::basic_string_view<Char>>::value ||
+                                 std::is_same<remove_cvref_t<T>, fmt_runtime_string<Char>>::value> {
+};
+
+#if defined(SPDLOG_WCHAR_FILENAMES) || defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT)
+using wstring_view_t = fmt::basic_string_view<wchar_t>;
+using wmemory_buf_t = fmt::basic_memory_buffer<wchar_t, 250>;
+
+template <typename... Args>
+using wformat_string_t = fmt::wformat_string<Args...>;
+#endif
+#define SPDLOG_BUF_TO_STRING(x) fmt::to_string(x)
+#endif
+
+#ifdef SPDLOG_WCHAR_TO_UTF8_SUPPORT
+#ifndef _WIN32
+#error SPDLOG_WCHAR_TO_UTF8_SUPPORT only supported on windows
+#endif  // _WIN32
+#endif  // SPDLOG_WCHAR_TO_UTF8_SUPPORT
+
+template <class T>
+struct is_convertible_to_any_format_string
+    : std::integral_constant<bool,
+                             is_convertible_to_basic_format_string<T, char>::value ||
+                                 is_convertible_to_basic_format_string<T, wchar_t>::value> {};
+
+#if defined(SPDLOG_NO_ATOMIC_LEVELS)
+using level_t = details::null_atomic_int;
+#else
+using level_t = std::atomic<int>;
+#endif
+
+#define SPDLOG_LEVEL_TRACE 0
+#define SPDLOG_LEVEL_DEBUG 1
+#define SPDLOG_LEVEL_INFO 2
+#define SPDLOG_LEVEL_WARN 3
+#define SPDLOG_LEVEL_ERROR 4
+#define SPDLOG_LEVEL_CRITICAL 5
+#define SPDLOG_LEVEL_OFF 6
+
+#if !defined(SPDLOG_ACTIVE_LEVEL)
+#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO
+#endif
+
+// Log level enum
+namespace level {
+enum level_enum : int {
+    trace = SPDLOG_LEVEL_TRACE,
+    debug = SPDLOG_LEVEL_DEBUG,
+    info = SPDLOG_LEVEL_INFO,
+    warn = SPDLOG_LEVEL_WARN,
+    err = SPDLOG_LEVEL_ERROR,
+    critical = SPDLOG_LEVEL_CRITICAL,
+    off = SPDLOG_LEVEL_OFF,
+    n_levels  // one past off, i.e. the number of levels; it has no SPDLOG_LEVEL_* macro
+};
+
+#define SPDLOG_LEVEL_NAME_TRACE spdlog::string_view_t("trace", 5)
+#define SPDLOG_LEVEL_NAME_DEBUG spdlog::string_view_t("debug", 5)
+#define SPDLOG_LEVEL_NAME_INFO spdlog::string_view_t("info", 4)
+#define SPDLOG_LEVEL_NAME_WARNING spdlog::string_view_t("warning", 7)
+#define SPDLOG_LEVEL_NAME_ERROR spdlog::string_view_t("error", 5)
+#define SPDLOG_LEVEL_NAME_CRITICAL spdlog::string_view_t("critical", 8)
+#define SPDLOG_LEVEL_NAME_OFF spdlog::string_view_t("off", 3)
+
+#if !defined(SPDLOG_LEVEL_NAMES)
+#define SPDLOG_LEVEL_NAMES                                                                  \
+    {                                                                                       \
+        SPDLOG_LEVEL_NAME_TRACE, SPDLOG_LEVEL_NAME_DEBUG, SPDLOG_LEVEL_NAME_INFO,           \
+            SPDLOG_LEVEL_NAME_WARNING, SPDLOG_LEVEL_NAME_ERROR, SPDLOG_LEVEL_NAME_CRITICAL, \
+            SPDLOG_LEVEL_NAME_OFF                                                           \
+    }
+#endif
+
+#if !defined(SPDLOG_SHORT_LEVEL_NAMES)
+
+#define SPDLOG_SHORT_LEVEL_NAMES \
+    { "T", "D", "I", "W", "E", "C", "O" }
+#endif
+
+SPDLOG_API const string_view_t &to_string_view(spdlog::level::level_enum l) SPDLOG_NOEXCEPT;
+SPDLOG_API const char *to_short_c_str(spdlog::level::level_enum l) SPDLOG_NOEXCEPT;
+SPDLOG_API spdlog::level::level_enum from_str(const std::string &name) SPDLOG_NOEXCEPT;
+
+}  // namespace level
+
+//
+// Color mode used by sinks with color support.
+//
+enum class color_mode { always, automatic, never };
+
+//
+// Pattern time - specific time getting to use for pattern_formatter.
+// local time by default
+//
+enum class pattern_time_type {
+    local,  // log localtime
+    utc     // log utc
+};
+
+//
+// Log exception
+//
+class SPDLOG_API spdlog_ex : public std::exception {
+public:
+    explicit spdlog_ex(std::string msg);                // message only
+    spdlog_ex(const std::string &msg, int last_errno);  // message combined with an errno value
+    const char *what() const SPDLOG_NOEXCEPT override;
+
+private:
+    std::string msg_;  // the text handed back by what()
+};
+
+[[noreturn]] SPDLOG_API void throw_spdlog_ex(const std::string &msg, int last_errno);
+[[noreturn]] SPDLOG_API void throw_spdlog_ex(std::string msg);
+
+struct source_loc {
+    SPDLOG_CONSTEXPR source_loc() = default;  // yields an empty location (line == 0)
+    SPDLOG_CONSTEXPR source_loc(const char *filename_in, int line_in, const char *funcname_in)
+        : filename{filename_in},
+          line{line_in},
+          funcname{funcname_in} {}
+
+    SPDLOG_CONSTEXPR bool empty() const SPDLOG_NOEXCEPT { return line <= 0; }
+    const char *filename{nullptr};  // raw pointer: the struct does not copy the string
+    int line{0};
+    const char *funcname{nullptr};  // raw pointer: the struct does not copy the string
+};
+
+struct file_event_handlers {
+    file_event_handlers()
+        : before_open(nullptr),
+          after_open(nullptr),
+          before_close(nullptr),
+          after_close(nullptr) {}
+    // all four callbacks start out empty (nullptr)
+    std::function<void(const filename_t &filename)> before_open;
+    std::function<void(const filename_t &filename, std::FILE *file_stream)> after_open;
+    std::function<void(const filename_t &filename, std::FILE *file_stream)> before_close;
+    std::function<void(const filename_t &filename)> after_close;
+};
+
+namespace details {
+
+// to_string_view overloads: view a memory buffer, or pass an existing view straight through
+
+SPDLOG_CONSTEXPR_FUNC spdlog::string_view_t to_string_view(const memory_buf_t &buf)
+    SPDLOG_NOEXCEPT {
+    return spdlog::string_view_t{buf.data(), buf.size()};
+}
+
+SPDLOG_CONSTEXPR_FUNC spdlog::string_view_t to_string_view(spdlog::string_view_t str)
+    SPDLOG_NOEXCEPT {
+    return str;
+}
+
+#if defined(SPDLOG_WCHAR_FILENAMES) || defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT)
+SPDLOG_CONSTEXPR_FUNC spdlog::wstring_view_t to_string_view(const wmemory_buf_t &buf)
+    SPDLOG_NOEXCEPT {
+    return spdlog::wstring_view_t{buf.data(), buf.size()};  // wide-character buffer overload
+}
+
+SPDLOG_CONSTEXPR_FUNC spdlog::wstring_view_t to_string_view(spdlog::wstring_view_t str)
+    SPDLOG_NOEXCEPT {
+    return str;  // wide-character pass-through overload
+}
+#endif
+
+#if defined(SPDLOG_USE_STD_FORMAT) && __cpp_lib_format >= 202207L
+template <typename T, typename... Args>
+SPDLOG_CONSTEXPR_FUNC std::basic_string_view<T> to_string_view(
+    std::basic_format_string<T, Args...> fmt) SPDLOG_NOEXCEPT {
+    return fmt.get();  // the view wrapped by the std format-string object
+}
+#endif
+
+// make_unique / enable_if_t: taken from std on C++14 and later, provided here for C++11
+#if __cplusplus >= 201402L  // C++14 and beyond
+using std::enable_if_t;
+using std::make_unique;
+#else
+template <bool B, class T = void>
+using enable_if_t = typename std::enable_if<B, T>::type;
+
+template <typename T, typename... Args>
+std::unique_ptr<T> make_unique(Args &&...args) {
+    static_assert(!std::is_array<T>::value, "arrays not supported");
+    return std::unique_ptr<T>(new T(std::forward<Args>(args)...));  // single-object form only
+}
+#endif
+
+// to avoid useless casts (see https://github.com/nlohmann/json/issues/2893#issuecomment-889152324)
+template <typename T, typename U, enable_if_t<!std::is_same<T, U>::value, int> = 0>
+constexpr T conditional_static_cast(U value) {
+    return static_cast<T>(value);  // selected when T and U differ
+}
+
+template <typename T, typename U, enable_if_t<std::is_same<T, U>::value, int> = 0>
+constexpr T conditional_static_cast(U value) {
+    return value;  // selected when T and U are the same type: no cast at all
+}
+
+}  // namespace details
+}  // namespace spdlog
+
+#ifdef SPDLOG_HEADER_ONLY
+#include "common-inl.h"
+#endif
--- a/csrc/idle_offload/include/spdlog/details/backtracer-inl.h
+++ b/csrc/idle_offload/include/spdlog/details/backtracer-inl.h
@@ -0,0 +1,63 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#ifndef SPDLOG_HEADER_ONLY
+#include <spdlog/details/backtracer.h>
+#endif
+namespace spdlog {
+namespace details {
+SPDLOG_INLINE backtracer::backtracer(const backtracer &other) {
+    std::lock_guard<std::mutex> lock(other.mutex_);  // hold the source's mutex while reading it
+    enabled_ = other.enabled();
+    messages_ = other.messages_;
+}
+
+SPDLOG_INLINE backtracer::backtracer(backtracer &&other) SPDLOG_NOEXCEPT {
+    std::lock_guard<std::mutex> lock(other.mutex_);  // hold the source's mutex while moving out
+    enabled_ = other.enabled();  // the source's enabled_ flag is read, not reset
+    messages_ = std::move(other.messages_);
+}
+
+SPDLOG_INLINE backtracer &backtracer::operator=(backtracer other) {
+    std::lock_guard<std::mutex> lock(mutex_);  // other is a by-value local: only this mutex is taken
+    enabled_ = other.enabled();
+    messages_ = std::move(other.messages_);
+    return *this;
+}
+
+SPDLOG_INLINE void backtracer::enable(size_t size) {
+    std::lock_guard<std::mutex> lock{mutex_};
+    enabled_.store(true, std::memory_order_relaxed);
+    messages_ = circular_q<log_msg_buffer>{size};  // fresh queue: earlier messages are dropped
+}
+
+SPDLOG_INLINE void backtracer::disable() {
+    std::lock_guard<std::mutex> lock{mutex_};
+    enabled_.store(false, std::memory_order_relaxed);  // only the flag changes; messages_ is kept
+}
+
+SPDLOG_INLINE bool backtracer::enabled() const { return enabled_.load(std::memory_order_relaxed); }  // no lock
+
+SPDLOG_INLINE void backtracer::push_back(const log_msg &msg) {
+    std::lock_guard<std::mutex> lock{mutex_};
+    messages_.push_back(log_msg_buffer{msg});  // stores a log_msg_buffer built from msg
+}
+
+SPDLOG_INLINE bool backtracer::empty() const {
+    std::lock_guard<std::mutex> lock{mutex_};  // possible in a const member: mutex_ is mutable
+    return messages_.empty();
+}
+
+// pop all items in the q and apply the given fun on each of them.
+SPDLOG_INLINE void backtracer::foreach_pop(std::function<void(const details::log_msg &)> fun) {
+    // the mutex stays held for the whole drain, including every call to fun
+    std::lock_guard<std::mutex> guard{mutex_};
+    for (; !messages_.empty(); messages_.pop_front()) {
+        // front item first; it is removed only after fun returns
+        fun(messages_.front());
+    }
+}
+}  // namespace details
+}  // namespace spdlog
--- a/csrc/idle_offload/include/spdlog/details/backtracer.h
+++ b/csrc/idle_offload/include/spdlog/details/backtracer.h
@@ -0,0 +1,45 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#include <spdlog/details/circular_q.h>
+#include <spdlog/details/log_msg_buffer.h>
+
+#include <atomic>
+#include <functional>
+#include <mutex>
+
+// Store log messages in circular buffer.
+// Useful for storing debug data in case of error/warning happens.
+
+namespace spdlog {
+namespace details {
+class SPDLOG_API backtracer {
+    mutable std::mutex mutex_;  // mutable so const members such as empty() can lock it
+    std::atomic<bool> enabled_{false};
+    circular_q<log_msg_buffer> messages_;  // default-constructed (disabled) until enable()
+
+public:
+    backtracer() = default;
+    backtracer(const backtracer &other);
+
+    backtracer(backtracer &&other) SPDLOG_NOEXCEPT;
+    backtracer &operator=(backtracer other);  // by value: serves copy and move assignment
+
+    void enable(size_t size);  // replaces the stored queue with a new one built from size
+    void disable();
+    bool enabled() const;
+    void push_back(const log_msg &msg);
+    bool empty() const;
+
+    // pop all items in the q and apply the given fun on each of them.
+    void foreach_pop(std::function<void(const details::log_msg &)> fun);
+};
+
+}  // namespace details
+}  // namespace spdlog
+
+#ifdef SPDLOG_HEADER_ONLY
+#include "backtracer-inl.h"
+#endif
--- a/csrc/idle_offload/include/spdlog/details/circular_q.h
+++ b/csrc/idle_offload/include/spdlog/details/circular_q.h
@@ -0,0 +1,115 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+// circular q view of std::vector.
+#pragma once
+
+#include <cassert>
+#include <vector>
+
+#include "spdlog/common.h"
+
+namespace spdlog {
+namespace details {
+template <typename T>
+class circular_q {
+    size_t max_items_ = 0;  // requested capacity + 1, or 0 for a disabled queue
+    typename std::vector<T>::size_type head_ = 0;  // index of the front item
+    typename std::vector<T>::size_type tail_ = 0;  // index the next push_back writes to
+    size_t overrun_counter_ = 0;  // how many times push_back overran the front item
+    std::vector<T> v_;
+
+public:
+    using value_type = T;
+
+    // empty ctor - create a disabled queue with no elements allocated at all
+    circular_q() = default;
+
+    explicit circular_q(size_t max_items)
+        : max_items_(max_items + 1)  // one item is reserved as marker for full q
+          ,
+          v_(max_items_) {}
+
+    circular_q(const circular_q &) = default;
+    circular_q &operator=(const circular_q &) = default;
+
+    // move cannot be default,
+    // since we need to reset head_, tail_, etc to zero in the moved object
+    circular_q(circular_q &&other) SPDLOG_NOEXCEPT { copy_moveable(std::move(other)); }
+
+    circular_q &operator=(circular_q &&other) SPDLOG_NOEXCEPT {
+        copy_moveable(std::move(other));
+        return *this;
+    }
+
+    // push back, overrun (oldest) item if no room left; a no-op on a disabled queue
+    void push_back(T &&item) {
+        if (max_items_ > 0) {
+            v_[tail_] = std::move(item);
+            tail_ = (tail_ + 1) % max_items_;
+
+            if (tail_ == head_)  // overrun last item if full
+            {
+                head_ = (head_ + 1) % max_items_;
+                ++overrun_counter_;
+            }
+        }
+    }
+
+    // Return reference to the front item.
+    // If there are no elements in the container, the behavior is undefined.
+    const T &front() const { return v_[head_]; }
+
+    T &front() { return v_[head_]; }
+
+    // Return number of elements actually stored
+    size_t size() const {
+        if (tail_ >= head_) {
+            return tail_ - head_;
+        } else {
+            return max_items_ - (head_ - tail_);
+        }
+    }
+
+    // Return const reference to item by index.
+    // If index is out of range [0, size() - 1], the behavior is undefined.
+    const T &at(size_t i) const {
+        assert(i < size());
+        return v_[(head_ + i) % max_items_];
+    }
+
+    // Pop item from front. Also undefined on a disabled queue (modulo by max_items_ == 0).
+    // If there are no elements in the container, the behavior is undefined.
+    void pop_front() { head_ = (head_ + 1) % max_items_; }
+
+    bool empty() const { return tail_ == head_; }
+
+    bool full() const {
+        // head is ahead of the tail by 1
+        if (max_items_ > 0) {
+            return ((tail_ + 1) % max_items_) == head_;
+        }
+        return false;
+    }
+
+    size_t overrun_counter() const { return overrun_counter_; }
+
+    void reset_overrun_counter() { overrun_counter_ = 0; }
+
+private:
+    // copy from other&& and reset it to disabled state
+    void copy_moveable(circular_q &&other) SPDLOG_NOEXCEPT {
+        max_items_ = other.max_items_;
+        head_ = other.head_;
+        tail_ = other.tail_;
+        overrun_counter_ = other.overrun_counter_;
+        v_ = std::move(other.v_);
+
+        // put &&other in disabled, but valid state
+        other.max_items_ = 0;
+        other.head_ = other.tail_ = 0;
+        other.overrun_counter_ = 0;
+    }
+};
+}  // namespace details
+}  // namespace spdlog
--- a/csrc/idle_offload/include/spdlog/details/console_globals.h
+++ b/csrc/idle_offload/include/spdlog/details/console_globals.h
@@ -0,0 +1,28 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#include <mutex>
+#include <spdlog/details/null_mutex.h>
+
+namespace spdlog {
+namespace details {
+
+struct console_mutex {
+    using mutex_t = std::mutex;
+    static mutex_t &mutex() {
+        static mutex_t s_mutex;
+        return s_mutex;
+    }
+};
+
+struct console_nullmutex {
+    using mutex_t = null_mutex;
+    static mutex_t &mutex() {
+        static mutex_t s_mutex;
+        return s_mutex;
+    }
+};
+}  // namespace details
+}  // namespace spdlog
--- a/csrc/idle_offload/include/spdlog/details/file_helper-inl.h
+++ b/csrc/idle_offload/include/spdlog/details/file_helper-inl.h
@@ -0,0 +1,151 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#ifndef SPDLOG_HEADER_ONLY
+#include <spdlog/details/file_helper.h>
+#endif
+
+#include <spdlog/common.h>
+#include <spdlog/details/os.h>
+
+#include <cerrno>
+#include <cstdio>
+#include <string>
+#include <tuple>
+
+namespace spdlog {
+namespace details {
+
+SPDLOG_INLINE file_helper::file_helper(const file_event_handlers &event_handlers)
+    : event_handlers_(event_handlers) {}
+
+SPDLOG_INLINE file_helper::~file_helper() { close(); }
+
+SPDLOG_INLINE void file_helper::open(const filename_t &fname, bool truncate) {
+    close();  // release any file this helper is already holding
+    filename_ = fname;
+
+    auto *mode = SPDLOG_FILENAME_T("ab");
+    auto *trunc_mode = SPDLOG_FILENAME_T("wb");
+
+    if (event_handlers_.before_open) {
+        event_handlers_.before_open(filename_);
+    }
+    for (int tries = 0; tries < open_tries_; ++tries) {
+        // create the containing folder if it does not exist already.
+        os::create_dir(os::dir_name(fname));
+        if (truncate) {
+            // Truncate by opening-and-closing a tmp file in "wb" mode, always
+            // opening the actual log-we-write-to in "ab" mode, since that
+            // interacts more politely with external processes that might
+            // rotate/truncate the file underneath us.
+            std::FILE *tmp;
+            if (os::fopen_s(&tmp, fname, trunc_mode)) {
+                continue;  // truncation failed: counts as a try, and the sleep below is skipped
+            }
+            std::fclose(tmp);
+        }
+        if (!os::fopen_s(&fd_, fname, mode)) {
+            if (event_handlers_.after_open) {
+                event_handlers_.after_open(filename_, fd_);
+            }
+            return;  // opened successfully
+        }
+
+        details::os::sleep_for_millis(open_interval_);  // wait before the next attempt
+    }
+
+    throw_spdlog_ex("Failed opening file " + os::filename_to_str(filename_) + " for writing",
+                    errno);
+}
+
+SPDLOG_INLINE void file_helper::reopen(bool truncate) {
+    if (filename_.empty()) {
+        throw_spdlog_ex("Failed re opening file - was not opened before");
+    }
+    this->open(filename_, truncate);
+}
+
+SPDLOG_INLINE void file_helper::flush() {
+    if (std::fflush(fd_) != 0) {
+        throw_spdlog_ex("Failed flush to file " + os::filename_to_str(filename_), errno);
+    }
+}
+
+SPDLOG_INLINE void file_helper::sync() {
+    if (!os::fsync(fd_)) {
+        throw_spdlog_ex("Failed to fsync file " + os::filename_to_str(filename_), errno);
+    }
+}
+
+SPDLOG_INLINE void file_helper::close() {
+    if (fd_ != nullptr) {
+        if (event_handlers_.before_close) {
+            event_handlers_.before_close(filename_, fd_);
+        }
+
+        std::fclose(fd_);
+        fd_ = nullptr;
+
+        if (event_handlers_.after_close) {
+            event_handlers_.after_close(filename_);
+        }
+    }
+}
+
+SPDLOG_INLINE void file_helper::write(const memory_buf_t &buf) {
+    if (fd_ == nullptr) return;
+    size_t msg_size = buf.size();
+    auto data = buf.data();
+
+    if (!details::os::fwrite_bytes(data, msg_size, fd_)) {
+        throw_spdlog_ex("Failed writing to file " + os::filename_to_str(filename_), errno);
+    }
+}
+
+SPDLOG_INLINE size_t file_helper::size() const {
+    if (fd_ == nullptr) {
+        throw_spdlog_ex("Cannot use size() on closed file " + os::filename_to_str(filename_));
+    }
+    return os::filesize(fd_);
+}
+
+SPDLOG_INLINE const filename_t &file_helper::filename() const { return filename_; }
+
+//
+// return file path and its extension:
+//
+// "mylog.txt" => ("mylog", ".txt")
+// "mylog" => ("mylog", "")
+// "mylog." => ("mylog.", "")
+// "/dir1/dir2/mylog.txt" => ("/dir1/dir2/mylog", ".txt")
+//
+// the starting dot in filenames is ignored (hidden files):
+//
+// ".mylog" => (".mylog", "")
+// "my_folder/.mylog" => ("my_folder/.mylog", "")
+// "my_folder/.mylog.txt" => ("my_folder/.mylog", ".txt")
+SPDLOG_INLINE std::tuple<filename_t, filename_t> file_helper::split_by_extension(
+    const filename_t &fname) {
+    auto ext_index = fname.rfind('.');
+
+    // no valid extension found - return whole path and empty string as
+    // extension
+    if (ext_index == filename_t::npos || ext_index == 0 || ext_index == fname.size() - 1) {
+        return std::make_tuple(fname, filename_t());
+    }
+
+    // treat cases like "/etc/rc.d/somelogfile" or "/abc/.hiddenfile"
+    auto folder_index = fname.find_last_of(details::os::folder_seps_filename);
+    if (folder_index != filename_t::npos && folder_index >= ext_index - 1) {
+        return std::make_tuple(fname, filename_t());
+    }
+
+    // finally - return a valid base and extension tuple
+    return std::make_tuple(fname.substr(0, ext_index), fname.substr(ext_index));
+}
+
+}  // namespace details
+}  // namespace spdlog
--- a/csrc/idle_offload/include/spdlog/details/file_helper.h
+++ b/csrc/idle_offload/include/spdlog/details/file_helper.h
@@ -0,0 +1,61 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#include <spdlog/common.h>
+#include <tuple>
+
+namespace spdlog {
+namespace details {
+
+// Helper class for file sinks.
+// When failing to open a file, retry several times(5) with a delay interval(10 ms).
+// Throw spdlog_ex exception on errors.
+
+class SPDLOG_API file_helper {
+public:
+    file_helper() = default;
+    explicit file_helper(const file_event_handlers &event_handlers);
+
+    file_helper(const file_helper &) = delete;
+    file_helper &operator=(const file_helper &) = delete;
+    ~file_helper();
+
+    void open(const filename_t &fname, bool truncate = false);
+    void reopen(bool truncate);
+    void flush();
+    void sync();
+    void close();
+    void write(const memory_buf_t &buf);
+    size_t size() const;
+    const filename_t &filename() const;
+
+    //
+    // return file path and its extension:
+    //
+    // "mylog.txt" => ("mylog", ".txt")
+    // "mylog" => ("mylog", "")
+    // "mylog." => ("mylog.", "")
+    // "/dir1/dir2/mylog.txt" => ("/dir1/dir2/mylog", ".txt")
+    //
+    // the starting dot in filenames is ignored (hidden files):
+    //
+    // ".mylog" => (".mylog", "")
+    // "my_folder/.mylog" => ("my_folder/.mylog", "")
+    // "my_folder/.mylog.txt" => ("my_folder/.mylog", ".txt")
+    static std::tuple<filename_t, filename_t> split_by_extension(const filename_t &fname);
+
+private:
+    const int open_tries_ = 5;
+    const unsigned int open_interval_ = 10;
+    std::FILE *fd_{nullptr};
+    filename_t filename_;
+    file_event_handlers event_handlers_;
+};
+}  // namespace details
+}  // namespace spdlog
+
+#ifdef SPDLOG_HEADER_ONLY
+#include "file_helper-inl.h"
+#endif
--- a/csrc/idle_offload/include/spdlog/details/fmt_helper.h
+++ b/csrc/idle_offload/include/spdlog/details/fmt_helper.h
@@ -0,0 +1,141 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+#pragma once
+
+#include <chrono>
+#include <iterator>
+#include <spdlog/common.h>
+#include <spdlog/fmt/fmt.h>
+#include <type_traits>
+
+#ifdef SPDLOG_USE_STD_FORMAT
+#include <charconv>
+#include <limits>
+#endif
+
+// Some fmt helpers to efficiently format and pad ints and strings
+namespace spdlog {
+namespace details {
+namespace fmt_helper {
+
+inline void append_string_view(spdlog::string_view_t view, memory_buf_t &dest) {
+    auto *buf_ptr = view.data();
+    dest.append(buf_ptr, buf_ptr + view.size());
+}
+
+#ifdef SPDLOG_USE_STD_FORMAT
+template <typename T>
+inline void append_int(T n, memory_buf_t &dest) {
+    // Buffer should be large enough to hold all digits (digits10 + 1) and a sign
+    SPDLOG_CONSTEXPR const auto BUF_SIZE = std::numeric_limits<T>::digits10 + 2;
+    char buf[BUF_SIZE];
+
+    auto [ptr, ec] = std::to_chars(buf, buf + BUF_SIZE, n, 10);
+    if (ec == std::errc()) {
+        dest.append(buf, ptr);
+    } else {
+        throw_spdlog_ex("Failed to format int", static_cast<int>(ec));
+    }
+}
+#else
+template <typename T>
+inline void append_int(T n, memory_buf_t &dest) {
+    fmt::format_int i(n);
+    dest.append(i.data(), i.data() + i.size());
+}
+#endif
+
+template <typename T>
+SPDLOG_CONSTEXPR_FUNC unsigned int count_digits_fallback(T n) {
+    // taken from fmt: https://github.com/fmtlib/fmt/blob/8.0.1/include/fmt/format.h#L899-L912
+    unsigned int count = 1;
+    for (;;) {
+        // Integer division is slow so do it for a group of four digits instead
+        // of for every digit. The idea comes from the talk by Alexandrescu
+        // "Three Optimization Tips for C++". See speed-test for a comparison.
+        if (n < 10) return count;
+        if (n < 100) return count + 1;
+        if (n < 1000) return count + 2;
+        if (n < 10000) return count + 3;
+        n /= 10000u;
+        count += 4;
+    }
+}
+
+template <typename T>
+inline unsigned int count_digits(T n) {
+    using count_type =
+        typename std::conditional<(sizeof(T) > sizeof(uint32_t)), uint64_t, uint32_t>::type;
+#ifdef SPDLOG_USE_STD_FORMAT
+    return count_digits_fallback(static_cast<count_type>(n));
+#else
+    return static_cast<unsigned int>(fmt::
+// fmt 7.0.0 renamed the internal namespace to detail.
+// See: https://github.com/fmtlib/fmt/issues/1538
+#if FMT_VERSION < 70000
+                                         internal
+#else
+                                         detail
+#endif
+                                     ::count_digits(static_cast<count_type>(n)));
+#endif
+}
+
+inline void pad2(int n, memory_buf_t &dest) {
+    if (n >= 0 && n < 100)  // 0-99
+    {
+        dest.push_back(static_cast<char>('0' + n / 10));
+        dest.push_back(static_cast<char>('0' + n % 10));
+    } else  // unlikely, but just in case, let fmt deal with it
+    {
+        fmt_lib::format_to(std::back_inserter(dest), SPDLOG_FMT_STRING("{:02}"), n);
+    }
+}
+
+template <typename T>
+inline void pad_uint(T n, unsigned int width, memory_buf_t &dest) {
+    static_assert(std::is_unsigned<T>::value, "pad_uint must get unsigned T");
+    for (auto digits = count_digits(n); digits < width; digits++) {
+        dest.push_back('0');
+    }
+    append_int(n, dest);
+}
+
+template <typename T>
+inline void pad3(T n, memory_buf_t &dest) {
+    static_assert(std::is_unsigned<T>::value, "pad3 must get unsigned T");
+    if (n < 1000) {
+        dest.push_back(static_cast<char>(n / 100 + '0'));
+        n = n % 100;
+        dest.push_back(static_cast<char>((n / 10) + '0'));
+        dest.push_back(static_cast<char>((n % 10) + '0'));
+    } else {
+        append_int(n, dest);
+    }
+}
+
+template <typename T>
+inline void pad6(T n, memory_buf_t &dest) {
+    pad_uint(n, 6, dest);
+}
+
+template <typename T>
+inline void pad9(T n, memory_buf_t &dest) {
+    pad_uint(n, 9, dest);
+}
+
+// return fraction of a second of the given time_point.
+// e.g.
+// time_fraction<std::chrono::milliseconds>(tp) -> will return the millis part of the second
+template <typename ToDuration>
+inline ToDuration time_fraction(log_clock::time_point tp) {
+    using std::chrono::duration_cast;
+    using std::chrono::seconds;
+    auto duration = tp.time_since_epoch();
+    auto secs = duration_cast<seconds>(duration);
+    return duration_cast<ToDuration>(duration) - duration_cast<ToDuration>(secs);
+}
+
+}  // namespace fmt_helper
+}  // namespace details
+}  // namespace spdlog
--- a/csrc/idle_offload/include/spdlog/details/log_msg-inl.h
+++ b/csrc/idle_offload/include/spdlog/details/log_msg-inl.h
@@ -0,0 +1,44 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#ifndef SPDLOG_HEADER_ONLY
+#include <spdlog/details/log_msg.h>
+#endif
+
+#include <spdlog/details/os.h>
+
+namespace spdlog {
+namespace details {
+
+SPDLOG_INLINE log_msg::log_msg(spdlog::log_clock::time_point log_time,
+                               spdlog::source_loc loc,
+                               string_view_t a_logger_name,
+                               spdlog::level::level_enum lvl,
+                               spdlog::string_view_t msg)
+    : logger_name(a_logger_name),
+      level(lvl),
+      time(log_time)
+#ifndef SPDLOG_NO_THREAD_ID
+      ,
+      thread_id(os::thread_id())
+#endif
+      ,
+      source(loc),
+      payload(msg) {
+}
+
+SPDLOG_INLINE log_msg::log_msg(spdlog::source_loc loc,
+                               string_view_t a_logger_name,
+                               spdlog::level::level_enum lvl,
+                               spdlog::string_view_t msg)
+    : log_msg(os::now(), loc, a_logger_name, lvl, msg) {}
+
+SPDLOG_INLINE log_msg::log_msg(string_view_t a_logger_name,
+                               spdlog::level::level_enum lvl,
+                               spdlog::string_view_t msg)
+    : log_msg(os::now(), source_loc{}, a_logger_name, lvl, msg) {}
+
+}  // namespace details
+}  // namespace spdlog
--- a/csrc/idle_offload/include/spdlog/details/log_msg.h
+++ b/csrc/idle_offload/include/spdlog/details/log_msg.h
@@ -0,0 +1,40 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#include <spdlog/common.h>
+#include <string>
+
+namespace spdlog {
+namespace details {
+struct SPDLOG_API log_msg {
+    log_msg() = default;
+    log_msg(log_clock::time_point log_time,
+            source_loc loc,
+            string_view_t logger_name,
+            level::level_enum lvl,
+            string_view_t msg);
+    log_msg(source_loc loc, string_view_t logger_name, level::level_enum lvl, string_view_t msg);
+    log_msg(string_view_t logger_name, level::level_enum lvl, string_view_t msg);
+    log_msg(const log_msg &other) = default;
+    log_msg &operator=(const log_msg &other) = default;
+
+    string_view_t logger_name;
+    level::level_enum level{level::off};
+    log_clock::time_point time;
+    size_t thread_id{0};
+
+    // wrapping the formatted text with color (updated by pattern_formatter).
+    mutable size_t color_range_start{0};
+    mutable size_t color_range_end{0};
+
+    source_loc source;
+    string_view_t payload;
+};
+}  // namespace details
+}  // namespace spdlog
+
+#ifdef SPDLOG_HEADER_ONLY
+#include "log_msg-inl.h"
+#endif
--- a/csrc/idle_offload/include/spdlog/details/log_msg_buffer-inl.h
+++ b/csrc/idle_offload/include/spdlog/details/log_msg_buffer-inl.h
@@ -0,0 +1,54 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#ifndef SPDLOG_HEADER_ONLY
+#include <spdlog/details/log_msg_buffer.h>
+#endif
+
+namespace spdlog {
+namespace details {
+
+SPDLOG_INLINE log_msg_buffer::log_msg_buffer(const log_msg &orig_msg)
+    : log_msg{orig_msg} {
+    buffer.append(logger_name.begin(), logger_name.end());
+    buffer.append(payload.begin(), payload.end());
+    update_string_views();
+}
+
+SPDLOG_INLINE log_msg_buffer::log_msg_buffer(const log_msg_buffer &other)
+    : log_msg{other} {
+    buffer.append(logger_name.begin(), logger_name.end());
+    buffer.append(payload.begin(), payload.end());
+    update_string_views();
+}
+
+SPDLOG_INLINE log_msg_buffer::log_msg_buffer(log_msg_buffer &&other) SPDLOG_NOEXCEPT
+    : log_msg{other},
+      buffer{std::move(other.buffer)} {
+    update_string_views();
+}
+
+SPDLOG_INLINE log_msg_buffer &log_msg_buffer::operator=(const log_msg_buffer &other) {
+    log_msg::operator=(other);  // copies the base fields; the views still point into other's buffer
+    buffer.clear();  // NOTE(review): no self-assignment guard - if &other == this, this empties the source too
+    buffer.append(other.buffer.data(), other.buffer.data() + other.buffer.size());
+    update_string_views();  // re-point logger_name/payload at our own copy of the bytes
+    return *this;
+}
+
+SPDLOG_INLINE log_msg_buffer &log_msg_buffer::operator=(log_msg_buffer &&other) SPDLOG_NOEXCEPT {
+    log_msg::operator=(other);
+    buffer = std::move(other.buffer);
+    update_string_views();
+    return *this;
+}
+
+SPDLOG_INLINE void log_msg_buffer::update_string_views() {
+    logger_name = string_view_t{buffer.data(), logger_name.size()};
+    payload = string_view_t{buffer.data() + logger_name.size(), payload.size()};
+}
+
+}  // namespace details
+}  // namespace spdlog
--- a/csrc/idle_offload/include/spdlog/details/log_msg_buffer.h
+++ b/csrc/idle_offload/include/spdlog/details/log_msg_buffer.h
@@ -0,0 +1,32 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#include <spdlog/details/log_msg.h>
+
+namespace spdlog {
+namespace details {
+
+// Extend log_msg with internal buffer to store its payload.
+// This is needed since log_msg holds string_views that point to stack data.
+
+class SPDLOG_API log_msg_buffer : public log_msg {
+    memory_buf_t buffer;
+    void update_string_views();
+
+public:
+    log_msg_buffer() = default;
+    explicit log_msg_buffer(const log_msg &orig_msg);
+    log_msg_buffer(const log_msg_buffer &other);
+    log_msg_buffer(log_msg_buffer &&other) SPDLOG_NOEXCEPT;
+    log_msg_buffer &operator=(const log_msg_buffer &other);
+    log_msg_buffer &operator=(log_msg_buffer &&other) SPDLOG_NOEXCEPT;
+};
+
+}  // namespace details
+}  // namespace spdlog
+
+#ifdef SPDLOG_HEADER_ONLY
+#include "log_msg_buffer-inl.h"
+#endif
--- a/csrc/idle_offload/include/spdlog/details/mpmc_blocking_q.h
+++ b/csrc/idle_offload/include/spdlog/details/mpmc_blocking_q.h
@@ -0,0 +1,177 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+// multi producer-multi consumer blocking queue.
+// enqueue(..) - will block until room found to put the new message.
+// enqueue_nowait(..) - enqueue immediately. overruns oldest message if no
+// room left.
+// dequeue_for(..) - will block until the queue is not empty or the timeout has
+// passed.
+
+#include <spdlog/details/circular_q.h>
+
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+
+namespace spdlog {
+namespace details {
+
+template <typename T>
+class mpmc_blocking_queue {
+public:
+    using item_type = T;
+    explicit mpmc_blocking_queue(size_t max_items)
+        : q_(max_items) {}
+
+#ifndef __MINGW32__
+    // try to enqueue and block if no room left
+    void enqueue(T &&item) {
+        {
+            std::unique_lock<std::mutex> lock(queue_mutex_);
+            pop_cv_.wait(lock, [this] { return !this->q_.full(); });
+            q_.push_back(std::move(item));
+        }
+        push_cv_.notify_one();
+    }
+
+    // enqueue immediately. overrun oldest message in the queue if no room left.
+    void enqueue_nowait(T &&item) {
+        {
+            std::unique_lock<std::mutex> lock(queue_mutex_);
+            q_.push_back(std::move(item));
+        }
+        push_cv_.notify_one();
+    }
+
+    void enqueue_if_have_room(T &&item) {
+        bool pushed = false;
+        {
+            std::unique_lock<std::mutex> lock(queue_mutex_);
+            if (!q_.full()) {
+                q_.push_back(std::move(item));
+                pushed = true;
+            }
+        }
+
+        if (pushed) {
+            push_cv_.notify_one();
+        } else {
+            ++discard_counter_;
+        }
+    }
+
+    // dequeue with a timeout.
+    // Return true, if succeeded dequeue item, false otherwise
+    bool dequeue_for(T &popped_item, std::chrono::milliseconds wait_duration) {
+        {
+            std::unique_lock<std::mutex> lock(queue_mutex_);
+            if (!push_cv_.wait_for(lock, wait_duration, [this] { return !this->q_.empty(); })) {
+                return false;
+            }
+            popped_item = std::move(q_.front());
+            q_.pop_front();
+        }
+        pop_cv_.notify_one();
+        return true;
+    }
+
+    // blocking dequeue without a timeout.
+    void dequeue(T &popped_item) {
+        {
+            std::unique_lock<std::mutex> lock(queue_mutex_);
+            push_cv_.wait(lock, [this] { return !this->q_.empty(); });
+            popped_item = std::move(q_.front());
+            q_.pop_front();
+        }
+        pop_cv_.notify_one();
+    }
+
+#else
+    // apparently mingw deadlocks if the mutex is released before cv.notify_one(),
+    // so release the mutex at the very end of each function.
+
+    // try to enqueue and block if no room left
+    void enqueue(T &&item) {
+        std::unique_lock<std::mutex> lock(queue_mutex_);
+        pop_cv_.wait(lock, [this] { return !this->q_.full(); });
+        q_.push_back(std::move(item));
+        push_cv_.notify_one();
+    }
+
+    // enqueue immediately. overrun oldest message in the queue if no room left.
+    void enqueue_nowait(T &&item) {
+        std::unique_lock<std::mutex> lock(queue_mutex_);
+        q_.push_back(std::move(item));
+        push_cv_.notify_one();
+    }
+
+    void enqueue_if_have_room(T &&item) {
+        bool pushed = false;
+        std::unique_lock<std::mutex> lock(queue_mutex_);
+        if (!q_.full()) {
+            q_.push_back(std::move(item));
+            pushed = true;
+        }
+
+        if (pushed) {
+            push_cv_.notify_one();
+        } else {
+            ++discard_counter_;
+        }
+    }
+
+    // dequeue with a timeout.
+    // Return true, if succeeded dequeue item, false otherwise
+    bool dequeue_for(T &popped_item, std::chrono::milliseconds wait_duration) {
+        std::unique_lock<std::mutex> lock(queue_mutex_);
+        if (!push_cv_.wait_for(lock, wait_duration, [this] { return !this->q_.empty(); })) {
+            return false;
+        }
+        popped_item = std::move(q_.front());
+        q_.pop_front();
+        pop_cv_.notify_one();
+        return true;
+    }
+
+    // blocking dequeue without a timeout.
+    void dequeue(T &popped_item) {
+        std::unique_lock<std::mutex> lock(queue_mutex_);
+        push_cv_.wait(lock, [this] { return !this->q_.empty(); });
+        popped_item = std::move(q_.front());
+        q_.pop_front();
+        pop_cv_.notify_one();
+    }
+
+#endif
+
+    size_t overrun_counter() {
+        std::lock_guard<std::mutex> lock(queue_mutex_);
+        return q_.overrun_counter();
+    }
+
+    size_t discard_counter() { return discard_counter_.load(std::memory_order_relaxed); }
+
+    size_t size() {
+        std::lock_guard<std::mutex> lock(queue_mutex_);
+        return q_.size();
+    }
+
+    void reset_overrun_counter() {
+        std::lock_guard<std::mutex> lock(queue_mutex_);
+        q_.reset_overrun_counter();
+    }
+
+    void reset_discard_counter() { discard_counter_.store(0, std::memory_order_relaxed); }
+
+private:
+    std::mutex queue_mutex_;
+    std::condition_variable push_cv_;
+    std::condition_variable pop_cv_;
+    spdlog::details::circular_q<T> q_;
+    std::atomic<size_t> discard_counter_{0};
+};
+}  // namespace details
+}  // namespace spdlog
--- a/csrc/idle_offload/include/spdlog/details/null_mutex.h
+++ b/csrc/idle_offload/include/spdlog/details/null_mutex.h
@@ -0,0 +1,35 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#include <atomic>
+#include <utility>
+// null, no cost dummy "mutex" and dummy "atomic" int
+
+namespace spdlog {
+namespace details {
+struct null_mutex {
+    void lock() const {}
+    void unlock() const {}
+};
+
+struct null_atomic_int {
+    int value;
+    null_atomic_int() = default;
+
+    explicit null_atomic_int(int new_value)
+        : value(new_value) {}
+
+    int load(std::memory_order = std::memory_order_relaxed) const { return value; }
+
+    void store(int new_value, std::memory_order = std::memory_order_relaxed) { value = new_value; }
+
+    int exchange(int new_value, std::memory_order = std::memory_order_relaxed) {
+        std::swap(new_value, value);
+        return new_value;  // return value before the call
+    }
+};
+
+}  // namespace details
+}  // namespace spdlog
--- a/csrc/idle_offload/include/spdlog/details/os-inl.h
+++ b/csrc/idle_offload/include/spdlog/details/os-inl.h
@@ -0,0 +1,572 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#ifndef SPDLOG_HEADER_ONLY
+#include <spdlog/details/os.h>
+#endif
+
+#include <spdlog/common.h>
+
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <string>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <thread>
+
+#ifdef _WIN32
+#include <spdlog/details/windows_include.h>
+#include <io.h>       // for _get_osfhandle, _isatty, _fileno
+#include <process.h>  // for _get_pid
+
+#ifdef __MINGW32__
+#include <share.h>
+#endif
+
+#if defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT) || defined(SPDLOG_WCHAR_FILENAMES)
+#include <cassert>
+#include <limits>
+#endif
+
+#include <direct.h>  // for _mkdir/_wmkdir
+
+#else  // unix
+
+#include <fcntl.h>
+#include <unistd.h>
+
+#ifdef __linux__
+#include <sys/syscall.h>  //Use gettid() syscall under linux to get thread id
+
+#elif defined(_AIX)
+#include <pthread.h>  // for pthread_getthrds_np
+
+#elif defined(__DragonFly__) || defined(__FreeBSD__)
+#include <pthread_np.h>  // for pthread_getthreadid_np
+
+#elif defined(__NetBSD__)
+#include <lwp.h>  // for _lwp_self
+
+#elif defined(__sun)
+#include <thread.h>  // for thr_self
+#endif
+
+#endif  // unix
+
+#if defined __APPLE__
+#include <AvailabilityMacros.h>
+#endif
+
+#ifndef __has_feature       // Clang - feature checking macros.
+#define __has_feature(x) 0  // Compatibility with non-clang compilers.
+#endif
+
+namespace spdlog {
+namespace details {
+namespace os {
+
+SPDLOG_INLINE spdlog::log_clock::time_point now() SPDLOG_NOEXCEPT {
+#if defined __linux__ && defined SPDLOG_CLOCK_COARSE
+    timespec ts;
+    ::clock_gettime(CLOCK_REALTIME_COARSE, &ts);
+    return std::chrono::time_point<log_clock, typename log_clock::duration>(
+        std::chrono::duration_cast<typename log_clock::duration>(
+            std::chrono::seconds(ts.tv_sec) + std::chrono::nanoseconds(ts.tv_nsec)));
+
+#else
+    return log_clock::now();
+#endif
+}
+SPDLOG_INLINE std::tm localtime(const std::time_t &time_tt) SPDLOG_NOEXCEPT {
+#ifdef _WIN32
+    std::tm tm;
+    ::localtime_s(&tm, &time_tt);
+#else
+    std::tm tm;
+    ::localtime_r(&time_tt, &tm);
+#endif
+    return tm;
+}
+
+SPDLOG_INLINE std::tm localtime() SPDLOG_NOEXCEPT {
+    std::time_t now_t = ::time(nullptr);
+    return localtime(now_t);
+}
+
+SPDLOG_INLINE std::tm gmtime(const std::time_t &time_tt) SPDLOG_NOEXCEPT {
+#ifdef _WIN32
+    std::tm tm;
+    ::gmtime_s(&tm, &time_tt);
+#else
+    std::tm tm;
+    ::gmtime_r(&time_tt, &tm);
+#endif
+    return tm;
+}
+
+SPDLOG_INLINE std::tm gmtime() SPDLOG_NOEXCEPT {
+    std::time_t now_t = ::time(nullptr);
+    return gmtime(now_t);
+}
+
+// portable fopen_s for writing: stores the opened stream in *fp; returns true on FAILURE, false on success
+SPDLOG_INLINE bool fopen_s(FILE **fp, const filename_t &filename, const filename_t &mode) {
+#ifdef _WIN32
+#ifdef SPDLOG_WCHAR_FILENAMES
+    *fp = ::_wfsopen((filename.c_str()), mode.c_str(), _SH_DENYNO);
+#else
+    *fp = ::_fsopen((filename.c_str()), mode.c_str(), _SH_DENYNO);
+#endif
+#if defined(SPDLOG_PREVENT_CHILD_FD)
+    if (*fp != nullptr) {
+        auto file_handle = reinterpret_cast<HANDLE>(_get_osfhandle(::_fileno(*fp)));
+        if (!::SetHandleInformation(file_handle, HANDLE_FLAG_INHERIT, 0)) {
+            ::fclose(*fp);
+            *fp = nullptr;
+        }
+    }
+#endif
+#else  // unix
+#if defined(SPDLOG_PREVENT_CHILD_FD)
+    const int mode_flag = mode == SPDLOG_FILENAME_T("ab") ? O_APPEND : O_TRUNC;
+    const int fd =
+        ::open((filename.c_str()), O_CREAT | O_WRONLY | O_CLOEXEC | mode_flag, mode_t(0644));
+    if (fd == -1) {
+        return true;
+    }
+    *fp = ::fdopen(fd, mode.c_str());
+    if (*fp == nullptr) {
+        ::close(fd);
+    }
+#else
+    *fp = ::fopen((filename.c_str()), mode.c_str());
+#endif
+#endif
+
+    return *fp == nullptr;
+}
+
+SPDLOG_INLINE int remove(const filename_t &filename) SPDLOG_NOEXCEPT {
+#if defined(_WIN32) && defined(SPDLOG_WCHAR_FILENAMES)
+    return ::_wremove(filename.c_str());
+#else
+    return std::remove(filename.c_str());
+#endif
+}
+
+SPDLOG_INLINE int remove_if_exists(const filename_t &filename) SPDLOG_NOEXCEPT {
+    return path_exists(filename) ? remove(filename) : 0;
+}
+
+SPDLOG_INLINE int rename(const filename_t &filename1, const filename_t &filename2) SPDLOG_NOEXCEPT {
+#if defined(_WIN32) && defined(SPDLOG_WCHAR_FILENAMES)
+    return ::_wrename(filename1.c_str(), filename2.c_str());
+#else
+    return std::rename(filename1.c_str(), filename2.c_str());
+#endif
+}
+
+// Return true if path exists (file or directory)
+SPDLOG_INLINE bool path_exists(const filename_t &filename) SPDLOG_NOEXCEPT {
+#ifdef _WIN32
+    struct _stat buffer;
+#ifdef SPDLOG_WCHAR_FILENAMES
+    return (::_wstat(filename.c_str(), &buffer) == 0);
+#else
+    return (::_stat(filename.c_str(), &buffer) == 0);
+#endif
+#else  // common linux/unix all have the stat system call
+    struct stat buffer;
+    return (::stat(filename.c_str(), &buffer) == 0);
+#endif
+}
+
+#ifdef _MSC_VER
+// avoid warning about unreachable statement at the end of filesize()
+#pragma warning(push)
+#pragma warning(disable : 4702)
+#endif
+
+// Return file size according to open FILE* object
+SPDLOG_INLINE size_t filesize(FILE *f) {
+    if (f == nullptr) {
+        throw_spdlog_ex("Failed getting file size. fd is null");
+    }
+#if defined(_WIN32) && !defined(__CYGWIN__)
+    int fd = ::_fileno(f);
+#if defined(_WIN64)  // 64 bits
+    __int64 ret = ::_filelengthi64(fd);
+    if (ret >= 0) {
+        return static_cast<size_t>(ret);
+    }
+
+#else  // windows 32 bits
+    long ret = ::_filelength(fd);
+    if (ret >= 0) {
+        return static_cast<size_t>(ret);
+    }
+#endif
+
+#else  // unix
+// OpenBSD and AIX don't compile with :: before the fileno(..)
+#if defined(__OpenBSD__) || defined(_AIX)
+    int fd = fileno(f);
+#else
+    int fd = ::fileno(f);
+#endif
+// 64 bits(but not in osx, linux/musl or cygwin, where fstat64 is deprecated)
+#if ((defined(__linux__) && defined(__GLIBC__)) || defined(__sun) || defined(_AIX)) && \
+    (defined(__LP64__) || defined(_LP64))
+    struct stat64 st;
+    if (::fstat64(fd, &st) == 0) {
+        return static_cast<size_t>(st.st_size);
+    }
+#else  // other unix or linux 32 bits or cygwin
+    struct stat st;
+    if (::fstat(fd, &st) == 0) {
+        return static_cast<size_t>(st.st_size);
+    }
+#endif
+#endif
+    throw_spdlog_ex("Failed getting file size from fd", errno);
+    return 0;  // will not be reached.
+}
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+// Return the UTC offset of `tm` in minutes; the Windows branch throws spdlog_ex on failure
+#if !defined(SPDLOG_NO_TZ_OFFSET)
+SPDLOG_INLINE int utc_minutes_offset(const std::tm &tm) {
+#ifdef _WIN32
+#if _WIN32_WINNT < _WIN32_WINNT_WS08
+    TIME_ZONE_INFORMATION tzinfo;
+    auto rv = ::GetTimeZoneInformation(&tzinfo);
+#else
+    DYNAMIC_TIME_ZONE_INFORMATION tzinfo;
+    auto rv = ::GetDynamicTimeZoneInformation(&tzinfo);
+#endif  // rv is the TIME_ZONE_ID_* result of whichever call was compiled in
+    if (rv == TIME_ZONE_ID_INVALID) throw_spdlog_ex("Failed getting timezone info. ", errno);
+
+    int offset = -tzinfo.Bias;  // Bias carries the opposite sign of the offset returned here
+    if (tm.tm_isdst) {  // then subtract the extra bias for daylight or standard time
+        offset -= tzinfo.DaylightBias;
+    } else {
+        offset -= tzinfo.StandardBias;
+    }
+    return offset;
+#else
+    auto offset_seconds = tm.tm_gmtoff;  // NOTE(review): relies on the non-standard tm_gmtoff
+    return static_cast<int>(offset_seconds / 60);  // seconds -> minutes
+#endif
+}
+#endif  // SPDLOG_NO_TZ_OFFSET
+
+// Return the calling thread's id as size_t, using each platform's native call.
+// It exists because std::this_thread::get_id() is much slower (especially
+// under VS 2013)
+SPDLOG_INLINE size_t _thread_id() SPDLOG_NOEXCEPT {
+#ifdef _WIN32
+    return static_cast<size_t>(::GetCurrentThreadId());
+#elif defined(__linux__)
+#if defined(__ANDROID__) && defined(__ANDROID_API__) && (__ANDROID_API__ < 21)
+#define SYS_gettid __NR_gettid
+#endif  // SYS_gettid alias for Android API levels below 21
+    return static_cast<size_t>(::syscall(SYS_gettid));
+#elif defined(_AIX)
+    struct __pthrdsinfo buf;
+    int reg_size = 0;
+    pthread_t pt = pthread_self();
+    int retval = pthread_getthrds_np(&pt, PTHRDSINFO_QUERY_TID, &buf, sizeof(buf), NULL, &reg_size);
+    int tid = (!retval) ? buf.__pi_tid : 0;  // 0 when the query fails
+    return static_cast<size_t>(tid);
+#elif defined(__DragonFly__) || defined(__FreeBSD__)
+    return static_cast<size_t>(::pthread_getthreadid_np());
+#elif defined(__NetBSD__)
+    return static_cast<size_t>(::_lwp_self());
+#elif defined(__OpenBSD__)
+    return static_cast<size_t>(::getthrid());
+#elif defined(__sun)
+    return static_cast<size_t>(::thr_self());
+#elif __APPLE__
+    uint64_t tid;
+// There is no pthread_threadid_np prior to Mac OS X 10.6, and it is not supported on any PPC,
+// including 10.6.8 Rosetta. __POWERPC__ is an Apple-specific define encompassing ppc and ppc64.
+#ifdef MAC_OS_X_VERSION_MAX_ALLOWED
+    {
+#if (MAC_OS_X_VERSION_MAX_ALLOWED < 1060) || defined(__POWERPC__)
+        tid = pthread_mach_thread_np(pthread_self());
+#elif MAC_OS_X_VERSION_MIN_REQUIRED < 1060
+        if (&pthread_threadid_np) {  // presumably null when the running OS lacks the function
+            pthread_threadid_np(nullptr, &tid);
+        } else {
+            tid = pthread_mach_thread_np(pthread_self());
+        }
+#else
+        pthread_threadid_np(nullptr, &tid);
+#endif
+    }
+#else
+    pthread_threadid_np(nullptr, &tid);
+#endif
+    return static_cast<size_t>(tid);
+#else  // Default to standard C++11 (other Unix): hash of std::this_thread::get_id()
+    return static_cast<size_t>(std::hash<std::thread::id>()(std::this_thread::get_id()));
+#endif
+}
+
+// Return the calling thread's id as size_t, cached per thread unless SPDLOG_NO_TLS is defined
+SPDLOG_INLINE size_t thread_id() SPDLOG_NOEXCEPT {
+#if !defined(SPDLOG_NO_TLS)  // look the id up once per thread and keep it in thread-local storage
+    static thread_local const size_t cached_id = _thread_id();
+    return cached_id;
+#else  // no TLS: query the id on every call
+    return _thread_id();
+#endif
+}
+
+// Sleep for the given number of milliseconds. Windows calls ::Sleep to avoid an msvc issue
+// in sleep_for that happens if the clock changes (https://github.com/gabime/spdlog/issues/609)
+SPDLOG_INLINE void sleep_for_millis(unsigned int milliseconds) SPDLOG_NOEXCEPT {
+#if !defined(_WIN32)
+    std::this_thread::sleep_for(std::chrono::milliseconds(milliseconds));
+#else
+    ::Sleep(milliseconds);
+#endif
+}
+
+// Convert a filename_t to std::string; wide Windows names (SPDLOG_WCHAR_FILENAMES) become UTF-8
+#if !defined(_WIN32) || !defined(SPDLOG_WCHAR_FILENAMES)
+SPDLOG_INLINE std::string filename_to_str(const filename_t &filename) { return filename; }
+#else
+SPDLOG_INLINE std::string filename_to_str(const filename_t &filename) {
+    memory_buf_t utf8_bytes;
+    wstr_to_utf8buf(filename, utf8_bytes);  // reports conversion failures via throw_spdlog_ex
+    return SPDLOG_BUF_TO_STRING(utf8_bytes);
+}
+#endif
+
+SPDLOG_INLINE int pid() SPDLOG_NOEXCEPT {
+#ifndef _WIN32
+    return conditional_static_cast<int>(::getpid());  // POSIX process id
+#else
+    return conditional_static_cast<int>(::GetCurrentProcessId());  // Win32 process id
+#endif
+}
+
+// Report whether the terminal described by the environment is believed to support colors.
+// Based on: https://github.com/agauniyal/rang/
+SPDLOG_INLINE bool is_color_terminal() SPDLOG_NOEXCEPT {
+#ifndef _WIN32
+    // The environment is probed on the first call only; later calls reuse the stored verdict.
+    static const bool supports_color = []() {
+        // Any COLORTERM value at all counts as a yes.
+        if (std::getenv("COLORTERM") != nullptr) {
+            return true;
+        }
+
+        const char *term_name = std::getenv("TERM");
+        if (term_name == nullptr) {
+            return false;
+        }
+
+        // TERM only has to contain one of these fragments (substring match).
+        static constexpr std::array<const char *, 16> known_terms = {
+            {"ansi", "color", "console", "cygwin", "gnome", "konsole", "kterm", "linux", "msys",
+             "putty", "rxvt", "screen", "vt100", "xterm", "alacritty", "vt102"}};
+        for (const char *fragment : known_terms) {
+            if (std::strstr(term_name, fragment) != nullptr) return true;
+        }
+        return false;
+    }();
+    return supports_color;
+#else
+    return true;  // the Windows build always answers yes
+#endif
+}
+
+// Tell whether the given stream is attached to a terminal (isatty on its descriptor).
+// Source: https://github.com/agauniyal/rang/
+SPDLOG_INLINE bool in_terminal(FILE *file) SPDLOG_NOEXCEPT {
+#ifndef _WIN32
+    return ::isatty(fileno(file)) != 0;
+#else
+    return ::_isatty(_fileno(file)) != 0;
+#endif
+}
+
+#if (defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT) || defined(SPDLOG_WCHAR_FILENAMES)) && defined(_WIN32)
+SPDLOG_INLINE void wstr_to_utf8buf(wstring_view_t wstr, memory_buf_t &target) {
+    if (wstr.size() > static_cast<size_t>((std::numeric_limits<int>::max)()) / 4 - 1) {
+        throw_spdlog_ex("UTF-16 string is too big to be converted to UTF-8");
+    }  // the bound above keeps (wstr_size + 1) * 4 below within int range
+
+    int wstr_size = static_cast<int>(wstr.size());
+    if (wstr_size == 0) {  // empty input: leave an empty target and skip the API call
+        target.resize(0);
+        return;
+    }
+
+    int result_size = static_cast<int>(target.capacity());  // first try the existing capacity
+    if ((wstr_size + 1) * 4 > result_size) {  // capacity may be too small: query the exact size
+        result_size =
+            ::WideCharToMultiByte(CP_UTF8, 0, wstr.data(), wstr_size, NULL, 0, NULL, NULL);
+    }
+
+    if (result_size > 0) {
+        target.resize(result_size);
+        result_size = ::WideCharToMultiByte(CP_UTF8, 0, wstr.data(), wstr_size, target.data(),
+                                            result_size, NULL, NULL);
+
+        if (result_size > 0) {  // success: shrink target to the number of bytes written
+            target.resize(result_size);
+            return;
+        }
+    }
+
+    throw_spdlog_ex(  // reached when either WideCharToMultiByte call returned <= 0
+        fmt_lib::format("WideCharToMultiByte failed. Last error: {}", ::GetLastError()));
+}
+
+SPDLOG_INLINE void utf8_to_wstrbuf(string_view_t str, wmemory_buf_t &target) {
+    if (str.size() > static_cast<size_t>((std::numeric_limits<int>::max)()) - 1) {
+        throw_spdlog_ex("UTF-8 string is too big to be converted to UTF-16");
+    }  // the size is passed to the API as an int, so it must fit in one
+
+    int str_size = static_cast<int>(str.size());
+    if (str_size == 0) {  // empty input: leave an empty target and skip the API call
+        target.resize(0);
+        return;
+    }
+
+    // first call (NULL output buffer, size 0): find the size to allocate for the result buffer
+    int result_size = ::MultiByteToWideChar(CP_UTF8, 0, str.data(), str_size, NULL, 0);
+
+    if (result_size > 0) {
+        target.resize(result_size);
+        result_size =  // second call performs the actual conversion into target
+            ::MultiByteToWideChar(CP_UTF8, 0, str.data(), str_size, target.data(), result_size);
+        if (result_size > 0) {
+            assert(result_size == static_cast<int>(target.size()));
+            return;
+        }
+    }
+
+    throw_spdlog_ex(  // reached when either MultiByteToWideChar call returned <= 0
+        fmt_lib::format("MultiByteToWideChar failed. Last error: {}", ::GetLastError()));
+}
+#endif  // (defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT) || defined(SPDLOG_WCHAR_FILENAMES)) &&
+        // defined(_WIN32)
+
+// Create the directory `path` with the platform's mkdir call; return true on success.
+// On non-Windows builds the directory is requested with mode 0755.
+// (create_dir() below is the caller that walks a multi-level path.)
+static SPDLOG_INLINE bool mkdir_(const filename_t &path) {
+#ifndef _WIN32
+    return ::mkdir(path.c_str(), mode_t(0755)) == 0;
+#elif defined(SPDLOG_WCHAR_FILENAMES)
+    return ::_wmkdir(path.c_str()) == 0;  // wide-character filenames
+#else
+    return ::_mkdir(path.c_str()) == 0;
+#endif
+}
+
+// Create the given directory together with every missing directory leading to it.
+// Return true on success or if the directory already exists.
+SPDLOG_INLINE bool create_dir(const filename_t &path) {
+    if (path_exists(path)) {
+        return true;
+    }
+    if (path.empty()) {
+        return false;
+    }
+
+    // Walk the path one separator at a time, creating each prefix that is missing.
+    // `path` is known to be non-empty here, so the loop body always runs at least once.
+    for (size_t offset = 0; offset < path.size();) {
+        auto sep_pos = path.find_first_of(folder_seps_filename, offset);
+        if (sep_pos == filename_t::npos) {
+            sep_pos = path.size();  // no further separator: treat the whole path as a folder
+        }
+
+        auto prefix = path.substr(0, sep_pos);
+#ifdef _WIN32
+        // A bare drive letter gets a trailing slash ("c:" => "c:\"),
+        // otherwise path_exists(prefix) returns false (issue #3079)
+        if (prefix.length() == 2 && prefix[1] == ':') {
+            prefix += '\\';
+            ++sep_pos;
+        }
+#endif
+
+        // An empty prefix (the path starts with a separator) is skipped.
+        const bool needs_creating = !prefix.empty() && !path_exists(prefix);
+        if (needs_creating && !mkdir_(prefix)) {
+            return false;  // failed creating this level
+        }
+        offset = sep_pos + 1;
+    }
+
+    return true;
+}
+
+// Return everything before the last folder separator, or an empty string if there is none:
+// "abc/file" => "abc", "abc/" => "abc", "abc" => "", "abc///" => "abc//"
+SPDLOG_INLINE filename_t dir_name(const filename_t &path) {
+    const auto last_sep = path.find_last_of(folder_seps_filename);
+    if (last_sep == filename_t::npos) {
+        return filename_t{};
+    }
+    return path.substr(0, last_sep);
+}
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4996)
+#endif  // _MSC_VER
+std::string SPDLOG_INLINE getenv(const char *field) {
+#if defined(_MSC_VER) && defined(WINAPI_FAMILY) && defined(WINAPI_FAMILY_DESKTOP_APP) && \
+    (WINAPI_FAMILY != WINAPI_FAMILY_DESKTOP_APP)
+    return std::string{};  // getenv is not supported under uwp: always empty
+#else
+    const char *value = std::getenv(field);
+    return value != nullptr ? std::string(value) : std::string{};  // unset => empty string
+#endif
+}
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif  // _MSC_VER
+
+// Ask the OS to flush the file behind a FILE pointer (fsync, or FlushFileBuffers on Windows).
+// Return true on success.
+SPDLOG_INLINE bool fsync(FILE *fp) {
+#ifndef _WIN32
+    return ::fsync(fileno(fp)) == 0;
+#else
+    return FlushFileBuffers(reinterpret_cast<HANDLE>(_get_osfhandle(_fileno(fp)))) != 0;
+#endif
+}
+
+// Write n_bytes from ptr to fp, using the non-locking fwrite variant of the platform when
+// SPDLOG_FWRITE_UNLOCKED is defined. Return true only if all n_bytes were written.
+SPDLOG_INLINE bool fwrite_bytes(const void *ptr, const size_t n_bytes, FILE *fp) {
+#if !defined(SPDLOG_FWRITE_UNLOCKED)
+    return std::fwrite(ptr, 1, n_bytes, fp) == n_bytes;
+#elif defined(_WIN32)
+    return _fwrite_nolock(ptr, 1, n_bytes, fp) == n_bytes;
+#else
+    return ::fwrite_unlocked(ptr, 1, n_bytes, fp) == n_bytes;
+#endif
+}
+
+}  // namespace os
+}  // namespace details
+}  // namespace spdlog
--- a/csrc/idle_offload/include/spdlog/details/os.h
+++ b/csrc/idle_offload/include/spdlog/details/os.h
@@ -0,0 +1,127 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#include <ctime>  // std::time_t
+#include <spdlog/common.h>
+
+namespace spdlog {
+namespace details {
+namespace os {
+
+SPDLOG_API spdlog::log_clock::time_point now() SPDLOG_NOEXCEPT;
+
+SPDLOG_API std::tm localtime(const std::time_t &time_tt) SPDLOG_NOEXCEPT;
+
+SPDLOG_API std::tm localtime() SPDLOG_NOEXCEPT;
+
+SPDLOG_API std::tm gmtime(const std::time_t &time_tt) SPDLOG_NOEXCEPT;
+
+SPDLOG_API std::tm gmtime() SPDLOG_NOEXCEPT;
+
+// default end-of-line sequence; define SPDLOG_EOL before this point to override it
+#if !defined(SPDLOG_EOL)
+#ifdef _WIN32
+#define SPDLOG_EOL "\r\n"
+#else
+#define SPDLOG_EOL "\n"
+#endif
+#endif
+
+SPDLOG_CONSTEXPR static const char *default_eol = SPDLOG_EOL;
+
+// folder separator characters; define SPDLOG_FOLDER_SEPS before this point to override them
+#if !defined(SPDLOG_FOLDER_SEPS)
+#ifdef _WIN32
+#define SPDLOG_FOLDER_SEPS "\\/"
+#else
+#define SPDLOG_FOLDER_SEPS "/"
+#endif
+#endif
+
+SPDLOG_CONSTEXPR static const char folder_seps[] = SPDLOG_FOLDER_SEPS;
+SPDLOG_CONSTEXPR static const filename_t::value_type folder_seps_filename[] =
+    SPDLOG_FILENAME_T(SPDLOG_FOLDER_SEPS);
+
+// fopen_s on non windows for writing
+SPDLOG_API bool fopen_s(FILE **fp, const filename_t &filename, const filename_t &mode);
+
+// Remove filename. return 0 on success
+SPDLOG_API int remove(const filename_t &filename) SPDLOG_NOEXCEPT;
+
+// Remove file if exists. return 0 on success
+// Note: Non atomic (might return failure to delete if concurrently deleted by other process/thread)
+SPDLOG_API int remove_if_exists(const filename_t &filename) SPDLOG_NOEXCEPT;
+
+SPDLOG_API int rename(const filename_t &filename1, const filename_t &filename2) SPDLOG_NOEXCEPT;
+
+// Return true if the given path exists.
+SPDLOG_API bool path_exists(const filename_t &filename) SPDLOG_NOEXCEPT;
+
+// Return file size according to open FILE* object
+SPDLOG_API size_t filesize(FILE *f);
+
+// Return utc offset in minutes or throw spdlog_ex on failure
+SPDLOG_API int utc_minutes_offset(const std::tm &tm = details::os::localtime());
+
+// Return current thread id as size_t
+// It exists because std::this_thread::get_id() is much slower (especially
+// under VS 2013)
+SPDLOG_API size_t _thread_id() SPDLOG_NOEXCEPT;
+
+// Return current thread id as size_t (from thread local storage)
+SPDLOG_API size_t thread_id() SPDLOG_NOEXCEPT;
+
+// This is to avoid an msvc issue in sleep_for that happens if the clock changes.
+// See https://github.com/gabime/spdlog/issues/609
+SPDLOG_API void sleep_for_millis(unsigned int milliseconds) SPDLOG_NOEXCEPT;
+
+SPDLOG_API std::string filename_to_str(const filename_t &filename);
+
+SPDLOG_API int pid() SPDLOG_NOEXCEPT;  // id of the current process
+
+// Determine if the terminal supports colors
+// Source: https://github.com/agauniyal/rang/
+SPDLOG_API bool is_color_terminal() SPDLOG_NOEXCEPT;
+
+// Determine if the given stream is attached to a terminal
+// Source: https://github.com/agauniyal/rang/
+SPDLOG_API bool in_terminal(FILE *file) SPDLOG_NOEXCEPT;
+
+#if (defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT) || defined(SPDLOG_WCHAR_FILENAMES)) && defined(_WIN32)
+SPDLOG_API void wstr_to_utf8buf(wstring_view_t wstr, memory_buf_t &target);
+
+SPDLOG_API void utf8_to_wstrbuf(string_view_t str, wmemory_buf_t &target);
+#endif
+
+// Return directory name from given path or empty string
+// "abc/file" => "abc"
+// "abc/" => "abc"
+// "abc" => ""
+// "abc///" => "abc//"
+SPDLOG_API filename_t dir_name(const filename_t &path);
+
+// Create a dir from the given path (including any missing directories leading to it).
+// Return true if succeeded or if this dir already exists.
+SPDLOG_API bool create_dir(const filename_t &path);
+
+// non thread safe, cross platform getenv/getenv_s
+// return empty string if field not found
+SPDLOG_API std::string getenv(const char *field);
+
+// Do fsync by FILE object pointer.
+// Return true on success.
+SPDLOG_API bool fsync(FILE *fp);
+
+// Do non-locking fwrite if possible by the os or use the regular locking fwrite
+// Return true on success.
+SPDLOG_API bool fwrite_bytes(const void *ptr, const size_t n_bytes, FILE *fp);
+
+}  // namespace os
+}  // namespace details
+}  // namespace spdlog
+
+#ifdef SPDLOG_HEADER_ONLY
+#include "os-inl.h"
+#endif
--- a/csrc/idle_offload/include/spdlog/details/periodic_worker-inl.h
+++ b/csrc/idle_offload/include/spdlog/details/periodic_worker-inl.h
@@ -0,0 +1,26 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#ifndef SPDLOG_HEADER_ONLY
+#include <spdlog/details/periodic_worker.h>
+#endif
+
+namespace spdlog {
+namespace details {
+
+// Ask the worker thread to stop, wake it up and wait until it has exited
+SPDLOG_INLINE periodic_worker::~periodic_worker() {
+    if (!worker_thread_.joinable()) return;  // no joinable worker thread to stop
+    {
+        // clear the flag under the mutex so the worker's wait predicate sees the change
+        std::lock_guard<std::mutex> guard(mutex_);
+        active_ = false;
+    }
+    cv_.notify_one();
+    worker_thread_.join();
+}
+
+}  // namespace details
+}  // namespace spdlog
--- a/csrc/idle_offload/include/spdlog/details/periodic_worker.h
+++ b/csrc/idle_offload/include/spdlog/details/periodic_worker.h
@@ -0,0 +1,58 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+// periodic worker thread - periodically executes the given callback function.
+//
+// RAII over the owned thread:
+//    creates the thread on construction.
+//    stops and joins the thread on destruction (if the thread is executing a callback, wait for it
+//    to finish first).
+
+#include <chrono>
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+#include <thread>
+namespace spdlog {
+namespace details {
+
+class SPDLOG_API periodic_worker {
+public:
+    // Start a thread that invokes callback_fun each time `interval` elapses, until destruction.
+    // A zero or negative interval starts no thread at all.
+    template <typename Rep, typename Period>
+    periodic_worker(const std::function<void()> &callback_fun,
+                    std::chrono::duration<Rep, Period> interval) {
+        active_ = (interval > std::chrono::duration<Rep, Period>::zero());
+        if (active_) {
+            worker_thread_ = std::thread([this, callback_fun, interval]() {
+                while (true) {
+                    std::unique_lock<std::mutex> guard(this->mutex_);
+                    // true only when the predicate holds, i.e. active_ was cleared
+                    const bool stopping =
+                        this->cv_.wait_for(guard, interval, [this] { return !this->active_; });
+                    if (stopping) return;
+                    callback_fun();  // timed out while still active (mutex_ is held here)
+                }
+            });
+        }
+    }
+    periodic_worker(const periodic_worker &) = delete;
+    periodic_worker &operator=(const periodic_worker &) = delete;
+    ~periodic_worker();  // stops the worker thread and joins it
+    std::thread &get_thread() { return worker_thread_; }
+
+private:
+    bool active_;
+    std::thread worker_thread_;
+    std::mutex mutex_;
+    std::condition_variable cv_;
+};
+}  // namespace details
+}  // namespace spdlog
+
+#ifdef SPDLOG_HEADER_ONLY
+#include "periodic_worker-inl.h"
+#endif
--- a/csrc/idle_offload/include/spdlog/details/registry-inl.h
+++ b/csrc/idle_offload/include/spdlog/details/registry-inl.h
@@ -0,0 +1,270 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#ifndef SPDLOG_HEADER_ONLY
+#include <spdlog/details/registry.h>
+#endif
+
+#include <spdlog/common.h>
+#include <spdlog/details/periodic_worker.h>
+#include <spdlog/logger.h>
+#include <spdlog/pattern_formatter.h>
+
+#ifndef SPDLOG_DISABLE_DEFAULT_LOGGER
+// support for the default stdout color logger
+#ifdef _WIN32
+#include <spdlog/sinks/wincolor_sink.h>
+#else
+#include <spdlog/sinks/ansicolor_sink.h>
+#endif
+#endif  // SPDLOG_DISABLE_DEFAULT_LOGGER
+
+#include <chrono>
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+namespace spdlog {
+namespace details {
+
+SPDLOG_INLINE registry::registry()
+    : formatter_(new pattern_formatter()) {
+#ifndef SPDLOG_DISABLE_DEFAULT_LOGGER
+    // Build the default logger: an unnamed ("") logger writing to a colored stdout sink
+    // (wincolor_stdout_sink_mt on Windows, ansicolor_stdout_sink_mt elsewhere).
+#ifndef _WIN32
+    auto stdout_sink = std::make_shared<sinks::ansicolor_stdout_sink_mt>();
+#else
+    auto stdout_sink = std::make_shared<sinks::wincolor_stdout_sink_mt>();
+#endif
+    const char *unnamed = "";
+    default_logger_ = std::make_shared<spdlog::logger>(unnamed, std::move(stdout_sink));
+    loggers_[unnamed] = default_logger_;  // the default logger is also reachable by name
+
+#endif  // SPDLOG_DISABLE_DEFAULT_LOGGER
+}
+
+SPDLOG_INLINE registry::~registry() = default;  // compiler-generated, defined outside the class
+
+SPDLOG_INLINE void registry::register_logger(std::shared_ptr<logger> new_logger) {
+    std::lock_guard<std::mutex> guard(logger_map_mutex_);  // serialize access to loggers_
+    register_logger_(std::move(new_logger));               // throws if the name is taken
+}
+
+SPDLOG_INLINE void registry::register_or_replace(std::shared_ptr<logger> new_logger) {
+    std::lock_guard<std::mutex> guard(logger_map_mutex_);  // serialize access to loggers_
+    register_or_replace_(std::move(new_logger));           // overwrites a same-named entry
+}
+
+SPDLOG_INLINE void registry::initialize_logger(std::shared_ptr<logger> new_logger) {
+    std::lock_guard<std::mutex> guard(logger_map_mutex_);
+    // Stamp the registry-wide settings onto the new logger.
+    new_logger->set_formatter(formatter_->clone());
+    if (err_handler_) {
+        new_logger->set_error_handler(err_handler_);
+    }
+
+    // A level configured for this logger's name wins over the global level.
+    const auto configured = log_levels_.find(new_logger->name());
+    if (configured != log_levels_.end()) {
+        new_logger->set_level(configured->second);
+    } else {
+        new_logger->set_level(global_log_level_);
+    }
+    new_logger->flush_on(flush_level_);
+    if (backtrace_n_messages_ > 0) {
+        new_logger->enable_backtrace(backtrace_n_messages_);
+    }
+    if (automatic_registration_) {  // add to the map; throws on a duplicate name
+        register_logger_(std::move(new_logger));
+    }
+}
+
+SPDLOG_INLINE std::shared_ptr<logger> registry::get(const std::string &logger_name) {
+    std::lock_guard<std::mutex> guard(logger_map_mutex_);
+    const auto it = loggers_.find(logger_name);
+    return it != loggers_.end() ? it->second : nullptr;  // nullptr when the name is unknown
+}
+
+SPDLOG_INLINE std::shared_ptr<logger> registry::default_logger() {
+    std::lock_guard<std::mutex> guard(logger_map_mutex_);
+    return default_logger_;  // shared copy made under the lock; empty after drop_all()
+}
+
+// Return a raw, non-owning pointer to the default logger (no lock is taken here).
+// To be used directly by the spdlog default api (e.g. spdlog::info)
+// This makes the default API faster, but cannot be used concurrently with set_default_logger().
+// e.g. do not call set_default_logger() from one thread while calling spdlog::info() from another.
+SPDLOG_INLINE logger *registry::get_default_raw() { return default_logger_.get(); }
+
+// Replace the default logger (passing nullptr clears it).
+// A non-null logger is also stored in the loggers_ map under its own name.
+SPDLOG_INLINE void registry::set_default_logger(std::shared_ptr<logger> new_default_logger) {
+    std::lock_guard<std::mutex> guard(logger_map_mutex_);
+    if (new_default_logger) {
+        loggers_[new_default_logger->name()] = new_default_logger;
+    }
+    default_logger_ = std::move(new_default_logger);
+}
+
+SPDLOG_INLINE void registry::set_tp(std::shared_ptr<thread_pool> tp) {
+    std::lock_guard<std::recursive_mutex> guard(tp_mutex_);
+    tp_ = std::move(tp);  // the registry's previous reference, if any, is released
+}
+
+SPDLOG_INLINE std::shared_ptr<thread_pool> registry::get_tp() {
+    std::lock_guard<std::recursive_mutex> guard(tp_mutex_);
+    return tp_;  // shared copy made under the lock; empty if no pool is set
+}
+
+// Install a new global formatter and hand every registered logger its own clone of it
+SPDLOG_INLINE void registry::set_formatter(std::unique_ptr<formatter> formatter) {
+    std::lock_guard<std::mutex> guard(logger_map_mutex_);
+    formatter_ = std::move(formatter);
+    for (const auto &name_and_logger : loggers_) {
+        name_and_logger.second->set_formatter(formatter_->clone());
+    }
+}
+
+SPDLOG_INLINE void registry::enable_backtrace(size_t n_messages) {
+    std::lock_guard<std::mutex> guard(logger_map_mutex_);
+    backtrace_n_messages_ = n_messages;  // remembered for loggers initialized later
+    // ...and applied right away to the loggers that are already registered
+    for (const auto &entry : loggers_) {
+        entry.second->enable_backtrace(n_messages);
+    }
+}
+
+SPDLOG_INLINE void registry::disable_backtrace() {
+    std::lock_guard<std::mutex> guard(logger_map_mutex_);
+    backtrace_n_messages_ = 0;  // loggers initialized later will not enable backtrace either
+    for (const auto &entry : loggers_) {
+        entry.second->disable_backtrace();
+    }
+}
+
+SPDLOG_INLINE void registry::set_level(level::level_enum log_level) {
+    std::lock_guard<std::mutex> guard(logger_map_mutex_);
+    for (const auto &entry : loggers_) {
+        entry.second->set_level(log_level);
+    }
+    global_log_level_ = log_level;  // becomes the fallback level for loggers initialized later
+}
+
+SPDLOG_INLINE void registry::flush_on(level::level_enum log_level) {
+    std::lock_guard<std::mutex> guard(logger_map_mutex_);
+    for (const auto &entry : loggers_) {
+        entry.second->flush_on(log_level);
+    }
+    flush_level_ = log_level;  // also applied to loggers initialized later
+}
+
+SPDLOG_INLINE void registry::set_error_handler(err_handler handler) {
+    std::lock_guard<std::mutex> guard(logger_map_mutex_);
+    for (const auto &entry : loggers_) {
+        entry.second->set_error_handler(handler);  // each logger receives a copy
+    }
+    err_handler_ = std::move(handler);  // moved only after the copies above were handed out
+}
+
+SPDLOG_INLINE void registry::apply_all(
+    const std::function<void(const std::shared_ptr<logger>)> &fun) {
+    std::lock_guard<std::mutex> guard(logger_map_mutex_);  // held while every callback runs
+    for (const auto &entry : loggers_) {
+        fun(entry.second);
+    }
+}
+
+SPDLOG_INLINE void registry::flush_all() {
+    std::lock_guard<std::mutex> guard(logger_map_mutex_);
+    for (const auto &entry : loggers_) {
+        entry.second->flush();  // flush each registered logger in turn
+    }
+}
+
+SPDLOG_INLINE void registry::drop(const std::string &logger_name) {
+    std::lock_guard<std::mutex> guard(logger_map_mutex_);
+    const bool dropping_default = default_logger_ && default_logger_->name() == logger_name;
+    loggers_.erase(logger_name);  // the default-logger check above is done before erasing
+    if (dropping_default) {
+        default_logger_.reset();
+    }
+}
+
+SPDLOG_INLINE void registry::drop_all() {
+    std::lock_guard<std::mutex> guard(logger_map_mutex_);
+    loggers_.clear();         // forget every named logger
+    default_logger_.reset();  // and the default one
+}
+
+// Release everything the registry owns: the periodic flusher, all loggers, the thread pool
+SPDLOG_INLINE void registry::shutdown() {
+    {   // 1. destroy the periodic flusher, if any
+        std::lock_guard<std::mutex> flusher_guard(flusher_mutex_);
+        periodic_flusher_.reset();
+    }
+
+    drop_all();  // 2. forget every logger, including the default one
+
+    {   // 3. release the registry's reference to the thread pool
+        std::lock_guard<std::recursive_mutex> tp_guard(tp_mutex_);
+        tp_.reset();
+    }
+}
+
+SPDLOG_INLINE std::recursive_mutex &registry::tp_mutex() { return tp_mutex_; }  // guards tp_
+
+SPDLOG_INLINE void registry::set_automatic_registration(bool automatic_registration) {
+    std::lock_guard<std::mutex> guard(logger_map_mutex_);
+    automatic_registration_ = automatic_registration;  // consulted by initialize_logger()
+}
+
+SPDLOG_INLINE void registry::set_levels(log_levels levels, level::level_enum *global_level) {
+    std::lock_guard<std::mutex> guard(logger_map_mutex_);
+    log_levels_ = std::move(levels);
+    if (global_level != nullptr) {
+        global_log_level_ = *global_level;  // a null pointer leaves the global level untouched
+    }
+    for (const auto &entry : loggers_) {
+        const auto override_it = log_levels_.find(entry.first);
+        if (override_it != log_levels_.end()) {
+            entry.second->set_level(override_it->second);  // per-name level takes priority
+        } else if (global_level != nullptr) {
+            entry.second->set_level(*global_level);
+        }
+    }
+}
+
+SPDLOG_INLINE registry &registry::instance() {
+    static registry the_registry;  // function-local static: constructed on the first call
+    return the_registry;
+}
+
+SPDLOG_INLINE void registry::apply_logger_env_levels(std::shared_ptr<logger> new_logger) {
+    std::lock_guard<std::mutex> guard(logger_map_mutex_);
+    // use the level configured for this logger's name, else fall back to the global level
+    const auto hit = log_levels_.find(new_logger->name());
+    new_logger->set_level(hit == log_levels_.end() ? global_log_level_ : hit->second);
+}
+
+SPDLOG_INLINE void registry::throw_if_exists_(const std::string &logger_name) {
+    if (loggers_.count(logger_name) != 0) {  // takes no lock; callers here hold logger_map_mutex_
+        throw_spdlog_ex("logger with name '" + logger_name + "' already exists");
+    }
+}
+
+SPDLOG_INLINE void registry::register_logger_(std::shared_ptr<logger> new_logger) {
+    const auto &name = new_logger->name();  // refers into the logger, which stays alive
+    throw_if_exists_(name);
+    loggers_[name] = std::move(new_logger);
+}
+
+SPDLOG_INLINE void registry::register_or_replace_(std::shared_ptr<logger> new_logger) {
+    loggers_[new_logger->name()] = std::move(new_logger);  // overwrites any same-named entry
+}
+
+}  // namespace details
+}  // namespace spdlog
--- a/csrc/idle_offload/include/spdlog/details/registry.h
+++ b/csrc/idle_offload/include/spdlog/details/registry.h
@@ -0,0 +1,131 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+// Registry of loggers: maps each unique name to its logger pointer.
+// An attempt to register a logger under an already existing name results in a spdlog_ex exception.
+// If the user requests a non-existent logger, nullptr is returned.
+// This class is thread safe
+
+#include <spdlog/common.h>
+#include <spdlog/details/periodic_worker.h>
+
+#include <chrono>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+
+namespace spdlog {
+class logger;
+
+namespace details {
+class thread_pool;
+
+class SPDLOG_API registry {
+public:
+    using log_levels = std::unordered_map<std::string, level::level_enum>;
+    registry(const registry &) = delete;
+    registry &operator=(const registry &) = delete;
+
+    void register_logger(std::shared_ptr<logger> new_logger);
+    void register_or_replace(std::shared_ptr<logger> new_logger);
+    void initialize_logger(std::shared_ptr<logger> new_logger);
+    std::shared_ptr<logger> get(const std::string &logger_name);
+    std::shared_ptr<logger> default_logger();
+
+    // Return raw ptr to the default logger.
+    // To be used directly by the spdlog default api (e.g. spdlog::info)
+    // This makes the default API faster, but cannot be used concurrently with set_default_logger().
+    // e.g. do not call set_default_logger() from one thread while calling spdlog::info() from
+    // another.
+    logger *get_default_raw();
+
+    // set default logger and add it to the registry if not registered already.
+    // default logger is stored in default_logger_ (for faster retrieval) and in the loggers_ map.
+    // Note: Make sure to unregister it when no longer needed or before calling again with a new
+    // logger.
+    void set_default_logger(std::shared_ptr<logger> new_default_logger);
+
+    void set_tp(std::shared_ptr<thread_pool> tp);
+
+    std::shared_ptr<thread_pool> get_tp();
+
+    // Set global formatter. Each sink in each logger will get a clone of this object
+    void set_formatter(std::unique_ptr<formatter> formatter);
+
+    void enable_backtrace(size_t n_messages);
+
+    void disable_backtrace();
+
+    void set_level(level::level_enum log_level);
+
+    void flush_on(level::level_enum log_level);
+
+    template <typename Rep, typename Period>
+    void flush_every(std::chrono::duration<Rep, Period> interval) {
+        std::lock_guard<std::mutex> lock(flusher_mutex_);
+        auto clbk = [this]() { this->flush_all(); };  // each tick flushes all registered loggers
+        periodic_flusher_ = details::make_unique<periodic_worker>(clbk, interval);
+    }
+
+    std::unique_ptr<periodic_worker> &get_flusher() {
+        std::lock_guard<std::mutex> lock(flusher_mutex_);
+        return periodic_flusher_;  // NOTE(review): the reference outlives the lock taken above
+    }
+
+    void set_error_handler(err_handler handler);
+
+    void apply_all(const std::function<void(const std::shared_ptr<logger>)> &fun);
+
+    void flush_all();
+
+    void drop(const std::string &logger_name);
+
+    void drop_all();
+
+    // clean all resources and threads started by the registry
+    void shutdown();
+
+    std::recursive_mutex &tp_mutex();
+
+    void set_automatic_registration(bool automatic_registration);
+
+    // set levels for all existing/future loggers. global_level can be null if should not set.
+    void set_levels(log_levels levels, level::level_enum *global_level);
+
+    static registry &instance();
+
+    void apply_logger_env_levels(std::shared_ptr<logger> new_logger);
+
+private:
+    registry();  // private: the only instance is the one returned by instance()
+    ~registry();
+
+    void throw_if_exists_(const std::string &logger_name);
+    void register_logger_(std::shared_ptr<logger> new_logger);
+    void register_or_replace_(std::shared_ptr<logger> new_logger);
+    bool set_level_from_cfg_(logger *logger);  // NOTE(review): no definition in registry-inl.h
+    std::mutex logger_map_mutex_, flusher_mutex_;
+    std::recursive_mutex tp_mutex_;
+    std::unordered_map<std::string, std::shared_ptr<logger>> loggers_;
+    log_levels log_levels_;  // per-logger-name level overrides
+    std::unique_ptr<formatter> formatter_;
+    spdlog::level::level_enum global_log_level_ = level::info;
+    level::level_enum flush_level_ = level::off;
+    err_handler err_handler_;
+    std::shared_ptr<thread_pool> tp_;
+    std::unique_ptr<periodic_worker> periodic_flusher_;
+    std::shared_ptr<logger> default_logger_;
+    bool automatic_registration_ = true;
+    size_t backtrace_n_messages_ = 0;  // 0 means backtrace is not enabled on new loggers
+};
+
+}  // namespace details
+}  // namespace spdlog
+
+#ifdef SPDLOG_HEADER_ONLY
+#include "registry-inl.h"
+#endif
--- a/csrc/idle_offload/include/spdlog/details/synchronous_factory.h
+++ b/csrc/idle_offload/include/spdlog/details/synchronous_factory.h
@@ -0,0 +1,22 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#include "registry.h"
+
+namespace spdlog {
+
+// Default logger factory - creates synchronous loggers
+class logger;
+
+// Factory that constructs one Sink, wraps it in a spdlog::logger, and passes that logger to
+// the registry's initialize_logger() before returning it.
+struct synchronous_factory {
+    // Sink        - sink type to instantiate.
+    // logger_name - name given to the new logger (moved into it).
+    // args        - forwarded unchanged to Sink's constructor.
+    // Returns the newly created logger.
+    template <typename Sink, typename... SinkArgs>
+    static std::shared_ptr<spdlog::logger> create(std::string logger_name, SinkArgs &&...args) {
+        std::shared_ptr<Sink> sink_instance =
+            std::make_shared<Sink>(std::forward<SinkArgs>(args)...);
+        std::shared_ptr<spdlog::logger> created =
+            std::make_shared<spdlog::logger>(std::move(logger_name), std::move(sink_instance));
+        details::registry::instance().initialize_logger(created);
+        return created;
+    }
+};
+}  // namespace spdlog
--- a/csrc/idle_offload/include/spdlog/details/tcp_client-windows.h
+++ b/csrc/idle_offload/include/spdlog/details/tcp_client-windows.h
@@ -0,0 +1,217 @@
+// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
+// Distributed under the MIT License (http://opensource.org/licenses/MIT)
+
+#pragma once
+
+#define WIN32_LEAN_AND_MEAN
+// tcp client helper
+#include <spdlog/common.h>
+#include <spdlog/details/os.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+#include <windows.h>
+#include <winsock2.h>
+#include <ws2tcpip.h>
+
+#pragma comment(lib, "Ws2_32.lib")
+#pragma comment(lib, "Mswsock.lib")
+#pragma comment(lib, "AdvApi32.lib")
+
+namespace spdlog {
+namespace details {
+// TCP client helper for Windows (Winsock). Errors raised here are reported through
+// throw_spdlog_ex with a "tcp_sink - " prefix.
+//
+// The constructor calls WSAStartup and the destructor calls WSACleanup. No other member calls
+// WSACleanup, so each instance releases exactly the one Winsock reference it acquired.
+class tcp_client {
+    SOCKET socket_ = INVALID_SOCKET;
+
+    // Start Winsock 2.2, reporting failure through throw_winsock_error_.
+    static void init_winsock_() {
+        WSADATA wsaData;
+        const int rv = ::WSAStartup(MAKEWORD(2, 2), &wsaData);
+        if (rv != 0) {
+            // WSAStartup returns its error code directly; WSAGetLastError() must not be
+            // consulted here because Winsock may not be loaded when startup fails.
+            throw_winsock_error_("WSAStartup failed", rv);
+        }
+    }
+
+    // Report "tcp_sink - <msg>: <system text for last_error>" through throw_spdlog_ex.
+    // If the system has no text for the code, the numeric code is reported instead.
+    static void throw_winsock_error_(const std::string &msg, int last_error) {
+        char buf[512] = {};
+        const DWORD written = ::FormatMessageA(
+            FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, nullptr,
+            static_cast<DWORD>(last_error), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf,
+            static_cast<DWORD>(sizeof(buf) / sizeof(char)), nullptr);
+
+        if (written == 0) {
+            // FormatMessageA failed and left buf without a message; do not format it.
+            throw_spdlog_ex(fmt_lib::format("tcp_sink - {}: error code {}", msg, last_error));
+        } else {
+            throw_spdlog_ex(fmt_lib::format("tcp_sink - {}: {}", msg, buf));
+        }
+    }
+
+    // Put sockfd into blocking (true) or non-blocking (false) mode.
+    // Returns the ioctlsocket result: 0 on success, SOCKET_ERROR on failure.
+    static int set_blocking_mode_(SOCKET sockfd, bool blocking) {
+        u_long mode = blocking ? 0UL : 1UL;
+        return ::ioctlsocket(sockfd, FIONBIO, &mode);
+    }
+
+public:
+    tcp_client() { init_winsock_(); }
+
+    ~tcp_client() {
+        close();
+        ::WSACleanup();
+    }
+
+    bool is_connected() const { return socket_ != INVALID_SOCKET; }
+
+    // Close the socket if one is open; does nothing otherwise.
+    void close() {
+        if (socket_ != INVALID_SOCKET) {
+            ::closesocket(socket_);
+            socket_ = INVALID_SOCKET;
+        }
+    }
+
+    SOCKET fd() const { return socket_; }
+
+    // Connect sockfd to addr, waiting at most tv for completion.
+    // A zero tv performs a plain blocking connect. Otherwise the socket is switched to
+    // non-blocking mode for the attempt and switched back to blocking mode before returning.
+    // Returns 0 on success. On failure returns SOCKET_ERROR with the reason available from
+    // WSAGetLastError() (WSAETIMEDOUT when the wait expired).
+    int connect_socket_with_timeout(SOCKET sockfd,
+                                    const struct sockaddr *addr,
+                                    int addrlen,
+                                    const timeval &tv) {
+        // If no timeout requested, do a normal blocking connect.
+        if (tv.tv_sec == 0 && tv.tv_usec == 0) {
+            const int rv = ::connect(sockfd, addr, addrlen);
+            if (rv == SOCKET_ERROR && ::WSAGetLastError() == WSAEISCONN) {
+                return 0;
+            }
+            return rv;
+        }
+
+        // Switch to non-blocking mode
+        if (set_blocking_mode_(sockfd, false) == SOCKET_ERROR) {
+            return SOCKET_ERROR;
+        }
+
+        int rv = ::connect(sockfd, addr, addrlen);
+        const int connect_error = (rv == 0) ? 0 : ::WSAGetLastError();
+        if (rv == 0 || connect_error == WSAEISCONN) {
+            return set_blocking_mode_(sockfd, true) == SOCKET_ERROR ? SOCKET_ERROR : 0;
+        }
+        if (connect_error != WSAEWOULDBLOCK) {
+            // Real error: restore blocking mode, then re-publish the connect error so the
+            // caller does not read a code left behind by ioctlsocket.
+            if (set_blocking_mode_(sockfd, true) != SOCKET_ERROR) {
+                ::WSASetLastError(connect_error);
+            }
+            return SOCKET_ERROR;
+        }
+
+        // Wait until the socket is writable (connected), is flagged in the exception set
+        // (where Winsock reports a failed non-blocking connect), or the timeout expires.
+        fd_set wfds;
+        FD_ZERO(&wfds);
+        FD_SET(sockfd, &wfds);
+        fd_set efds;
+        FD_ZERO(&efds);
+        FD_SET(sockfd, &efds);
+
+        timeval wait_time = tv;  // local copy so select() can be given a non-const pointer
+        rv = ::select(0, nullptr, &wfds, &efds, &wait_time);
+        const int select_error = (rv == SOCKET_ERROR) ? ::WSAGetLastError() : 0;
+
+        // Restore blocking mode regardless of select result
+        if (set_blocking_mode_(sockfd, true) == SOCKET_ERROR) {
+            return SOCKET_ERROR;
+        }
+
+        if (rv == 0) {
+            ::WSASetLastError(WSAETIMEDOUT);
+            return SOCKET_ERROR;
+        }
+        if (rv == SOCKET_ERROR) {
+            ::WSASetLastError(select_error);
+            return SOCKET_ERROR;
+        }
+
+        int so_error = 0;
+        int len = static_cast<int>(sizeof(so_error));
+        if (::getsockopt(sockfd, SOL_SOCKET, SO_ERROR, reinterpret_cast<char *>(&so_error), &len) ==
+            SOCKET_ERROR) {
+            return SOCKET_ERROR;
+        }
+        if (so_error == 0 && FD_ISSET(sockfd, &efds) && !FD_ISSET(sockfd, &wfds)) {
+            // Defensive: an exception was signalled without a recorded error and the socket
+            // is not writable, so this must not be reported as a successful connect.
+            so_error = WSAECONNREFUSED;
+        }
+        if (so_error != 0 && so_error != WSAEISCONN) {
+            // connection failed
+            ::WSASetLastError(so_error);
+            return SOCKET_ERROR;
+        }
+
+        return 0;  // success
+    }
+
+    // Resolve host:port and connect to the first address that accepts, or throw on failure.
+    // Any existing connection is closed first.
+    // timeout_ms > 0 bounds each connect attempt and is also applied as the socket's
+    // send/receive timeout; timeout_ms <= 0 means no timeout (blocking connect).
+    void connect(const std::string &host, int port, int timeout_ms = 0) {
+        if (is_connected()) {
+            close();
+        }
+        struct addrinfo hints {};
+        ZeroMemory(&hints, sizeof(hints));
+
+        hints.ai_family = AF_UNSPEC;      // To work with IPv4, IPv6, and so on
+        hints.ai_socktype = SOCK_STREAM;  // TCP
+        hints.ai_flags = AI_NUMERICSERV;  // port passed as a numeric value
+        hints.ai_protocol = 0;
+
+        // Clamp so that a negative timeout cannot produce a negative timeval.
+        const int effective_timeout_ms = timeout_ms > 0 ? timeout_ms : 0;
+        timeval tv;
+        tv.tv_sec = effective_timeout_ms / 1000;
+        tv.tv_usec = (effective_timeout_ms % 1000) * 1000;
+
+        const auto port_str = std::to_string(port);
+        struct addrinfo *addrinfo_result = nullptr;
+        const auto rv = ::getaddrinfo(host.c_str(), port_str.c_str(), &hints, &addrinfo_result);
+        int last_error = 0;
+        if (rv != 0) {
+            // Winsock stays initialized here: the destructor owns the single WSACleanup call.
+            last_error = ::WSAGetLastError();
+            throw_winsock_error_("getaddrinfo failed", last_error);
+        }
+
+        // Try each address until we successfully connect(2).
+        for (auto *rp = addrinfo_result; rp != nullptr; rp = rp->ai_next) {
+            socket_ = ::socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol);
+            if (socket_ == INVALID_SOCKET) {
+                // Remember why and move on; the remaining addresses still need Winsock.
+                last_error = ::WSAGetLastError();
+                continue;
+            }
+            if (connect_socket_with_timeout(socket_, rp->ai_addr,
+                                            static_cast<int>(rp->ai_addrlen), tv) == 0) {
+                last_error = 0;
+                break;
+            }
+            last_error = ::WSAGetLastError();
+            ::closesocket(socket_);
+            socket_ = INVALID_SOCKET;
+        }
+        ::freeaddrinfo(addrinfo_result);
+        if (socket_ == INVALID_SOCKET) {
+            throw_winsock_error_("connect failed", last_error);
+        }
+        if (timeout_ms > 0) {
+            // SO_RCVTIMEO / SO_SNDTIMEO are given as a DWORD in milliseconds. Results are not
+            // checked: these options are applied on a best-effort basis.
+            const DWORD io_timeout_ms = static_cast<DWORD>(timeout_ms);
+            ::setsockopt(socket_, SOL_SOCKET, SO_RCVTIMEO,
+                         reinterpret_cast<const char *>(&io_timeout_ms), sizeof(io_timeout_ms));
+            ::setsockopt(socket_, SOL_SOCKET, SO_SNDTIMEO,
+                         reinterpret_cast<const char *>(&io_timeout_ms), sizeof(io_timeout_ms));
+        }
+
+        // set TCP_NODELAY (best effort, result not checked)
+        int enable_flag = 1;
+        ::setsockopt(socket_, IPPROTO_TCP, TCP_NODELAY, reinterpret_cast<char *>(&enable_flag),
+                     sizeof(enable_flag));
+    }
+
+    // Send exactly n_bytes of the given data.
+    // On error close the connection and throw.
+    void send(const char *data, size_t n_bytes) {
+        // ::send takes an int length, so larger payloads are sent in pieces of at most
+        // 0x7fffffff bytes (INT_MAX for the 32-bit int used on Windows).
+        const size_t max_chunk = static_cast<size_t>(0x7fffffff);
+        size_t bytes_sent = 0;
+        while (bytes_sent < n_bytes) {
+            const size_t remaining = n_bytes - bytes_sent;
+            const int chunk = static_cast<int>(remaining < max_chunk ? remaining : max_chunk);
+            const int send_flags = 0;
+            const auto write_result = ::send(socket_, data + bytes_sent, chunk, send_flags);
+            if (write_result == SOCKET_ERROR) {
+                // Capture the error before close(), which calls closesocket.
+                const int last_error = ::WSAGetLastError();
+                close();
+                throw_winsock_error_("send failed", last_error);
+            }
+
+            if (write_result == 0)  // (probably should not happen but in any case..)
+            {
+                break;
+            }
+            bytes_sent += static_cast<size_t>(write_result);
+        }
+    }
+};
+}  // namespace details
+}  // namespace spdlog
--- a/Show More
+++ b/Show More