Compare commits

...

131 Commits

Author SHA1 Message Date
starkwj
cf24a5846f add Dockerfile and readme 2026-01-05 09:32:26 +00:00
starkwj
135cc0a505 vllm-ascend vnpu v1 2025-12-26 07:37:35 +00:00
zhangyiming
2f1aed98cc [Doc] Update version policy to the latest. (#5071)
### What this PR does / why we need it?
[Doc] Update version policy to the latest.

Signed-off-by: menogrey <1299267905@qq.com>
2025-12-16 15:24:46 +08:00
zzzzwwjj
8c41770f1f [bugfix] fix fp32 trans nz (#5068)
### What this PR does / why we need it?
fix fp32 trans nz error, disable fp32 dtype trans nz.

Signed-off-by: zzzzwwjj <1183291235@qq.com>
2025-12-16 15:04:31 +08:00
wangxiyuan
11e6d6c291 [doc] update developer guide (#5060)
Update developer doc for v0.11.0-dev. This PR mainly picks developer doc
from main to v0.11.0-dev. All related Feature work with 0.11.0 already.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-12-16 14:09:52 +08:00
zhangyiming
e07abfaa75 [Doc] Add new contributors. (#5066)
### What this PR does / why we need it?
[Doc] Add new contributors.

Signed-off-by: menogrey <1299267905@qq.com>
2025-12-16 12:47:40 +08:00
zhangxinyuehfad
ca0823f238 [0.11.0][Bugfix] fix fastapi version (#5052)
### What this PR does / why we need it?
fix fastapi version

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-12-16 11:34:11 +08:00
Shanshan Shen
303c08aec9 [Doc] Update structured output doc with upstream link (#5058)
### What this PR does / why we need it?

Cherry-pick from main
https://github.com/vllm-project/vllm-ascend/pull/4015.

Currently, the usage of structured output feature in vllm-ascend is
totally the same as that in vllm.

Thus, IMO, it's better to remove this doc directly to avoid some case
that there are some changes in the upstream doc and we don't update our
doc in time, which can be misleading to users.

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-12-16 11:32:53 +08:00
Clorist33
2b5b309133 [Bugfix]Fix precision issues in moe_mlp (vllm-ascend v0.11.0-dev) (#5023)
### What this PR does / why we need it?
Use group_list[0] to replace group_diff[0] in function
"cumsum_group_list" (moe_mlp.py).
The purpose is to modify it to the correct logic of converting cumsum to
count.

### Does this PR introduce _any_ user-facing change?
No

Signed-off-by: tanqingshan (A)  <50050625@china.huawei.com>
Co-authored-by: tanqingshan (A) <50050625@china.huawei.com>
2025-12-16 08:40:03 +08:00
zhangxinyuehfad
87c0cfafa3 [0.11.0][Bugfix] fix fastapi version (#5048)
### What this PR does / why we need it?
fix fastapi version <0.124.0

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-12-15 23:51:38 +08:00
wangxiyuan
01a13a9b77 fix nz for quantization (#4943)
quantization ops rely on NZ by force, we should remove the nz check for it.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-12-12 14:54:41 +08:00
sunchendd
5932abc446 [Bugfix] Fix the Eagle3 inference failure issue. (#4721)
### What this PR does / why we need it?
Fix the Eagle3 inference failure issue.
error message: "EngineCore encountered an issue. See stack trace (above)
for the root cause."

Fixes https://github.com/vllm-project/vllm-ascend/issues/4323

### How was this patch tested?
`vllm serve /nfs/1_AscendPackage/05_weights_public/Qwen3-32B \
--served-model-name Qwen3-32B \ -tp 4 \ --host "0.0.0.0" \ --port "8000"
\ --trust-remote-code \ --speculative-config
'{"method":"eagle3","model":"/home/scd/qwen3_32b_eagle3/","num_speculative_tokens":4,"draft_tensor_parallel_size":1}'
\ --max-num-batched-tokens 4096 \ --max-model-len 4096`

```
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "Qwen3-32B",
        "prompt": "hi, where is the capital of France?",
        "max_tokens": 10,
        "temperature": 0
    }' | python3 -m json.tool
```

vLLM version: v0.11.0
vLLM-ascend version: v0.11.0rc2

Signed-off-by: 17764591921 <sunchend@outlook.com>
2025-12-12 14:52:29 +08:00
Clorist33
4f0dddc9ee [Bugfix] bugfix for moe_mlp in vllm-ascend/v0.11.0-dev (#4885)
### What this PR does / why we need it?
This PR fixes a bug in the moe_mlp module by correcting the arguments
passed to the torch_npu.npu_dequant_swiglu_quant function. It properly
converts group_list from a cumulative sum to counts for the group_index
parameter.

### Does this PR introduce _any_ user-facing change?
No


- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/main

---------

Signed-off-by: tanqingshan (A)  <50050625@china.huawei.com>
Signed-off-by: tanqingshan (A) <50050625@china.huawei.com>
Co-authored-by: tanqingshan (A) <50050625@china.huawei.com>
Co-authored-by: Mercykid-bash <ruanche0218@gmail.com>
2025-12-12 14:51:47 +08:00
Slightwind
9c0ad46c1a [0.11.0][Bugfix] Remove the ZMQ communication setup on the D node (#4916)
In the PD separation scenario, the D node does not need to perform get
operations, and therefore does not need to create ZeroMQ (ZMQ)
communication.
---------

Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2025-12-12 14:37:49 +08:00
1092626063
ceadc2788d Revert "[refactor]support gatingtopk operator generalization (#4356)" (#4873)
This reverts commit c4a11a745a.

ops npu_gating_top_k caused Qwen3-30B precision problem, so revert it.

Signed-off-by: 1092626063 <1092626063@qq.com>
2025-12-10 15:45:20 +08:00
linfeng-yuan
9a144bc7be [Docs][0.11.0] delete AIV env variables in DSV32 documentation (#4833)
### What this PR does / why we need it?
Delete wrong configuration in deepseek v3.2 documentation.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
NA.

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-12-09 15:53:53 +08:00
Mercykid-bash
8f45f9ce29 BugFix: Resolve shape mismatch in eplb update and calculation issues in quant_apply_mlp (#4777)
## Description
This PR addresses two key issues in the MoE module when redundant
experts are enabled, and fixes a calculation precision bug in the
forward inference of quantized MLP:

### 1. Shape Mismatch in EPLB Expert Map Update
- **Root Cause**: 
When redundant experts are turned on, a shape inconsistency occurs
during the expert map update in `Vllm_apaptor`:
- The shape of `self.expert_map_per_layer[layer_id]` is
`[num_physical_experts,]` (aligned with physical expert count).
- The shape of `updated_expert_map` is `[num_logical_experts,]` (aligned
with logical expert count).
- Indices in `self.expert_map_per_layer[layer_id]` that exceed the
logical expert count cannot be properly mapped, leading to tensor shape
mismatch errors.
- The same shape mismatch exists in the `log2phy` map update (between
`self.log2phy_map_per_layer[layer_id]` and `updated_log2phy_map`).

- **Fix**:
- Fix the shape initialization of `expert_map_per_layer` and
`log2phy_map_per_layer` to be consistently set to
`[num_physical_experts,]` across the module lifecycle.
- Align the shape of `updated_expert_map` and `updated_log2phy_map` with
the pre-initialized physical-expert-sized tensors during update
operations, ensuring shape consistency for index mapping.

### 2. Calculation Precision Issue in Quantized MoE MLP Forward
Inference
- **Root Cause**:
In the forward pass of `moe_mlp`, the
`torch_npu.npu_dequant_swiglu_quant` operator only accepts group lists
in **Count format** as input. However, the group list provided by
`quant_apply_mlp` was in **Cumsum format**, which caused operator input
format mismatch and degraded calculation precision.

- **Fix**:
- Convert the cumsum-formatted group list from `quant_apply_mlp` to
Count format before passing it to `torch_npu.npu_dequant_swiglu_quant`.
- Ensure the input format of the dequantization operator meets its
requirements, restoring the expected calculation precision for quantized
MoE MLP layers.

## Impact
- Resolves shape mismatch errors in EPLB expert/log2phy map updates when
redundant experts are enabled, ensuring stable expert routing.
- Fixes quantized MoE MLP forward precision issues on NPU, aligning
operator input formats with NPU kernel requirements.
- No breaking changes to existing interfaces; the fixes are
backward-compatible for scenarios without redundant experts enabled.

---------

Signed-off-by: Che Ruan <cr623@ic.ac.uk>
Signed-off-by: Mercykid-bash <ruanche0218@gmail.com>
Co-authored-by: Che Ruan <cr623@ic.ac.uk>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-12-09 15:46:58 +08:00
linfeng-yuan
695e5c9ebc [0.11.0][ops] npu_top_k_top_p supports k and p only (#4153)
### What this PR does / why we need it?
With CANN 8.3 and corresponding PTA 2.7.1, `npu_top_k_top_p` supports
passing only k (1<=k<=1024) and p separately.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
E2E performance test with only `top_k` and `p` separately. This PR gains
0.2ms improvements in TPOT with `batch_size=16`.

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-12-09 15:45:40 +08:00
Li Wang
4588d1f215 [CI] Use arm node for unit tests (#4819)
### What this PR does / why we need it?
Use arm node for unit tests

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-12-09 15:45:14 +08:00
linfeng-yuan
e0757dc376 [0.11.0]fix the configuration conflicts in documentation (#4824)
### What this PR does / why we need it?
Fix configuration errors in our documentation.

### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
NA.

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-12-09 15:37:06 +08:00
zhangxinyuehfad
033e3557cc [cherry-pick]fix qwen3vl mrope op (#4484) (#4811)
### What this PR does / why we need it?
The Qwen2.5-VL mrope precision problem will be solved once this PR is
merged.
### Does this PR introduce _any_ user-facing change? No
### How was this patch tested?
Test on G8600 with textVQA dataset

- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2

---------

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: shaopeng-666 <lishaopeng21@huawei.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-12-09 11:07:32 +08:00
Levi
9862a23985 【0.11.0-dev】optimization of kimi-k2 in cann8.3 (#4555)
### What this PR does / why we need it?
In cann8.3, the npu_moe_gating_top_k operator supports an expert count of
384, so kimi can use the operator to get better performance.
---------

Signed-off-by: Levi-JQ <yujinqi2@huawei.com>
Co-authored-by: Levi-JQ <yujinqi2@huawei.com>
2025-12-09 08:49:15 +08:00
zhangxinyuehfad
0d094531b4 [bugfix] Fixed the bug in retrieving the quantization method for mlp.… (#4797)
When retrieving the quantization method for MOE (e.g., the quantization
file of DeepSeek v3.2 exp do not match the model's naming convention in
eager mode), a KeyError is raised: "model.layers.3.mlp.experts.weight
not in self.quant_description". However the quantization file is like :
```bash
  "model.layers.3.mlp.experts.255.gate_proj.weight": "W8A8_DYNAMIC",
  "model.layers.3.mlp.experts.255.gate_proj.weight_scale": "W8A8_DYNAMIC",
  "model.layers.3.mlp.experts.255.gate_proj.weight_offset": "W8A8_DYNAMIC",
  "model.layers.3.mlp.experts.255.down_proj.weight": "W8A8_DYNAMIC",
  "model.layers.3.mlp.experts.255.down_proj.weight_scale": "W8A8_DYNAMIC",
  "model.layers.3.mlp.experts.255.down_proj.weight_offset": "W8A8_DYNAMIC",
  "model.layers.3.mlp.experts.255.up_proj.weight": "W8A8_DYNAMIC",
  "model.layers.3.mlp.experts.255.up_proj.weight_scale": "W8A8_DYNAMIC",
  "model.layers.3.mlp.experts.255.up_proj.weight_offset": "W8A8_DYNAMIC",
```

Co-Authored-By: yangqinghao-cmss <yangqinghao_yewu@cmss.chinamobile.com>

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: yangqinghao-cmss <yangqinghao_yewu@cmss.chinamobile.com>
2025-12-09 08:47:19 +08:00
Levi
4e728f1f40 [Bugfix] fix qwen3-vl-moe shape ERROR during the _prepare_inputs phase under high concurrency. (#4658)
### What this PR does / why we need it?
Earlier we fixed a similar issue for qwen2.5-vl 【
https://github.com/vllm-project/vllm-ascend/issues/4430 】, and then the
multimodal models in vllm v0.11.0 should all have this problem. Here, we
have specifically proposed a fix for qwen3-vl-moe.

---------

Signed-off-by: Levi-JQ <yujinqi2@huawei.com>
Co-authored-by: Levi-JQ <yujinqi2@huawei.com>
2025-12-08 19:30:16 +08:00
Wang Yixuan
d412565ec9 [Cherry-pick]bmm_transpose to v011dev (#3995)
### What this PR does / why we need it?
Add a custom op to accelerate the deepseek model. The fused op combines
bmm and transpose, and is applied to the mla module.
Cherry-picked from commit c68ddc11ce

### Does this PR introduce _any_ user-facing change?
No

---------

Signed-off-by: hust17yixuan <303660421@qq.com>
2025-12-08 19:22:14 +08:00
Angazenn
6391f0625f [v0.11.0-dev][bugfix] Add branch for stream up-lifting in update_attn_params (#4437)
### What this PR does / why we need it?
#3985 moved stream context initialization before the for-loops to improve
performance. However, we found that this might cause a potential accuracy
drop when used with pd disaggregation. Thus we partly revert this change
when using pd disaggregation, and we shall fix this bug in the future.

### Does this PR introduce _any_ user-facing change?
No.


---------

Signed-off-by: Angazenn <supperccell@163.com>
2025-12-08 08:54:46 +08:00
Li Wang
2598124e67 [Image] Correcting the vllm tag of the openeuler image on the A2 device. (#4745)
### What this PR does / why we need it?
Corrected the vllm tag, which should have been in v0.11.0


Signed-off-by: wangli <wangli858794774@gmail.com>
2025-12-06 10:55:22 +08:00
offline893
350999c4ef [Bugfix]Fix eplb enable when using mtp float weights. (#4576)
### What this PR does / why we need it?
Fix eplb enable when using mtp float weights. It will be removed when
eplb supports mtp and float weights.

### How was this patch tested?
Deepseek-V3 + MTP + EPLB in A3.
---------

Signed-off-by: offline0806 <3337230449@qq.com>
Signed-off-by: offline893 <158537145+offline893@users.noreply.github.com>
Co-authored-by: offline0806 <3337230449@qq.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-12-05 21:15:32 +08:00
1092626063
c4a11a745a [refactor]support gatingtopk operator generalization (#4356)
### What this PR does / why we need it?
This pr is cherry-pick from :
https://github.com/vllm-project/vllm-ascend/pull/2958 and
https://github.com/vllm-project/vllm-ascend/pull/4340

Past:
npu_moe_gating_top_k can only support 'group_count=256' pattern

Now:
1、npu_moe_gating_top_k support all size of group_count
2、the functionality of `torch_npu.npu_moe_gating_top_k_softmax` are
included in `torch_npu.npu_moe_gating_top_k`

CANN: depends on 8.3.RC1

Performance:
1. GLM4.5-w8a8, TPS improve 6%
2. Qwen3, the same as before

---------

Signed-off-by: 1092626063 <1092626063@qq.com>
2025-12-04 20:10:13 +08:00
LI SHENGYONG
593a96056c 【EPLB】Eplb Redundant Experts Bugfix (#4232)
### What this PR does / why we need it?
Redundant experts bugfix
The calculation logic for redundant experts has been fixed, allowing the
correct number of redundant experts to be calculated using the map.
Therefore, there is no longer a need to set the redundant expert
parameter when passing the map.

### Does this PR introduce _any_ user-facing change?
After configuring the path for experts_map, users do not need to
configure init_redundancy_expert.

### How was this patch tested?
The accuracy of EPLB was tested with and without the use of redundant
experts.

---------

Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
2025-12-03 12:00:05 +08:00
Mengqing Cao
b6d63bbd52 [v0.11.0-dev][CI] Fix ngram lacking of input arg dummy_compute_logits error (#4648)
### What this PR does / why we need it?
Fix ngram lacking of input arg `dummy_compute_logits` error

### How was this patch tested?
CI passed with existing test.

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-12-03 09:22:07 +08:00
Levi
865f1f7fc8 [Bugfix] Resolve the interface compatibility issue of get_input_embeddings in MM (#4638)
### What this PR does / why we need it?
Resolve the interface compatibility issue of get_input_embeddings in MM,
because the get_input_embeddings func of other model does not have the
is_multimodal parameter

---------

Signed-off-by: Levi-JQ <yujinqi2@huawei.com>
Co-authored-by: Levi-JQ <yujinqi2@huawei.com>
2025-12-02 22:21:47 +08:00
Levi
3b4cb23616 [Bugfix] fix qwen2.5-vl-72b shape ERROR during the _prepare_inputs phase under high concurrency. (#4553)
### What this PR does / why we need it?
qwen2.5-vl-72b reports a shape ERROR during the _prepare_inputs phase
under high concurrency【 issue
https://github.com/vllm-project/vllm-ascend/issues/4430 】

This PR fix it.

The related PR in main branch
:https://github.com/vllm-project/vllm-ascend/pull/3612

The related commit in vllm :
17c540a993/vllm/model_executor/models/interfaces.py

【The _get_text_embeddings function has been refactored to
interfaces.py in vllm.】

Signed-off-by: Levi-JQ <yujinqi2@huawei.com>
Co-authored-by: Levi-JQ <yujinqi2@huawei.com>
2025-12-02 14:20:45 +08:00
Zetong Li
52abd47f8c [Bugfix][SHM] Use writer lock by default and remove redundant env (#4117)
### What this PR does / why we need it?
This PR aims to remove env introduced by #3988 and use lock by default.
As described in https://github.com/vllm-project/vllm/issues/27858, we
have tested the writer lock method in various scenarios and the
performance is almost unaffected. Therefore, we believe that it would be
safe to enable the lock by default and remove the redundant env
`SHM_BARRIER` now.

After discussion, we decide to preserve env and set it as true by
default.

### Does this PR introduce _any_ user-facing change?
`SHM_BARRIER` is set as true by default.

### How was this patch tested?
by ci

---------

Signed-off-by: Zetong Li <slippersss@126.com>
2025-12-01 22:27:01 +08:00
Li Wang
76d0ba4342 [Image][Build] Cherry pick #4062 from main (#4506)
### What this PR does / why we need it?
This patch aims to integrate the mooncake
[v0.3.7.2.post2](https://github.com/kvcache-ai/Mooncake/releases/tag/v0.3.7.post2)
to vllm-ascend images

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-12-01 11:39:40 +08:00
zouyida2052
2b4f7a5016 [cherry-pick pr-4254] bugfix for mtp>1 when lm_head_tp>1 (#4360)
### What this PR does / why we need it?
Previously, the dummy run executed compute_logits only once, regardless
of num_speculative_tokens. This caused execute_model to hang on
compute_logits when lm head tensor parallelism exceeded 1. The fix
ensures compute_logits executes correctly during dummy run, matching
num_speculative_tokens.

Signed-off-by: zouyida2052 <zouyida2002@gmail.com>
2025-12-01 11:11:15 +08:00
LI SHENGYONG
cd9f5c0611 [bugfix] dep ineffective (#4416)
### What this PR does / why we need it?
The expert mapping table and weights of the dynamic EPLB were not
updated, causing the accuracy to be correct but not effective. This bug
has now been fixed.

If it was tested in a way different from regular unit tests, please
clarify how you tested step by step, ideally copy and paste-able, so
that other reviewers can test and check, and descendants can verify in
the future.

Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
2025-11-29 15:19:11 +08:00
henryxuxu0716
71acc8ddeb For nz unset in bf16&fp16 (#4495)
<!--  Thanks for sending a pull request!

BEFORE SUBMITTING, PLEASE READ
https://docs.vllm.ai/en/latest/contributing/overview.html

-->
### What this PR does / why we need it?
disable NZ for float weight case. This is only a quick fix for dev
branch.

For main branch, we'll consider more case to make it more common.


### Does this PR introduce _any_ user-facing change?
<!--
Note that it means *any* user-facing change including all aspects such
as API, interface or other behavior changes.
Documentation-only updates are not considered user-facing changes.
-->

### How was this patch tested?
qwen2.5 32B
<img width="441" height="221" alt="image"
src="https://github.com/user-attachments/assets/7ae18ffd-1ce2-43d9-9960-be45250ad0da"
/>

---------

Signed-off-by: 刘哲续 <liuzhexu1@huawei.com>
Co-authored-by: 刘哲续 <liuzhexu1@huawei.com>
2025-11-28 17:32:25 +08:00
Zhu Yi Lin
96c362361e [0.11.0][TEST] Delete Comment (#4428)
### What this PR does / why we need it?
delete chinese comment
pick from https://github.com/vllm-project/vllm-ascend/pull/4427

### Does this PR introduce _any_ user-facing change?
no

Signed-off-by: GDzhu01 <809721801@qq.com>
2025-11-25 21:39:36 +08:00
zhangxinyuehfad
a686f2962a [0.11.0][Bugfix] fix e2e full test (#4424)
### What this PR does / why we need it?
pin Transformer version to 4.57.1 fix 'dict' object has no attribute
'model_type'

https://github.com/vllm-project/vllm-ascend/actions/runs/19660859460/job/56306822464

picked from https://github.com/vllm-project/vllm-ascend/pull/4423


Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-11-25 21:21:42 +08:00
Shanshan Shen
cdaf7f4a51 [MM][Bugfix] Minor fix for VL model verification (#4385)
### What this PR does / why we need it?

To fix ops test, where `model_config` has been set to `None` and doesn't
have an `hf_config` attribute, we have added a check for `model_config` to
guarantee it is not `None_Type`.

cherry-pick from main:
https://github.com/vllm-project/vllm-ascend/pull/4384.


Signed-off-by: shen-shanshan <467638484@qq.com>
2025-11-25 20:36:32 +08:00
wujinyuan1
386a85eccc [Bugfix]Fix the hang issue of multimodal model when running with DP>1 (#4393)
### What this PR does / why we need it?
When cudagraph_mode is set to FULL_DECODE_ONLY, if dp > 1, the dummy-run
process will be triggered. When calling the update_attn_params function,
the num_tokens parameter needs to be passed, and this value is obtained
through positions.shape[0]. However, the multimodal model uses mRope
(multi-dimensional rotary positional embeddings), which causes the shape
of positions to be 2. As a result, the value obtained from
positions.shape[0] is incorrect. We solve this problem by replacing
positions.shape[0] with num_tokens.

### Does this PR introduce _any_ user-facing change?
NO

### How was this patch tested?
vLLM version: v0.11.0rc3
vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: wujinyuan1 <wjy9595@qq.com>
Co-authored-by: wujinyuan1 <wjy9595@qq.com>
2025-11-25 09:32:22 +08:00
weichen
a3164ac372 [v0.11.0][Bugfix][MoE] enable force_load_balance in aclgraph (#4367)
### What this PR does / why we need it?
Enable force_load_balance in aclgraph, solving OOM issues.
pick from https://github.com/vllm-project/vllm-ascend/pull/4366
### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
e2e & ut

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
2025-11-25 09:16:57 +08:00
mazhixin000
75452abe1e [Doc][v11.0-dev][cherry-pick]Add single node PD disaggregation instructions (#4370)
### What this PR does / why we need it?

add single node PD disaggregation instructions for Qwen 2.5VL model.


### Does this PR introduce _any_ user-facing change?
no


---------

Signed-off-by: mazhixin <mazhixin7@huawei.com>
Signed-off-by: mazhixin000 <mazhixinkorea@163.com>
Co-authored-by: mazhixin <mazhixin7@huawei.com>
2025-11-24 17:23:11 +08:00
wangxiyuan
a2e4c3fe78 Revert "[cherry-pick][refactor]support gatingtopk operator generalization (#4050)" (#4352)
This reverts commit c87a77e8b4.

it breaks ops e2e test

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-11-21 23:03:20 +08:00
SILONG ZENG
5ad0ccdc31 [v0.11.0]Upgrade cann to 8.3.rc2 (#4332)
### What this PR does / why we need it?
Upgrade CANN to 8.3.rc2

Signed-off-by: MrZ20 <2609716663@qq.com>
2025-11-21 22:48:57 +08:00
LI SHENGYONG
0f9025cceb [EPLB] Eplb Verify Fix (#4334)
### What this PR does / why we need it?
Eplb Verify Fix
---------

Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
Signed-off-by: LI SHENGYONG <49200266+shenchuxiaofugui@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-11-21 18:18:15 +08:00
Ting FU
97ffb9120f [CI] Defaultly compile vllm with multimodal audio feature in dockerfile (#4324) (#4341)
### What this PR does / why we need it?
For better usability, compile vllm with the multimodal audio feature in the
dockerfile by default.

Image size will increase only 2.xM.

Signed-off-by: Ting FU <futing10@huawei.com>
2025-11-21 17:53:00 +08:00
Li Wang
218bc70f6f [CI] Remove redundant workflows (#4335)
### What this PR does / why we need it?
Remove redundant workflows, just maintain a separate workflow which
setting up on the main branch to control the execution of each branch,
instead of running each branch simultaneously, thus reducing resource
waste.


Signed-off-by: wangli <wangli858794774@gmail.com>
2025-11-21 16:48:35 +08:00
Shanshan Shen
70f076331f [MM][Bugfix] Add error log for VL models when enabling FLASHCOMM (#4222)
### What this PR does / why we need it?

Add error log for VL models when enabling
`VLLM_ASCEND_ENABLE_FLASHCOMM1=1` or `VLLM_ASCEND_ENABLE_FLASHCOMM=1`
(for backward compatibility).

This is a temporary fix for
https://github.com/vllm-project/vllm-ascend/issues/4132.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-11-21 15:04:35 +08:00
LI SHENGYONG
c94b38c82e [Readme] EPLB Support Scenarios (#4315)
### What this PR does / why we need it?
Add information on the scope of EPLB support.

---------

Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
2025-11-21 14:25:39 +08:00
Angazenn
9c6d0b422c [v0.11.0-dev][misc]change default capture size for Qwen3-MoE when using full dp (#4205)
### What this PR does / why we need it?
This is the dev version of #4199.
Currently, the default `cudagraph_capture_size` in vLLM is `[1, 2, 4 ,8
,16 ,24 ,... , max_capture_size]`. However, this is not always the best
choice on different situations. This PR aims to change the default
setting when running Qwen3-MoE on full dp (`dp_size > 1` && `tp_size ==
1`) setting, which is usually applied in Large-Scale EP.
old :
`[1, 2, 4 ,8 ,16 ,24 ,... , max_capture_size]`
new:
`[1, 2, 5 ,10 ,15, 16 ,24 ,... , max_capture_size]`
This is mainly because the performance of `_npu_paged_attention` op
degrades dramatically on old settings. We hope to provide better
performance if users do not set specific `cudagraph_capture_size`.
### Does this PR introduce _any_ user-facing change?
The default `cudagraph_capture_size` is modified in above cases.
However, if `cudagraph_capture_size` has already set by users, this PR
won't have any influence on this.

### How was this patch tested?

- vLLM version: v0.11.0
- vLLM main:
2918c1b49c

---------

Signed-off-by: Angazenn <supperccell@163.com>
2025-11-21 11:19:11 +08:00
shaopeng-666
b6d59bdea2 cherry pick from pr 4270 (#4285)
### What this PR does / why we need it?
avoid mrope fusion op when running qwen25vl on x86 machine

---------

Signed-off-by: 李少鹏 <lishaopeng21@huawei.com>
2025-11-19 22:32:02 +08:00
MengLong Chen
277670730c [Bugfix][Aclgraph] failed to update graph task (#4282)
### What this PR does / why we need it?
bugfix the error of full graph aclgraph


Signed-off-by: chenmenglong <chenmenglong1@huawei.com>
2025-11-19 21:30:48 +08:00
1092626063
c87a77e8b4 [cherry-pick][refactor]support gatingtopk operator generalization (#4050)
### What this PR does / why we need it?
pick from : https://github.com/vllm-project/vllm-ascend/pull/2958
Past:
npu_moe_gating_top_k can only support 'group_count=256' pattern

Now:
1、npu_moe_gating_top_k support all size of group_count
2、the functionality of `torch_npu.npu_moe_gating_top_k_softmax` are
included in `torch_npu.npu_moe_gating_top_k`

CANN: depends on 8.3.RC1

Performance:
1. GLM4.5-w8a8, TPS improve 6%
2. Qwen3, the same as before


Signed-off-by: 1092626063 <1092626063@qq.com>
2025-11-19 10:39:28 +08:00
liziyu
ddf3e75800 [Cherry-pick] [0.11.0] pd proxy support ipv6 and fix proxy (#4242)
### What this PR does / why we need it?
pd proxy support ipv6, mooncake connector check whether the IPv6 address
is used and notify the user.

---------

Signed-off-by: liziyu <liziyu16@huawei.com>
2025-11-18 16:33:00 +08:00
Icey
378e92a2a2 [Cherry-pick][0.11.0] Adapted to torch_npu.npu_fused_infer_attention_score (#4202)
### What this PR does / why we need it?
Fixes a compatibility bug with torch_npu.npu_fused_infer_attention_score
which is described in
https://github.com/vllm-project/vllm-ascend/issues/4020.
@momo609 tells us this solution.
cherry-pick: https://github.com/vllm-project/vllm-ascend/pull/4025

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with new added/existing test.

Signed-off-by: Icey <1790571317@qq.com>
2025-11-17 10:56:23 +08:00
zhangyiming
a7eb42cf0a [v0.11.0-dev][Bugfix][cherry-pick]bugfix for weight load of kimi-k2 (#4190)
### What this PR does / why we need it?
This is cherry-pick from #3798 

Fix kimi-k2 start bug, weight load
ERROR:https://github.com/vllm-project/vllm-ascend/issues/3785

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0rc3
- vLLM main:
c9461e05a4

---------

Signed-off-by: Levi-JQ <yujinqi2@huawei.com>
Signed-off-by: menogrey <1299267905@qq.com>
Co-authored-by: Levi <54832289+Levi-JQ@users.noreply.github.com>
Co-authored-by: Levi-JQ <yujinqi2@huawei.com>
Co-authored-by: zhaozx-cn <zhaozx2116@163.com>
2025-11-14 15:43:22 +08:00
weichen
51e5806d76 [0.11.0-dev][Bugfix][EPLB] Quick fix for missing log2phy conversion (#4150)
### What this PR does / why we need it?
Quick fix for missing log2phy conversion in MC2 token_dispatcher, which
has been already fixed in main branch
https://github.com/vllm-project/vllm-ascend/pull/3512.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
e2e & ut

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
2025-11-13 14:32:40 +08:00
zhaozx-cn
cd652acb65 [BugFix] Fix kv_no_split not contiguous (#3711)
allgather needs contiguous data, but the split operation returns non-contiguous
data.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: zhaozx-cn <zhaozx2116@163.com>
2025-11-13 11:29:37 +08:00
Angazenn
28a15299ea [cherry-pick][v0.11.0-dev][bugfix] Change seq_lens in dummy attn_metadata to max_query_len (#4099)
<!--  Thanks for sending a pull request!

BEFORE SUBMITTING, PLEASE READ
https://docs.vllm.ai/en/latest/contributing/overview.html

-->
### What this PR does / why we need it?
This is a cherry-pick from #4097.
Currently, we set `seq_lens` in dummy attn_metadata to be
`max_model_len` to get max workspace for attention during capturing.
However, setting it consistently to `max_model_len` causes dummy_run
to execute a long attention when running actual inference. For example,
if there is a single req with `seq_lens` as [8] but `max_model_len` is
131072, the whole process will be slowed down by dummy_run as it executes a
fake long-seq attention. Therefore, we instead set it to max_query_len,
which is also consistent with the vLLM gpu implementation.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
<!--
CI passed with new added/existing test.
If it was tested in a way different from regular unit tests, please
clarify how you tested step by step, ideally copy and paste-able, so
that other reviewers can test and check, and descendants can verify in
the future.
If tests were not added, please describe why they were not added and/or
why it was difficult to add.
-->

---------

Signed-off-by: Angazenn <supperccell@163.com>
2025-11-12 20:32:50 +08:00
zhangxinyuehfad
7732a89fd9 [v0.11.0][UT][Fixbug] Fix UT test (#4151)
### What this PR does / why we need it?
Fix UT test
Backport: https://github.com/vllm-project/vllm-ascend/pull/4116

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-11-12 16:55:18 +08:00
zhaomingyu13
650ce8ad19 [0.11.0][Bugfix] Fix ngram precision issue and open e2e ngram test (#4092)
### What this PR does / why we need it?
Fix ngram precision issue and open e2e ngram test
---------

Signed-off-by: Icey <1790571317@qq.com>
Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com>
Signed-off-by: zhaomingyu13 <zhaomingyu13@h-partners.com>
Co-authored-by: Icey <1790571317@qq.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
2025-11-11 09:58:03 +08:00
Angazenn
2069bef449 [v0.11.0-dev][bugfix] Fix a bug in wrongly set npu_stream (#4106)
### What this PR does / why we need it?
This pr fixes a bug introduced in #3985, which set wrong npu_stream
(possibly by mistakes in cherry-pick). I correct it and make
`update_attn_params` consistent to main branch.

### Does this PR introduce _any_ user-facing change?
No.

Signed-off-by: Angazenn <supperccell@163.com>
2025-11-11 09:16:41 +08:00
Icey
c5fe179cef [0.11.0] [Cherry-pick #4058] Fixes Qwen3-Next enable nz accuracy problem (#4056)
### What this PR does / why we need it?
- Fixes Qwen3-Next enable nz accuracy problem

---------

Signed-off-by: wxsIcey <1790571317@qq.com>
Signed-off-by: Icey <1790571317@qq.com>
2025-11-10 20:56:39 +08:00
rjg-lyh
ebd45b6596 [V0.11.0][Core] Restore scheduling logic under default configuration (#4094)
### What this PR does / why we need it?
Cherry-pick #3967 from main branch. This PR reverts the changes
introduced in PR #2894. Initially, due to performance issues with the
older version of the chunked prefill ops, the default behavior was to
use the Ascend scheduler to disable the chunked prefill feature.
However, with the improvements in the performance of the new chunked
prefill ops, this interception strategy has been removed. This change
also aligns with the community's default configuration behavior.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with new added/existing test.

Signed-off-by: rjg-lyh <1318825571@qq.com>
2025-11-10 20:02:23 +08:00
XiaoxinWang
c3c9138719 [Perf] Move attention update stream out of loop to optimize performance (#3985)
### What this PR does / why we need it?
In the `update_*attn_params` functions, the
`torch.npu.stream(update_stream)` context manager was previously located
inside the for-loop that updates parameters for each layer. This
resulted in redundant stream initiations for every layer, adding
unnecessary overhead.

This commit refactors the code by moving the stream context manager to
wrap the entire for-loop. This ensures that the update stream is
initiated only once per function call, rather than for each layer. This
change reduces 90us in each decode model.
update stream in every layer:
<img width="1720" height="383" alt="image"
src="https://github.com/user-attachments/assets/70e4cb69-5bc1-4180-a67d-c99132134be6"
/>

remove update stream in every layer:
<img width="1269" height="175" alt="image"
src="https://github.com/user-attachments/assets/0e290edb-b0ce-48fe-b032-1b924ade6ae5"
/>

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.11.0
- vLLM main:
83f478bb19

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
2025-11-10 17:18:45 +08:00
zhangxinyuehfad
d913f9474b [0.11.0][Fix] Fix Qwen2-Audio-7B-Instruct accuracy test (#4018)
### What this PR does / why we need it?

Fix Qwen2-Audio-7B-Instruct accuracy test

Backport:https://github.com/vllm-project/vllm-ascend/pull/4017

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-11-10 11:54:30 +08:00
hucong
7ea17fbee3 [0.11.0][BugFix] Improve the performance of prefixcache features (#4021)
### What this PR does / why we need it?
cherry-pick from https://github.com/vllm-project/vllm-ascend/pull/4022

The code bug caused an empty bubble. When the npu_paged_cache_load
operator was called, it forcibly transferred seq_len2 to the device,
which triggered synchronization and interrupted the CPU operator's
launch stream.


---------

Signed-off-by: underfituu <hzhucong@163.com>
2025-11-10 11:51:34 +08:00
wangxiaoteng888
c2d58c0655 [P/D][BugFix][v0.11.0-dev]Fix proxy format processing errors & Layerwise connector performance optimization (#4069)
### What this PR does / why we need it?
1.Fix proxy format processing errors.
2.Layer-wise connector performance optimization

Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
2025-11-09 09:55:10 +08:00
wangx700
55e37f5041 [v0.11.0][Bugfix] fix sleepmode level2 e2e test (#4023)
<!--  Thanks for sending a pull request!

BEFORE SUBMITTING, PLEASE READ
https://docs.vllm.ai/en/latest/contributing/overview.html

-->
### What this PR does / why we need it?
<!--
- Please clarify what changes you are proposing. The purpose of this
section is to outline the changes and how this PR fixes the issue.
If possible, please consider writing useful notes for better and faster
reviews in your PR.

- Please clarify why the changes are needed. For instance, the use case
and bug description.

- Fixes #
-->
fix sleepmode level2 e2e test

### Does this PR introduce _any_ user-facing change?
<!--
Note that it means *any* user-facing change including all aspects such
as API, interface or other behavior changes.
Documentation-only updates are not considered user-facing changes.
-->
no

### How was this patch tested?
<!--
CI passed with new added/existing test.
If it was tested in a way different from regular unit tests, please
clarify how you tested step by step, ideally copy and paste-able, so
that other reviewers can test and check, and descendants can verify in
the future.
If tests were not added, please describe why they were not added and/or
why it was difficult to add.
-->
use e2e tests

Signed-off-by: wangx700 <wangxin700@huawei.com>
2025-11-08 14:11:15 +08:00
tingfu
f9842560cb [0.11.0][Perf] Add padding vision tower for Qwen2_5_Omni (#4041)
### What this PR does / why we need it?
This PR replaces the vision tower in the Qwen2.5-Omni-Thinker model,
Qwen2_5_VisionTransformer, with AscendQwen2_5_VisionTransformer, which
uses QKV padding to improve performance.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: Ting FU <futing10@huawei.com>
2025-11-08 13:56:05 +08:00
zxr2333
d4e2a44307 [Cherry Pick from pr#3981][0.11.0][P/D]Make kv-transfer env variable take effect & Fix load-balance proxy (#3983)
### What this PR does / why we need it?
Make kv-transfer env variable take effect & Fix load-balance proxy.
Cherry Pick from #3981

---------
Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
2025-11-08 13:52:33 +08:00
offline893
8e72758645 [BugFix]Fix grouplist type of mc2. (#4049)
### What this PR does / why we need it?
Fix accuracy problem of eplb caused by the PTA upgrade. This is a backport
of #4047

### How was this patch tested?
Main:
    baseline:
| dataset | version | metric | mode | vllm-api-general-chat |
|----- | ----- | ----- | ----- | -----|
| aime2024 | 604a78 | accuracy | gen | 87.50 |

   EPLB:

| dataset | version | metric | mode | vllm-api-general-chat |
|----- | ----- | ----- | ----- | -----|
| aime2024 | 604a78 | accuracy | gen | 87.50 |
- vLLM version: v0.11.0
- vLLM main:
83f478bb19

Signed-off-by: offline0806 <3337230449@qq.com>
Co-authored-by: offline0806 <3337230449@qq.com>
2025-11-07 17:43:23 +08:00
lilinsiman
016337eaec [v0.11.0][UT] Add new ut case for aclgraph enable (#4038)
### What this PR does / why we need it?
add new ut case for aclgraph enable

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
ut

Signed-off-by: lilinsiman <lilinsiman@gmail.com>
2025-11-07 11:35:24 +08:00
Angazenn
f9494d978a [cherry-pick][v0.11.0-dev][bugfix] Fix a rare bug triggered by _npu_paged_attention in FULL_DECODE_ONLY mode (#3987)
### What this PR does / why we need it?
This is cherry-pick from #3986 . 

This PR fixes a bug where the workspace of `_npu_paged_attention` in
setup is smaller than execution. For current implementation of
FULL_DECODE_ONLY with `_npu_paged_attention`, we use
`_npu_paged_attention_get_workspace` when capturing with `max_model_len`
as `seq_lens`. This assumes that PA with larger `seq_lens` inputs should
have larger workspace than smaller `seq_lens`. However, there are rare
cases where PA with smaller `seq_lens` incurs larger space. So I add
`get_workspace` directly into `update_attn_params`.
This change might introduce slight(≈1%) performance degradation for
small num_tokens(such as 1) in decode phase, and there is no other known
memory issues. So I think this change is acceptable. We can remove this
if new attention op (such as `npu_fused_infer_attention_score`) does not
have such problems.


Signed-off-by: Angazenn <supperccell@163.com>
2025-11-06 23:08:57 +08:00
Shanshan Shen
27547a10e6 [MM][Bugfix] Add MoE verification for multi-modal models (#3897) (#4027)
### What this PR does / why we need it?

Fix #3891.

The empty `moe_comm_method` in the above issue is caused by an incorrect
check for MoE models. To be specific, the method `is_moe_model` only
checks whether a text-only model is a MoE model, without considering
multi-modal models, e.g., `VL` and `Omni`.

Check the config dict recursively to find whether it has a key containing
"expert", without checking the model architecture.

It is worth noting that, we can't verify a model by if it contains
`FusedMoE` module because `is_moe_model` is called somewhere before the
model loading, e.g., it's called when updating the ACLGraph config in
platform initialization.

- vLLM version: v0.11.0
- vLLM main:
83f478bb19

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-11-06 20:30:40 +08:00
zzzzwwjj
3db53d117e [0.11.0][doc] add aclgraph developer guide (#3947)
### What this PR does / why we need it?
Add aclgraph developer guide.

Signed-off-by: zzzzwwjj <1183291235@qq.com>
2025-11-06 09:54:38 +08:00
wangxiyuan
7ee0b0b5d8 [cherry-pick]Upgrade CANN to 8.3.rc1 (#3945) (#3962)
This PR upgrade CANN from 8.2rc1 to 8.3rc1 and remove the CANN version
check logic.

TODO: we notice that UT runs failed with CANN 8.3 image. So the base
image for UT is still 8.2. We'll fix it later.

- vLLM version: v0.11.0
- vLLM main:
83f478bb19

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-11-06 09:05:08 +08:00
Zetong Li
66b67f9cf2 [Bugfix][SHM] Fix weak memory ordering problem in share memory (#3988)
### What this PR does / why we need it?
This PR aims to fix weak memory ordering problem in share memory by
patching message queue with an additional lock. The detailed issue can
be found here https://github.com/vllm-project/vllm/issues/27858. The key
point is to use the writer lock to enforce memory fence before the ready
flag `metadata_buffer[0] = 1` is set.

This is a temporary solution, and you can use it by setting env
`SHM_BARRIER=true`. By default, we disable this modification.

### Does this PR introduce _any_ user-facing change?
`SHM_BARRIER=true` enables this change while `SHM_BARRIER=false`
disables this change. The latter is the default choice.

### How was this patch tested?
by ci

---------

Signed-off-by: Zetong Li <slippersss@126.com>
2025-11-04 23:07:23 +08:00
zxr2333
954dab64fb [v0.11.0][P/D]Set adxl as default backend and update readme (#3771)
### What this PR does / why we need it?
Set adxl engine as the default Mooncake backend, because Ascend
Transport is no longer maintained.
Update README to include instructions for installing the adxl backend
Mooncake.

### Does this PR introduce _any_ user-facing change?
Users need to compile and install the mooncake backend for adxl
according to the revised README instructions.

### How was this patch tested?
By CI.

---------

Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
2025-11-04 16:06:58 +08:00
leo-pony
0cead5c1ee Quality enhancement: Immediately interrupt execution when allocate NPU memory OOM (#3944)
### What this PR does / why we need it?
Preserve the site of the first failure. Execution should be interrupted
as soon as the NPU memory allocation fails, rather than waiting until an
illegal address is accessed.


### Does this PR introduce _any_ user-facing change?
NA

### How was this patch tested?
NA
- vLLM version: v0.11.0
- vLLM main:
83f478bb19

Signed-off-by: leo-pony <nengjunma@outlook.com>
2025-11-04 08:55:22 +08:00
Mengqing Cao
7cc6208029 [0.11.0][MTP][Aclgraph] Fix the support aclgraph with MTP (#3912)
### What this PR does / why we need it?
Fix 2 breaks of aclgraph with MTP:
1. deepseekmtp in vllm 0.11.0 does not support aclgraph and lacks the
`support_torch_compile` decorator
2. There is a d2h synchronization in the original forward of mtp
predictor. The fix pr in vllm
https://github.com/vllm-project/vllm/pull/27643

As we'll fix it in vllm main, this fix pr is only needed in branch
v0.11.0-dev

The profiling shows that MTP replays in aclgraph now:
<img width="1612" height="1866" alt="a7d7f04155df4ed454b7eb20a92b2e2a"
src="https://github.com/user-attachments/assets/eaa4b9ff-aeb0-416d-964f-5a06e497f155"
/>

### How was this patch tested?
<!--
CI passed with new added/existing test.
If it was tested in a way different from regular unit tests, please
clarify how you tested step by step, ideally copy and paste-able, so
that other reviewers can test and check, and descendants can verify in
the future.
If tests were not added, please describe why they were not added and/or
why it was difficult to add.
-->

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-11-03 14:25:37 +08:00
wangxiyuan
8a7154001e [0.11.0]Chery pick pta upgrade change (#3940)
This PR cherry-pick two commit from main to upgrade torch-npu to 2.7.1
official release

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-31 22:14:26 +08:00
rjg-lyh
3d81ea03ed [v0.11.0-dev][bugfix] fix valueError in static_forward_context when prefix is empty (#3929)
### What this PR does / why we need it?
This PR temporarily bypasses the scenario where some models in vLLM
trigger a `ValueError` during the process of storing values in
`static_forward_context` when no `prefix` is specified for the linear
layers, which is a bug in some models in vLLM. The official fix will be
addressed by submitting a PR to the vLLM community that specifies a
prefix for the linear layers in each model.

- vLLM version: v0.11.0
- vLLM main:
83f478bb19

### How was this patch tested?
CI passed with new added/existing test.

Signed-off-by: rjg-lyh <1318825571@qq.com>
2025-10-31 15:45:06 +08:00
Nagisa125
9f7de45b75 [Bugfix] fix MTP support for lmhead_tensor_parallel_size (#3921)
### What this PR does / why we need it?
Fix the issue where enabling MTP and setting
lmhead_tensor_parallel_size=16 causes the inference to hang.


Signed-off-by: wyh145 <1987244901@qq.com>
2025-10-31 14:34:28 +08:00
lilinsiman
ee2e55e602 [v0.11.0][Test] Add new test model for aclgraph single_request v0.11.0 (#3889)
### What this PR does / why we need it?
add new test model for aclgraph single_request v0.11.0

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
ut

Signed-off-by: lilinsiman <lilinsiman@gmail.com>
2025-10-31 11:23:55 +08:00
zouyida2052
90aca84e60 fix bug when max_seqs=14 in mtp=2 scenario and raise error when cudagraph_capture_sizes can't be an integer multiple of uniform_decode_query_len (#3909)
### What this PR does / why we need it?
1. Revert [bugfix for mtp in
fullgraph](0948483642)
and support it when vllm supports
2. raise error when cudagraph_capture_sizes can't be an integer multiple
of uniform_decode_query_len
3. bugfix when max_num_seqs=14 in mtp=2 scenario

---------

Signed-off-by: zouyida2052 <zouyida2002@gmail.com>
2025-10-31 09:25:06 +08:00
lilinsiman
387ce1cc5b add new e2e tests case for aclgraph memory to v0.11.0 (#3880)
<!--  Thanks for sending a pull request!

BEFORE SUBMITTING, PLEASE READ
https://docs.vllm.ai/en/latest/contributing/overview.html

-->
### What this PR does / why we need it?
add new e2e tests case for aclgraph memory to v0.11.0

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
ut

Signed-off-by: lilinsiman <lilinsiman@gmail.com>
2025-10-31 09:17:09 +08:00
wangxiaoteng888
38afd2c9cb [bugfix_v0.11.0]cancel tokenize for layerwise_proxy (#3913)
### What this PR does / why we need it?
cancel tokenize for layerwise_proxy
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
by ci

Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
2025-10-30 23:55:04 +08:00
wangxiaoteng888
af7a56550b [bugfix_v0.11.0-dev] layerwise D first plan (#3907)
### What this PR does / why we need it?
Refactored the layerwise code to send to the D node first, preventing
P-node hangs due to communication timeouts when DP > 1.
---------

Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
Signed-off-by: liziyu <liziyu16@huawei.com>
Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
Co-authored-by: nwpu-zxr <zhouxuerong2@huawei.com>
Co-authored-by: liziyu <liziyu16@huawei.com>
2025-10-30 22:21:11 +08:00
offline893
d5a9aba03f [BugFix]Fix group list type of mc2. (#3890)
### What this PR does / why we need it?
Fix the precision issue caused by the inconsistency between the group
list type used by mc2 and that of eplb.

---------

Signed-off-by: offline0806 <3337230449@qq.com>
2025-10-30 21:44:14 +08:00
weichen
c506ba60fb [v0.11.0] [Bugfix] [MoE]fix error in deepseek when using allgather (#3827)
### What this PR does / why we need it?
After refactoring vllm_ascend/models and FusedMoE, we are unable to pass
`gate` from deepseekv2.py to `AscendFusedMoE.forward`, which will result
in error when running deepseek v3/r1 with allgather.
Hence, this pr removes `gate` related computations from FusedMoE module
in eager/aclgraph mode.
### Does this PR introduce _any_ user-facing change?
`rm_router_logits` is deprecated in eager/aclgraph.
### How was this patch tested?
e2e & ut

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
2025-10-30 14:59:46 +08:00
whx
211d4b9da4 [BugFix] Fix mlapo accuracy problem related with weight processing. (#3857)
This PR fixes a mlapo accuracy problem related with weight processing.
Furthermore, modify mlapo related e2e test with quantized deepseek model
to make it effective.

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-10-30 00:35:50 +08:00
zouyida2052
d9249c968e bugfix for mtp in fullgraph (#3878)
### What this PR does / why we need it?
bugfix for mtp in fullgraph

### Does this PR introduce _any_ user-facing change?
no

---------

Signed-off-by: zouyida2052 <zouyida2002@gmail.com>
2025-10-29 23:52:20 +08:00
fems14
19f49ecb5f [0.11.0][Bugfix]fix_mulit_connector_bug (#3332) (#3882)
### What this PR does / why we need it?
When using multi connector, the multi connector does not define
get_finished_count, which will cause the kv cache to be released

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0rc3
- vLLM main:
83f478bb19


Signed-off-by: baxingpiaochong <771405853@qq.com>
Co-authored-by: baxingpiaochong <771405853@qq.com>
2025-10-29 23:44:52 +08:00
liziyu
e5b938c5fe [v0.11.0] [P/D] force with_prefill true after allreduce in kv producer (#3835)
### What this PR does / why we need it?
force with_prefill true after allreduce in kv producer. This is a backport of #3768 and #3849

---------

Signed-off-by: liziyu <liziyu16@huawei.com>
2025-10-29 23:14:00 +08:00
Wang Yixuan
b323be9fe4 deepseek torchair adapt for torch_npu version (#3876)
### What this PR does / why we need it?
Adapt to the torch_npu version to avoid the precision problem of
torchair deepseek. The torch_npu version may select different
branches in the ops registration: the rms_norm op has two branches
according to the version check. This PR unifies rms_norm in torchair by
patching. #3862

Signed-off-by: hust17yixuan <303660421@qq.com>
2025-10-29 22:44:44 +08:00
realliujiaxu
29bd9235ed [v0.11.0][Perf] Delete redundant operations in model_runner and forward_context (#3775)
<!--  Thanks for sending a pull request!

BEFORE SUBMITTING, PLEASE READ
https://docs.vllm.ai/en/latest/contributing/overview.html

-->

cherry pick https://github.com/vllm-project/vllm-ascend/pull/3677

Remove redundant operations from `model_runner` and `forward_context`.
This optimization can significantly reduce the idle time (bubble) before
decoding when running models with small parameter counts (e.g.,
Qwen/Qwen2.5-0.5B).

Testing on 800I A2, bubble is reduced from 3.8ms to 2.8ms :
Before
<img width="1655" height="696" alt="image"
src="https://github.com/user-attachments/assets/d7608e52-2438-46dd-8fc9-391fd6274495"
/>

After
<img width="1607" height="774" alt="image"
src="https://github.com/user-attachments/assets/56daf081-2dba-4d2e-99d4-e055187d9806"
/>
### What this PR does / why we need it?
<!--
- Please clarify what changes you are proposing. The purpose of this
section is to outline the changes and how this PR fixes the issue.
If possible, please consider writing useful notes for better and faster
reviews in your PR.

- Please clarify why the changes are needed. For instance, the use case
and bug description.

- Fixes #
-->

### Does this PR introduce _any_ user-facing change?
<!--
Note that it means *any* user-facing change including all aspects such
as API, interface or other behavior changes.
Documentation-only updates are not considered user-facing changes.
-->
No
### How was this patch tested?
<!--
CI passed with new added/existing test.
If it was tested in a way different from regular unit tests, please
clarify how you tested step by step, ideally copy and paste-able, so
that other reviewers can test and check, and descendants can verify in
the future.
If tests were not added, please describe why they were not added and/or
why it was difficult to add.
-->

---------

Signed-off-by: realliujiaxu <realliujiaxu@163.com>
2025-10-29 15:58:53 +08:00
zhangxinyuehfad
75de3fa172 [v0.11.0][Doc] Update doc (#3852)
### What this PR does / why we need it?
Update doc


Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-10-29 11:32:12 +08:00
ZYang6263
6188450269 [v0.11.0][Bugfix]Avoid using the fusion operator in the MOE model (#3837)
### What this PR does / why we need it?
The current MatmulReduceScatter operator experiences performance
degradation in small-shape scenarios, so this PR decides whether to use
this operator based on the size of the shape.


---------

Signed-off-by: ZYang6263 <zy626375@gmail.com>
2025-10-28 23:31:19 +08:00
Shirley125
e48ca0b6ec [bugfix][0.11]fix proxy decode bug (#3751)
### What this PR does / why we need it?
fix proxy decode bug while parsing non-UTF-8 characters.

---------

Signed-off-by: CHEN <116010019@link.cuhk.edu.cn>
2025-10-27 16:56:50 +08:00
Yizhou
43276fd822 [v0.11.0][Fix] Prevent memory leak in MLA decode graph (#3743) (#3774)
### What this PR does / why we need it?
The cache for MLA decode graph parameters was holding strong references
to tensors, preventing them from being garbage collected and leading to
increased memory usage.

This change wraps the cached tensors in weak references, allowing them
to be deallocated when no longer in use and reducing overall memory
pressure.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
None.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-10-27 16:00:20 +08:00
Ruri
825fdfb197 [v0.11.0][Feat] Prefetching Attention QKV Linear Weight With AddRmsNormQuant Custom Op (#3649)
### What this PR does / why we need it?

- `qkv_proj.weight` prefetching has been implemented with `Quant` op,
when `AddRmsNormQuant` is enabled (#3465) `qkv_proj.weight` prefetching
won't work
- Implement `qkv_proj.weight` prefetching with `AddRmsNormQuant`, which
has been merged on `main` branch (#3517)

### Does this PR introduce _any_ user-facing change?

None.

### How was this patch tested?

Tested on `Qwen3-235B-A22B-W8A8`
<img width="1868" height="109" alt="image"

src="https://github.com/user-attachments/assets/0bc28082-0287-4d5c-b8f6-f907c3134d36"
/>


- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

<!--  Thanks for sending a pull request!

BEFORE SUBMITTING, PLEASE READ
https://docs.vllm.ai/en/latest/contributing/overview.html

-->
### What this PR does / why we need it?
<!--
- Please clarify what changes you are proposing. The purpose of this
section is to outline the changes and how this PR fixes the issue.
If possible, please consider writing useful notes for better and faster
reviews in your PR.

- Please clarify why the changes are needed. For instance, the use case
and bug description.

- Fixes #
-->

### Does this PR introduce _any_ user-facing change?
<!--
Note that it means *any* user-facing change including all aspects such
as API, interface or other behavior changes.
Documentation-only updates are not considered user-facing changes.
-->

### How was this patch tested?
<!--
CI passed with new added/existing test.
If it was tested in a way different from regular unit tests, please
clarify how you tested step by step, ideally copy and paste-able, so
that other reviewers can test and check, and descendants can verify in
the future.
If tests were not added, please describe why they were not added and/or
why it was difficult to add.
-->

Signed-off-by: zhoux77899 <zhouxiang100@huawei.com>
2025-10-27 09:42:09 +08:00
Mengqing Cao
1b16c01afd [v0.11.0-dev][Installation] limit opencv-python-headless version to resolve numpy version conflict (#3767)
### What this PR does / why we need it?
vllm requires opencv-python-headless >= 4.11.0 which requires
(numpy<2.3.0,>=2), but vllm-ascend numpy version must be less than
2.0.0, so limit opencv-python-headless less than 4.11.0.86 will fix this
conflict.

backport of
afc58184ec

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
Co-authored-by: 22dimensions <waitingwind@foxmail.com>
2025-10-25 18:18:28 +08:00
whx
a58ff9e92f [Cherry-pick] Port MoE multi-stream fix to v0.11.0-dev (#3753)
This PR moves the communication operation of shared experts out of extra
stream because I found that this might cause rtMemcpy related errors
when running shared experts multistream with aclgraph.

Furthermore, I utilize a global variable as extra stream object to avoid
allocating streams for each layer in full-graph mode.

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-10-25 15:51:43 +08:00
Yizhou
1bc61031e5 [v0.11.0][Fix] Cap max tokens to prevent potential OOM (#3720) (#3744)
### What this PR does / why we need it?
Caps the calculated maximum number of tokens at 512.

This prevents allocating an excessively large buffer when a cudagraph
capture size is not specified, mitigating the risk of out-of-memory
errors.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
None.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-10-25 15:46:56 +08:00
fems14
99e154dc84 [0.11.0] cherry-pick from #3747 (#3746)
cherry-pick from #3747

correct _register function place for mooncake

Signed-off-by: fems14 <1804143737@qq.com>
2025-10-25 14:21:30 +08:00
shaopeng-666
fed8145aea [cherry-pick][Feat] Add mrope fusion op#3708 (#3735)
### What this PR does / why we need it?
Add mrope fusion op for qwen2.5-vl. This mrope operator doesn't
support Qwen3-VL currently, so it only takes effect in qwen2.5-vl.
cherry pick from 39b994a987

CI passed with existing test

Signed-off-by: shaopeng666 <shaopeng666@noreply.gitcode.com>
Co-authored-by: shaopeng666 <shaopeng666@noreply.gitcode.com>
2025-10-25 11:41:23 +08:00
whx
0644113c35 [BugFix] cherry-pick PR 3736 to v0.11.0-dev (#3737)
This PR comments out the newly added vlm e2e test of the ascend scheduler
scenario because I found that it gets stuck when running in multi-batch.
This test needs to be added back after dealing with this issue.

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-10-25 10:35:14 +08:00
whx
5a2c5be229 [BugFix][Cherry-pick] Cherry-pick PR 3675 to v0.11.0-dev (#3732)
This PR cherry-picks the bugfix related with running multi-modal models
with AscendScheduler to v0.11.0-dev

Signed-off-by: hw_whx <wanghexiang7@huawei.com>
Co-authored-by: hw_whx <wanghexiang7@huawei.com>
2025-10-25 09:41:51 +08:00
hucong
12bc78d252 [v0.11.0][BugFix][P/D] Modify the recalculation logic to prevent waiting requests from filling up the D node KVCache (#3686)
### What this PR does / why we need it?
Modify the recalculation logic to prevent waiting requests from filling
up the D node KVCache

Signed-off-by: underfituu <hzhucong@163.com>
2025-10-25 09:15:42 +08:00
ZYang6263
5c0a23f98b [0.11.0][Perf] Add fused matmul/reduce-scatter kernel for performance optimization. (#3725)
### What this PR does / why we need it?
This PR boosts performance by introducing a fused kernel for the matrix
matmul and reduce scatter operations. It supports both unquantized
(e.g., BFloat16) and W8A8 quantized models.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

<!--  Thanks for sending a pull request!

BEFORE SUBMITTING, PLEASE READ
https://docs.vllm.ai/en/latest/contributing/overview.html

-->
### What this PR does / why we need it?
<!--
- Please clarify what changes you are proposing. The purpose of this
section is to outline the changes and how this PR fixes the issue.
If possible, please consider writing useful notes for better and faster
reviews in your PR.

- Please clarify why the changes are needed. For instance, the use case
and bug description.

- Fixes #
-->

### Does this PR introduce _any_ user-facing change?
<!--
Note that it means *any* user-facing change including all aspects such
as API, interface or other behavior changes.
Documentation-only updates are not considered user-facing changes.
-->

### How was this patch tested?
<!--
CI passed with new added/existing test.
If it was tested in a way different from regular unit tests, please
clarify how you tested step by step, ideally copy and paste-able, so
that other reviewers can test and check, and descendants can verify in
the future.
If tests were not added, please describe why they were not added and/or
why it was difficult to add.
-->

Signed-off-by: ZYang6263 <zy626375@gmail.com>
2025-10-25 08:20:43 +08:00
fems14
17dd9ae42c [0.11.0][bugfix]look up multi_tp key (#3699) (#3723)
### What this PR does / why we need it?
In multi-Tensor Parallel (TP) scenarios, the KV pool only queries the
first GPU card. When keys on other cards are released, the query result
still returns as successful, introducing accuracy issues. This PR
modifies the KV pool's query logic to check all cards, resolving this
problem.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: fems14 <1804143737@qq.com>
2025-10-24 18:22:45 +08:00
fems14
f0eb3e1d97 [v0.11.0][bugfix]kvpool sync load (#3698) (#3722)
### What this PR does / why we need it?
In certain scenarios, the performance of synchronously loading data from
the pool is better than that of asynchronously loading data. Therefore,
a control logic (or switch) for asynchronous loading from the pool has
been added.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

<!--  Thanks for sending a pull request!

BEFORE SUBMITTING, PLEASE READ
https://docs.vllm.ai/en/latest/contributing/overview.html

-->

Signed-off-by: fems14 <1804143737@qq.com>
2025-10-24 18:21:46 +08:00
何必问
33514a4cc2 [Bugfix] The server fails to locate the request, leading to the server hanging. (#3721)
### What this PR does / why we need it?
fix bug: In the mooncake pooling scenario, when the client closes the
request, the server fails to locate the request, leading to the server
hanging.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
Pull up the PD separated pooling service, send requests using aisbench,
press CTRL+C twice, and check if the vllm_ascend service exit.

---------

Signed-off-by: linhebiwen <linhebiwen@gmail.com>
2025-10-24 17:41:29 +08:00
offline893
4e21b1537e [BugFix] Check all expert maps when using muilty instance. (#3662)
### What this PR does / why we need it?
Check all expert maps when using multiple instances.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
Qwen 235B in double A3.
case1:master has expert map, slave has not expert map.
case2:   master has expert map, slave has error expert map.
case3:   master has expert map,slave has correct expert map.
- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: offline0806 <3337230449@qq.com>
Co-authored-by: offline0806 <3337230449@qq.com>
2025-10-24 17:10:31 +08:00
wangxiyuan
b321e3846a [cherry-pick]【main】patch sched_yield (#3648) (#3687)
### What this PR does / why we need it?
On Arm systems, os.sched_yield() does not take effect, causing the GIL
(Global Interpreter Lock) to remain unrelinquished and resulting in CPU
bound issues. This PR applies a patch to sched_yield in vLLM, making the
process execute time.sleep(0) instead to release the GIL.

### Does this PR introduce _any_ user-facing change?

Signed-off-by: fems14 <1804143737@qq.com>
Co-authored-by: fems14 <74094523+fems14@users.noreply.github.com>
2025-10-24 00:24:58 +08:00
Wang Yixuan
d0086d432a fix deepseek torchair recompile (#3679)
### What this PR does / why we need it?
PR #3624 fixed the precision of deepseek torchair, but did not consider
the limitation of torch compile, which results in recompilation. This PR
fixes this problem. PR to main: #3678


### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
<!--
CI passed with new added/existing test.
If it was tested in a way different from regular unit tests, please
clarify how you tested step by step, ideally copy and paste-able, so
that other reviewers can test and check, and descendants can verify in
the future.
If tests were not added, please describe why they were not added and/or
why it was difficult to add.
-->

Signed-off-by: hust17yixuan <303660421@qq.com>
2025-10-23 22:53:13 +08:00
Slightwind
d2d19a4c3c [v0.11.0][bugfix] Add 'layer_type' param to get_pergroup_param() for compatibility (#3684)
Resolves a `TypeError: got an unexpected keyword argument 'layer_type'`.

A recent change (PR #3311) started passing the `layer_type` argument
when calling `get_pergroup_param()`. This specific implementation does
not use this parameter, causing the error.

This patch adds `layer_type=None` to the method signature to maintain
API compatibility and ignore the unused argument.

Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2025-10-23 21:26:50 +08:00
liziyu
f3ea657e93 [0.11.0][Bugfix] fix delay free prefill req & D node support prefix cache (#3609)
### What this PR does / why we need it?
Fix mooncake connector. In scenarios where TP is not equal, when the
prefill TP size is less than the number of key-value heads,
_get_remote_tp_ranks_for_req will return a list of np.arrays. Performing
an operation like int in list of np.arrays will cause an error.
Converting the list of np.arrays into a single np.array resolves this
issue.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
qwen235B
P tp16, D tp1
P tp8, D tp1
P tp4, D tp1
P tp8, D tp2


- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: liziyu <liziyu16@huawei.com>
2025-10-23 20:39:35 +08:00
ZYang6263
6975d46627 [v0.11.0][Perf] Eliminating the zerolike operator through patch (#3632)
### What this PR does / why we need it?
There is a zero-like operator before the attention operation in each
decoding stage. After analysis, this operator can be eliminated. The
purpose of this PR is to remove this operator and improve performance.

---------

Signed-off-by: ZYang6263 <zy626375@gmail.com>
2025-10-23 14:49:28 +08:00
rjg-lyh
74903af460 [v0.11.0][refactor] refactor SequenceRowParallelOp forward (#3654)
### What this PR does / why we need it?
This PR refactors SequenceRowParallelOp forward. In order to further
expand the operator inclusion scope in dynamic judgment scenarios, this
PR customizes the entire matmul computation and communication as a
custom operator masking. With this refactor, it will support directly
writing code such as common operation fusion into the
SequenceRowParallelOp class's member function matmul_and_reduce, without
the need to register more redundant custom masking operators.

### How was this patch tested?
CI passed with new added/existing test.

Signed-off-by: rjg-lyh <1318825571@qq.com>
2025-10-23 14:45:49 +08:00
Yizhou
54bd531db8 [v0.11.0][Fix] Fix attention metadata handling for profiling and MLA (#3636) (#3643)
### What this PR does / why we need it?
This is a port PR of #3636 .

Move the creation of dummy attention metadata to occur after the ACL
graph runtime mode is determined. This ensures the metadata is
initialized with the correct configuration during a profile run.

Additionally, remove the `attn_metadata` existence check before updating
MLA attention parameters. This change prevents the update from being
skipped when metadata is not yet available, ensuring parameters are set
correctly.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
None.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-10-23 10:29:30 +08:00
whx
6464c97ff9 [BugFix][v0.11.0] Fix quantization related mtp bug with patch (#3619)
vLLM 0.11.0 didn't bring PR
(https://github.com/vllm-project/vllm/pull/25805) thus missing the
prefix of mtp's SharedHead. This PR fixes this bug with a patch to
vllm's deepseek_mtp.

---------

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-10-22 23:06:09 +08:00
Zetong Li
6e72bfdc50 [v0.11.0] cherry-pick Fix performance degradation when mtp>1 (#3597) (#3630)
### What this PR does / why we need it?
cherry-pick Fix performance degradation when mtp>1 (#3597)

This PR aims to fix performance degradation when mtp>1. Since mtp>1 may
result in more tokens (i.e. larger batch size) than acl graph maximum
batch size, this will cause draft model to run in eager mode.

### How was this patch tested?
by ci

---------

Signed-off-by: Zetong Li <slippersss@126.com>
2025-10-22 22:07:39 +08:00
zouyida2052
a989fef5de unify logic between aclgraph and torchair (#3602)
### What this PR does / why we need it?
unify logic between aclgraph and torchair. This is a cherry-pick of https://github.com/vllm-project/vllm-ascend/pull/3560

Signed-off-by: zouyida2052 <zouyida2002@gmail.com>
2025-10-22 21:55:06 +08:00
Wang Yixuan
edccd46d74 fix deepseek torchair precision (#3635)
### What this PR does / why we need it?
The precision of deepseek torchair was broken by #3465, due to the original patch of rmsnorm in torchair. This PR fixes the precision of deepseek torchair.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
<!--
CI passed with new added/existing test.
If it was tested in a way different from regular unit tests, please
clarify how you tested step by step, ideally copy and paste-able, so
that other reviewers can test and check, and descendants can verify in
the future.
If tests were not added, please describe why they were not added and/or
why it was difficult to add.
-->

Signed-off-by: hust17yixuan <303660421@qq.com>
2025-10-22 20:20:32 +08:00
Yizhou
984efdc0d0 [v0.11.0][Fix] Fixes attribute error in MLA implementation (#3617)
### What this PR does / why we need it?
Corrects the attribute access for retrieving the device from `q_a_proj`
to `q_proj`. This prevents an `AttributeError` as `q_a_proj` does not
exist on the class instance.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
Need MLAPO tests.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-10-22 15:49:18 +08:00
wangxiyuan
a0c3b8dd2d [v0.11.0]cherry-pick fix ut (#3608) (#3614)
cherry-pick fix ut (#3608)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-22 14:14:15 +08:00
offline893
726bc8aa2a [CI]fix test nightly workflow. (#3604)
Add the nightly test back, it's deleted by mistake.

Co-authored-by: offline0806 <3337230449@qq.com>
2025-10-22 10:34:03 +08:00
373 changed files with 36100 additions and 3752 deletions

View File

@@ -30,7 +30,7 @@ jobs:
runs-on: ${{ inputs.runner }}
name: ${{ inputs.model_name }} accuracy
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
env:
VLLM_USE_MODELSCOPE: True
# 1. If version specified (work_dispatch), do specified branch accuracy test

View File

@@ -89,6 +89,7 @@ jobs:
# the test separately.
pytest -sv tests/e2e/singlecard/test_aclgraph.py
pytest -sv tests/e2e/singlecard/test_aclgraph_mem.py
pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
pytest -sv tests/e2e/singlecard/test_bge_model.py
pytest -sv tests/e2e/singlecard/test_camem.py
@@ -105,8 +106,8 @@ jobs:
# ------------------------------------ v1 spec decode test ------------------------------------ #
pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_torchair_correctness.py
# Fix me: OOM error
#pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
# Fix me: test_eagle_correctness OOM error
pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
pytest -sv tests/e2e/singlecard/ops/

View File

@@ -68,5 +68,5 @@ jobs:
with:
vllm: v0.11.0
runner: linux-aarch64-${{ matrix.runner }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
model_name: ${{ matrix.model_name }}

View File

@@ -23,7 +23,7 @@ jobs:
# This is a runner with no NPU for k8s controller
runs-on: linux-aarch64-a3-0
container:
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
image: m.daocloud.io/quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
env:
KUBECONFIG: /tmp/kubeconfig
KUBECTL: /root/.cache/.kube/kubectl

View File

@@ -56,7 +56,7 @@ jobs:
vllm_use_v1: 1
max-parallel: 1
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
volumes:
- /usr/local/dcmi:/usr/local/dcmi
- /usr/local/bin/npu-smi:/usr/local/bin/npu-smi

View File

@@ -57,7 +57,13 @@ jobs:
- name: Print
run: |
lscpu
- name: Free up disk space
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
with:
tool-cache: true
docker-images: false
- name: Build wheel
run: |
ls

View File

@@ -47,7 +47,7 @@ jobs:
name: vLLM Ascend test
runs-on: ${{ matrix.os }}
container:
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
image: m.daocloud.io/quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
env:
DEBIAN_FRONTEND: noninteractive
steps:
@@ -97,4 +97,4 @@ jobs:
VLLM_USE_MODELSCOPE: True
run: |
# TODO: enable more tests
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe

View File

@@ -75,7 +75,7 @@ jobs:
name: unit test
# only trigger unit test after lint passed and the change is e2e and ut related.
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
runs-on: ubuntu-latest
runs-on: ubuntu-22.04-arm
container:
image: quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11
env:
@@ -100,7 +100,7 @@ jobs:
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty python3 -m pip install . --extra-index https://download.pytorch.org/whl/cpu/
VLLM_TARGET_DEVICE=empty python3 -m pip install .
python3 -m pip uninstall -y triton
- name: Checkout vllm-project/vllm-ascend repo
@@ -109,18 +109,18 @@ jobs:
- name: Install vllm-project/vllm-ascend
run: |
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/
python3 -m pip install -v . --extra-index https://download.pytorch.org/whl/cpu/
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/arm64-linux/devlib
python3 -m pip install -r requirements-dev.txt
python3 -m pip install -v .
- name: Run unit test
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
TORCH_DEVICE_BACKEND_AUTOLOAD: 0
run: |
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
pytest -sv --cov --cov-report=xml:unittests-coverage.xml tests/ut
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/arm64-linux/devlib
pytest -sv --cov --cov-report=xml:unittests-coverage.xml tests/ut \
--ignore tests/ut/attention/test_attention_v1.py
- name: Upload coverage to Codecov
# only upload coverage when commits merged
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
@@ -145,5 +145,5 @@ jobs:
with:
vllm: ${{ matrix.vllm_version }}
runner: linux-aarch64-a2
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
type: light

View File

@@ -58,7 +58,7 @@ jobs:
runs-on: ${{ matrix.os }}
container:
# TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-310p-ubuntu22.04-py3.11
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-310p-ubuntu22.04-py3.11
env:
VLLM_LOGGING_LEVEL: ERROR
VLLM_USE_MODELSCOPE: True

View File

@@ -76,5 +76,5 @@ jobs:
with:
vllm: ${{ matrix.vllm_version }}
runner: linux-aarch64-a2
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
type: full

View File

@@ -41,5 +41,5 @@ jobs:
with:
vllm: main
runner: linux-aarch64-a2
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
type: full

View File

@@ -79,7 +79,7 @@ jobs:
with:
vllm: v0.11.0
runner: linux-aarch64-${{ matrix.runner }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
model_name: ${{ matrix.model_name }}
upload: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }}

View File

@@ -49,7 +49,7 @@ jobs:
runs-on: linux-arm64-npu-static-8
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
volumes:
- /usr/local/dcmi:/usr/local/dcmi
- /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
@@ -109,4 +109,4 @@ jobs:
- name: Run vllm-project/vllm-ascend PD Disaggregation edge test
run: |
git config --global --add safe.directory/__w/vllm-ascend/vllm-ascend
bash tests/e2e/pd_disaggreate/run_edge_case_test.sh
bash tests/e2e/pd_disaggreate/run_edge_case_test.sh

View File

@@ -1,115 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
name: 'e2e nightly test'
on:
workflow_call:
inputs:
vllm:
required: true
type: string
runner:
required: true
type: string
image:
required: false
type: string
default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11"
tests:
required: true
type: string
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
# only cancel in-progress runs of the same workflow
# and ignore the lint / 1 card / 4 cards test type
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
e2e-nightly:
name: e2e-nightly
runs-on: ${{ inputs.runner }}
container:
image: ${{ inputs.image }}
env:
VLLM_USE_MODELSCOPE: True
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Config mirrors
run: |
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
apt-get update -y
apt install git -y
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: ${{ inputs.vllm }}
path: ./vllm-empty
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .
- name: Checkout aisbench repo and Install aisbench
run: |
git clone https://gitee.com/aisbench/benchmark.git
cd benchmark
git checkout v3.0-20250930-master
pip3 install -e ./
pip3 install -r requirements/api.txt
pip3 install -r requirements/extra.txt
- name: Run vllm-project/vllm-ascend test
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
VLLM_USE_MODELSCOPE: True
VLLM_CI_RUNNER: ${{ inputs.runner }}
run: |
# TODO: enable more tests
pytest -sv ${{ inputs.tests }}

View File

@@ -1,86 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
name: 'ascend test / nightly'
on:
schedule:
# Run test at 24:00 Beijing time (UTC+8)
- cron: "0 16 * * *"
workflow_dispatch:
pull_request:
branches:
- 'main'
- '*-dev'
paths:
- 'tests/e2e/nightly/**'
- '.github/workflows/vllm_ascend_test_nightly.yaml'
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
# only cancel in-progress runs of the same workflow
# and ignore the lint / 1 card / 4 cards test type
concurrency:
group: ascend-nightly-${{ github.ref }}
#cancel-in-progress: true
jobs:
qwen3-32b:
strategy:
matrix:
# should add A3 chip runner when available
os: [linux-aarch64-a2-4]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
# only trigger e2e test after lint passed and the change is e2e related with pull request.
uses: ./.github/workflows/_e2e_nightly.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.os }}
tests: tests/e2e/nightly/models/test_qwen3_32b.py
qwen3-235b-a22b-w8a8-eplb:
strategy:
matrix:
# should add A3 chip runner when available
os: [ linux-aarch64-a3-16 ]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
# only trigger e2e test after lint passed and the change is e2e related with pull request.
uses: ./.github/workflows/_e2e_nightly.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.os }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
tests: tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
deepseek-r1-w8a8-eplb:
strategy:
matrix:
# should add A3 chip runner when available
os: [ linux-aarch64-a3-16 ]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
# only trigger e2e test after lint passed and the change is e2e related with pull request.
uses: ./.github/workflows/_e2e_nightly.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.os }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
tests: tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py

View File

@@ -12,8 +12,8 @@ repos:
- id: codespell
args: [
--toml, pyproject.toml,
'--skip', 'tests/e2e/multicard/test_torchair_graph_mode.py,csrc/mla_preprocess/**,tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**,.github/**,typos.toml',
'-L', 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,ArchType,AND'
'--skip', 'tests/e2e/multicard/test_torchair_graph_mode.py,csrc/**,tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**,.github/**,typos.toml',
'-L', 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,ArchType,AND,ND'
]
additional_dependencies:
- tomli

View File

@@ -20,6 +20,13 @@ set(VLLM_ASCEND_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}")
find_package(Torch REQUIRED)
run_python(TORCH_VERSION
"import torch; print(torch.__version__)" "Failed to locate torch path")
# check torch version is 2.7.1
if(NOT ${TORCH_VERSION} VERSION_EQUAL "2.7.1")
message(FATAL_ERROR "Expected PyTorch version 2.7.1, but found ${TORCH_VERSION}")
endif()
set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu")
set(SOC_VERSION ${SOC_VERSION})
message(STATUS "Detected SOC version: ${SOC_VERSION}")
@@ -48,15 +55,35 @@ include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
file(GLOB KERNEL_FILES
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernels/*.cpp)
ascendc_library(vllm_ascend_kernels SHARED
set(VLLM_ASCEND_CUSTOM_OP
${KERNEL_FILES}
${CMAKE_CURRENT_SOURCE_DIR}/csrc/mla_preprocess/op_kernel/mla_preprocess_kernel.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp
)
set(VLLM_ASCEND_CUSTOM_OP_EXCLUDE
${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp
)
if(SOC_VERSION STREQUAL "ASCEND310P3")
list(REMOVE_ITEM VLLM_ASCEND_CUSTOM_OP ${VLLM_ASCEND_CUSTOM_OP_EXCLUDE})
endif()
ascendc_library(vllm_ascend_kernels SHARED
${VLLM_ASCEND_CUSTOM_OP}
)
message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}")
file(GLOB VLLM_ASCEND_SRC
${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp)
if(SOC_VERSION STREQUAL "ASCEND310P3")
file(GLOB VLLM_ASCEND_SRC
${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp)
else()
file(GLOB VLLM_ASCEND_SRC
${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/idle_offload/shm_worker.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_host/tiling/tiling_data.cpp)
endif()
include_directories(
${pybind11_INCLUDE_DIRS}
@@ -66,6 +93,8 @@ include_directories(
${ASCEND_HOME_PATH}/include
${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform
${ASCEND_HOME_PATH}/x86_64-linux/include/experiment/platform
${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_host
${CMAKE_CURRENT_SOURCE_DIR}/csrc/idle_offload/include
)
set(

View File

@@ -15,24 +15,33 @@
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11
FROM quay.io/ascend/cann:8.3.rc2-910b-ubuntu22.04-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ARG MOONCAKE_TAG="v0.3.7.post2"
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
# Install Mooncake dependencies
RUN apt-get update -y && \
apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev && \
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
cd /vllm-workspace/Mooncake && bash mooncake_installer.sh -y && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/lib64 && \
mkdir -p build && cd build && cmake .. -DUSE_ASCEND_DIRECT=ON && \
make -j$(nproc) && make install && \
rm -fr /vllm-workspace/Mooncake/build && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM
@@ -40,7 +49,7 @@ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
@@ -50,11 +59,17 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \
make install && make clean && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
ENV VLLM_ASCEND_ENABLE_NZ=0 \
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

View File

@@ -15,7 +15,7 @@
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.2.rc1-310p-ubuntu22.04-py3.11
FROM quay.io/ascend/cann:8.3.rc2-310p-ubuntu22.04-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
@@ -40,7 +40,7 @@ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge

View File

@@ -15,7 +15,7 @@
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.2.rc1-310p-openeuler24.03-py3.11
FROM quay.io/ascend/cann:8.3.rc2-310p-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
@@ -38,7 +38,7 @@ ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge

View File

@@ -15,32 +15,40 @@
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
FROM quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ARG MOONCAKE_TAG=v0.3.7.post2
COPY . /vllm-workspace/vllm-ascend/
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install Mooncake dependencies
RUN apt-get update -y && \
apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev && \
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
cd /vllm-workspace/Mooncake && bash mooncake_installer.sh -y && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/lib64 && \
mkdir -p build && cd build && cmake .. -DUSE_ASCEND_DIRECT=ON && \
make -j$(nproc) && make install && \
rm -fr /vllm-workspace/Mooncake/build && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
@@ -54,7 +62,7 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]
CMD ["/bin/bash"]

View File

@@ -15,30 +15,43 @@
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.2.rc1-a3-openeuler24.03-py3.11
FROM quay.io/ascend/cann:8.3.rc2-a3-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ARG MOONCAKE_TAG="v0.3.7.post2"
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN yum update -y && \
yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
rm -rf /var/cache/yum
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
SHELL ["/bin/bash", "-c"]
RUN yum update -y && \
yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
ARCH=$(uname -m) && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/devlib:/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/lib64:$LD_LIBRARY_PATH && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/${ARCH}-openEuler-linux && \
cd /vllm-workspace/Mooncake && \
bash mooncake_installer.sh -y && \
mkdir -p build && cd build && cmake .. -DUSE_ASCEND_DIRECT=ON && \
make -j$(nproc) && make install && \
rm -fr /vllm-workspace/Mooncake/build && \
rm -rf /var/cache/yum/*
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
@@ -52,7 +65,7 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]
CMD ["/bin/bash"]

69
Dockerfile.backup Normal file
View File

@@ -0,0 +1,69 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
# Base image: Ascend CANN 8.3.rc2, 910B variant, Ubuntu 22.04, Python 3.11 (as encoded in the image tag).
FROM quay.io/ascend/cann:8.3.rc2-910b-ubuntu22.04-py3.11
# Build-time knobs, overridable with `--build-arg`:
#   PIP_INDEX_URL          - pip index configured globally further below
#   COMPILE_CUSTOM_KERNELS - re-exported as an ENV variable below
#   MOONCAKE_TAG           - git tag of Mooncake that is cloned and built
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ARG MOONCAKE_TAG="v0.3.7.post2"
# Environment: keep apt non-interactive and make COMPILE_CUSTOM_KERNELS visible to later build steps.
ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
WORKDIR /workspace
# Copy the build context (this repository) into the image; later steps install it in editable mode.
COPY . /vllm-workspace/vllm-ascend/
# Install build tools, then clone, build and install Mooncake with -DUSE_ASCEND_DIRECT=ON.
# The build directory and apt caches are removed in the same layer to keep the image small.
RUN apt-get update -y && \
apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev && \
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
cd /vllm-workspace/Mooncake && bash mooncake_installer.sh -y && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/lib64 && \
mkdir -p build && cd build && cmake .. -DUSE_ASCEND_DIRECT=ON && \
make -j$(nproc) && make install && \
rm -fr /vllm-workspace/Mooncake/build && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
# Point pip at the configured index for every following pip invocation.
RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM: shallow-clone the requested tag (VLLM_REPO / VLLM_TAG are overridable build args).
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# Install vLLM in editable mode with the `audio` extra and VLLM_TARGET_DEVICE="empty".
# On x86, triton is installed by vLLM, but it does not work correctly on Ascend, so it is uninstalled right after.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend in editable mode after loading the Ascend toolkit and ATB environments.
# The `devlib` directory (location of `libascend_hal.so`) is appended to LD_LIBRARY_PATH for the build.
# NOTE(review): `source` is a bash builtin and this file sets no SHELL instruction; this relies on the
# shell inherited from the base image being bash-compatible - TODO confirm.
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode); ray is pinned to the 2.47.1..2.48.0 range.
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
# Default to an interactive shell when the container starts.
CMD ["/bin/bash"]

View File

@@ -15,16 +15,14 @@
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.2.rc1-910b-openeuler24.03-py3.11
FROM quay.io/ascend/cann:8.3.rc2-910b-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ARG MOONCAKE_TAG="v0.3.7.post2"
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN yum update -y && \
yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
rm -rf /var/cache/yum
RUN pip config set global.index-url ${PIP_INDEX_URL}
@@ -32,13 +30,29 @@ WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
SHELL ["/bin/bash", "-c"]
RUN yum update -y && \
yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
ARCH=$(uname -m) && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/devlib:/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/lib64:$LD_LIBRARY_PATH && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/${ARCH}-openEuler-linux && \
cd /vllm-workspace/Mooncake && \
bash mooncake_installer.sh -y && \
mkdir -p build && cd build && cmake .. -DUSE_ASCEND_DIRECT=ON && \
make -j$(nproc) && make install && \
rm -fr /vllm-workspace/Mooncake/build && \
rm -rf /var/cache/yum/*
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
@@ -52,7 +66,7 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

91
README-vllm-ascend.md Normal file
View File

@@ -0,0 +1,91 @@
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm-ascend/main/docs/source/logos/vllm-ascend-logo-text-dark.png">
<img alt="vllm-ascend" src="https://raw.githubusercontent.com/vllm-project/vllm-ascend/main/docs/source/logos/vllm-ascend-logo-text-light.png" width=55%>
</picture>
</p>
<h3 align="center">
vLLM Ascend Plugin
</h3>
<p align="center">
| <a href="https://www.hiascend.com/en/"><b>About Ascend</b></a> | <a href="https://vllm-ascend.readthedocs.io/en/latest/"><b>Documentation</b></a> | <a href="https://slack.vllm.ai"><b>#sig-ascend</b></a> | <a href="https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support"><b>Users Forum</b></a> | <a href="https://tinyurl.com/vllm-ascend-meeting"><b>Weekly Meeting</b></a> |
</p>
<p align="center">
<a ><b>English</b></a> | <a href="README.zh.md"><b>中文</b></a>
</p>
---
*Latest News* 🔥
- [2025/09] We released the new official version [v0.9.1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.9.1)! Please follow the [official guide](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/tutorials/large_scale_ep.html) to start deploying large-scale Expert Parallelism (EP) on Ascend.
- [2025/08] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/7n8OYNrCC_I9SJaybHA_-Q) with vLLM and Tencent! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
- [2025/06] [User stories](https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html) page is now live! It kicks off with LLaMA-Factory/verl/TRL/GPUStack to demonstrate how vLLM Ascend assists Ascend users in enhancing their experience across fine-tuning, evaluation, reinforcement learning (RL), and deployment scenarios.
- [2025/06] [Contributors](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html) page is now live! All contributions deserve to be recorded; thanks to all contributors.
- [2025/05] We've released the first official version [v0.7.3](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3)! We collaborated with the vLLM community to publish a blog post sharing our practice: [Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html).
- [2025/03] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/VtxO9WXa5fC-mKqlxNUJUQ) with vLLM team! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
- [2025/02] vLLM community officially created [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend) repo for running vLLM seamlessly on the Ascend NPU.
- [2024/12] We are working with the vLLM community to support [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162).
---
## Overview
vLLM Ascend (`vllm-ascend`) is a community maintained hardware plugin for running vLLM seamlessly on the Ascend NPU.
It is the recommended approach for supporting the Ascend backend within the vLLM community. It adheres to the principles outlined in the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162), providing a hardware-pluggable interface that decouples the integration of the Ascend NPU with vLLM.
By using vLLM Ascend plugin, popular open-source models, including Transformer-like, Mixture-of-Expert, Embedding, Multi-modal LLMs can run seamlessly on the Ascend NPU.
## Prerequisites
- Hardware: Atlas 800I A2 Inference series, Atlas A2 Training series, Atlas 800I A3 Inference series, Atlas A3 Training series, Atlas 300I Duo (Experimental)
- OS: Linux
- Software:
* Python >= 3.9, < 3.12
* CANN >= 8.3.rc1 (Ascend HDK version refers to [here](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/releasenote/releasenote_0000.html))
* PyTorch == 2.7.1, torch-npu == 2.7.1
* vLLM (the same version as vllm-ascend)
## Getting Started
Please use the following recommended versions to get started quickly:
| Version | Release type | Doc |
|------------|--------------|--------------------------------------|
|v0.11.0rc0|Latest release candidate|[QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
|v0.9.1|Latest stable version|[QuickStart](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html) for more details|
## Contributing
See [CONTRIBUTING](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html) for more details, which is a step-by-step guide to help you set up development environment, build and test.
We welcome and value any contributions and collaborations:
- Please let us know if you encounter a bug by [filing an issue](https://github.com/vllm-project/vllm-ascend/issues)
- Please use [User forum](https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support) for usage questions and help.
## Branch
vllm-ascend has main branch and dev branch.
- **main**: main branch, corresponds to the vLLM main branch, and is continuously monitored for quality through Ascend CI.
- **vX.Y.Z-dev**: development branch, created with part of new releases of vLLM. For example, `v0.7.3-dev` is the dev branch for vLLM `v0.7.3` version.
Below are the maintained branches:
| Branch | Status | Note |
|------------|--------------|--------------------------------------|
| main | Maintained | CI commitment for vLLM main branch and vLLM v0.11.0 tag |
| v0.7.1-dev | Unmaintained | Only doc fixes are allowed |
| v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3 version, only bug fix is allowed and no new release tag any more. |
| v0.9.1-dev | Maintained | CI commitment for vLLM 0.9.1 version |
| rfc/feature-name | Maintained | [Feature branches](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html#feature-branches) for collaboration |
Please refer to [Versioning policy](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html) for more details.
## Weekly Meeting
- vLLM Ascend Weekly Meeting: https://tinyurl.com/vllm-ascend-meeting
- Wednesday, 15:00 - 16:00 (UTC+8, [Convert to your timezone](https://dateful.com/convert/gmt8?t=15))
## License
Apache License 2.0, as found in the [LICENSE](./LICENSE) file.

View File

@@ -1,90 +1,50 @@
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm-ascend/main/docs/source/logos/vllm-ascend-logo-text-dark.png">
<img alt="vllm-ascend" src="https://raw.githubusercontent.com/vllm-project/vllm-ascend/main/docs/source/logos/vllm-ascend-logo-text-light.png" width=55%>
</picture>
</p>
# vLLM-Ascend Multi-LLM Serving Support
<h3 align="center">
vLLM Ascend Plugin
</h3>
<p align="center">
| <a href="https://www.hiascend.com/en/"><b>About Ascend</b></a> | <a href="https://vllm-ascend.readthedocs.io/en/latest/"><b>Documentation</b></a> | <a href="https://slack.vllm.ai"><b>#sig-ascend</b></a> | <a href="https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support"><b>Users Forum</b></a> | <a href="https://tinyurl.com/vllm-ascend-meeting"><b>Weekly Meeting</b></a> |
</p>
<p align="center">
<a ><b>English</b></a> | <a href="README.zh.md"><b>中文</b></a>
</p>
---
*Latest News* 🔥
- [2025/09] We released the new official version [v0.9.1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.9.1)! Please follow the [official guide](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/tutorials/large_scale_ep.html) to start deploy large scale Expert Parallelism (EP) on Ascend.
- [2025/08] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/7n8OYNrCC_I9SJaybHA_-Q) with vLLM and Tencent! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
- [2025/06] [User stories](https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html) page is now live! It kicks off with LLaMA-Factory/verl//TRL/GPUStack to demonstrate how vLLM Ascend assists Ascend users in enhancing their experience across fine-tuning, evaluation, reinforcement learning (RL), and deployment scenarios.
- [2025/06] [Contributors](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html) page is now live! All contributions deserve to be recorded, thanks for all contributors.
- [2025/05] We've released first official version [v0.7.3](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3)! We collaborated with the vLLM community to publish a blog post sharing our practice: [Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html).
- [2025/03] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/VtxO9WXa5fC-mKqlxNUJUQ) with vLLM team! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
- [2025/02] vLLM community officially created [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend) repo for running vLLM seamlessly on the Ascend NPU.
- [2024/12] We are working with the vLLM community to support [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162).
---
## Overview
vLLM Ascend (`vllm-ascend`) is a community maintained hardware plugin for running vLLM seamlessly on the Ascend NPU.
This repository is a modified version of [vLLM-Ascend](https://github.com/vllm-project/vllm-ascend) designed to enable multiple large language models (LLMs) to share one Ascend NPU.
It is the recommended approach for supporting the Ascend backend within the vLLM community. It adheres to the principles outlined in the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162), providing a hardware-pluggable interface that decouples the integration of the Ascend NPU with vLLM.
The key feature of this project is efficient memory coordination, enabling multiple vLLM instances to share and dynamically hold the Ascend NPU's physical memory.
When an instance is idle, model parameters are offloaded to host memory.
Upon a new inference request, the model parameters are quickly restored to the NPU's memory (if they are not already resident), without the need to re-initialize the engine and load the model from scratch. (For Qwen3-8B, an actual restore adds only 0.3s of latency to TTFT.)
By using vLLM Ascend plugin, popular open-source models, including Transformer-like, Mixture-of-Expert, Embedding, Multi-modal LLMs can run seamlessly on the Ascend NPU.
## Prerequisites
## Features
- Hardware: Atlas 800I A2 Inference series, Atlas A2 Training series, Atlas 800I A3 Inference series, Atlas A3 Training series, Atlas 300I Duo (Experimental)
- OS: Linux
- Software:
* Python >= 3.9, < 3.12
* CANN >= 8.2.rc1 (Ascend HDK version refers to [here](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html))
* PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724
* vLLM (the same version as vllm-ascend)
- **Shared NPU Usage**: Multiple vLLM instances can access the same Ascend NPU, allowing for multi-LLM serving of different LLMs.
- **Fast Memory Restore**: We decouple virtual and physical memory allocation. Physical NPU memory is allocated, exported, and shared with other LLM engines, so LLM engines can restore quickly without reinitialization or memory allocation.
## Getting Started
Please use the following recommended versions to get started quickly:
## Installation
| Version | Release type | Doc |
|------------|--------------|--------------------------------------|
|v0.11.0rc0|Latest release candidate|[QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
|v0.9.1|Latest stable version|[QuickStart](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html) for more details|
### Build from Dockerfile
## Contributing
See [CONTRIBUTING](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html) for more details, which is a step-by-step guide to help you set up development environment, build and test.
Clone this repository, then build the image from the repository root:
We welcome and value any contributions and collaborations:
- Please let us know if you encounter a bug by [filing an issue](https://github.com/vllm-project/vllm-ascend/issues)
- Please use [User forum](https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support) for usage questions and help.
```bash
docker build -t vllm-ascend-multi-llm:latest -f ./Dockerfile .
```
## Branch
## Usage
vllm-ascend has main branch and dev branch.
> [!NOTE]
> Some platforms may not allow multiple containers to share the same Ascend NPU. You may try using a privileged container to bypass this restriction: mount all NPUs, and set the `ASCEND_RT_VISIBLE_DEVICES` environment variable to specify the target device to use.
- **main**: main branchcorresponds to the vLLM main branch, and is continuously monitored for quality through Ascend CI.
- **vX.Y.Z-dev**: development branch, created with part of new releases of vLLM. For example, `v0.7.3-dev` is the dev branch for vLLM `v0.7.3` version.
0. To share an NPU, processes coordinate via shared memory (shm), so all containers must be started with `ipc=host`.
1. Start a daemon process in a standalone container, by running `vllm_vnpu_daemon` installed inside the image.
2. Start LLM services with this image, following the official usage instructions.
Below is maintained branches:
| Branch | Status | Note |
|------------|--------------|--------------------------------------|
| main | Maintained | CI commitment for vLLM main branch and vLLM v0.11.0 tag |
| v0.7.1-dev | Unmaintained | Only doc fixed is allowed |
| v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3 version, only bug fix is allowed and no new release tag any more. |
| v0.9.1-dev | Maintained | CI commitment for vLLM 0.9.1 version |
| rfc/feature-name | Maintained | [Feature branches](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html#feature-branches) for collaboration |
## Limitations
Please refer to [Versioning policy](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html) for more details.
- This project currently only supports sharing a single NPU. This is also limited by the fact that HCCL cannot be shared. We haven't figured out how to bypass HCCL. *Help wanted*.
- The prefix cache will be reset when the LLM is restored, since we just simply discard the KV cache when the LLM is offloaded.
## Weekly Meeting
- vLLM Ascend Weekly Meeting: https://tinyurl.com/vllm-ascend-meeting
- Wednesday, 15:00 - 16:00 (UTC+8, [Convert to your timezone](https://dateful.com/convert/gmt8?t=15))
## Roadmap
- [ ] Space-sharing.
- [ ] ...
## License

View File

@@ -43,8 +43,8 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NP
- 操作系统Linux
- 软件:
* Python >= 3.9, < 3.12
* CANN >= 8.2.rc1 (Ascend HDK 版本参考[这里](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html))
* PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724
* CANN >= 8.3.rc1 (Ascend HDK 版本参考[这里](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/releasenote/releasenote_0000.html))
* PyTorch == 2.7.1, torch-npu == 2.7.1
* vLLM (与vllm-ascend版本一致)
## 开始使用

View File

@@ -0,0 +1,123 @@
#include <iostream>
#include <string>
#include "acl/acl.h"
#include "kernel_tiling/kernel_tiling.h"
#include "tiling/platform/platform_ascendc.h"
#include "tiling/tiling_data.h"
#include "common_tiling.h"
namespace bmm_trans {
using namespace pp_matmul;
// Quantization-mode names accepted for the `quant_mode` argument, mapped to numeric codes.
// batch_matmul_transpose_tiling casts the code to MatMul::QuantMode.
std::unordered_map<c10::string_view, uint16_t> quantModeMap = {
{"per_channel_symm", 0},
{"per_channel_asymm", 1},
{"per_token_symm", 2},
};
// Layout names accepted for the `format_mode` argument, mapped to numeric codes.
// batch_matmul_transpose_tiling casts the code to TensorFormat.
std::unordered_map<c10::string_view, uint16_t> formatModeMap = {
{"ND", 0},
{"NZ", 1},
};
// Translation from the torch scalar types supported by this op (BFloat16, Half) to the
// TensorDType values used by the tiling code.
std::unordered_map<c10::ScalarType, TensorDType> atType2tensorDType = {
{at::ScalarType::BFloat16, TensorDType::TENSOR_DTYPE_BF16},
{at::ScalarType::Half, TensorDType::TENSOR_DTYPE_FLOAT16}};
// batch size -> memory index
// Number of tiling slots in the static device buffer used by batch_matmul_transpose_tiling;
// the slot index there is (m - 1), where m is tensor_a.size(0), and must be below this value.
constexpr uint32_t MAX_CAPTURE_NUM = 1024;
template <typename MapType>
inline int GetModeVal(const MapType &mode_map, c10::optional<c10::string_view> mode_opt, c10::string_view default_mode,
const char *mode_name)
{
std::string modeStr(mode_name);
c10::string_view mode_str = mode_opt.value_or(default_mode);
auto it = mode_map.find(mode_str);
// if input mode is unsupported, use default value
TORCH_CHECK(it != mode_map.end(), modeStr, c10::str(": Unsupported mode value ", mode_str));
return it->second;
}
/**
 * Computes the PpMatmul tiling for a batched, transposed matmul and stages it in device memory.
 *
 * Shape contract enforced below:
 *   - tensor_a is 3-dim and read as [m, batch, k];
 *   - "ND" format: tensor_b is 3-dim and read as [batch, k, n];
 *   - "NZ" format: tensor_b is 4-dim with batch = shape[0], k = shape[2], n = shape[1] * shape[3].
 * All three tensors must share one dtype, which must be Half or BFloat16. tensor_c is only
 * inspected for its dtype; its shape is not validated here.
 *
 * The tiling struct is copied into slot (m - 1) of a function-static device buffer holding
 * MAX_CAPTURE_NUM slots, so m must lie in [1, MAX_CAPTURE_NUM].
 *
 * @param tensor_a     left operand
 * @param tensor_b     right operand (ND or NZ layout, see above)
 * @param tensor_c     output tensor (dtype check only)
 * @param format_mode  key of formatModeMap, defaults to "ND"
 * @param quant_mode   key of quantModeMap, defaults to "per_channel_symm"
 * @return tuple of (byte tensor viewing the staged tiling slot, block_dim as filled in by
 *         GetPpMatmulTiling)
 *
 * Fails through TORCH_CHECK on dtype/shape violations, unsupported mode names, an out-of-range
 * m, or a failed host-to-device copy of the tiling data.
 */
std::tuple<at::Tensor, uint32_t> batch_matmul_transpose_tiling(const at::Tensor &tensor_a, const at::Tensor &tensor_b, at::Tensor &tensor_c,
c10::optional<c10::string_view> format_mode,
c10::optional<c10::string_view> quant_mode)
{
    auto tensorAShape = tensor_a.sizes();
    auto tensorBShape = tensor_b.sizes();
    uint32_t n = 0;
    uint32_t block_dim = 0;

    HardwareInfo hwInfo;
    const std::map<c10::ScalarType, float> dTypeMap = {{at::ScalarType::Half, 2.0}, {at::ScalarType::BFloat16, 2.0}};

    at::ScalarType aType = tensor_a.scalar_type();
    at::ScalarType bType = tensor_b.scalar_type();
    at::ScalarType cType = tensor_c.scalar_type();
    TORCH_CHECK(aType == bType && bType == cType, "tensor type is not the same");
    TORCH_CHECK((aType == at::ScalarType::BFloat16) || (aType == at::ScalarType::Half),
                "tensor type only support half or bf16");

    TensorFormat formatMode = static_cast<TensorFormat>(GetModeVal(formatModeMap, format_mode, "ND", "format_mode"));
    MatMul::QuantMode quantMode =
        static_cast<MatMul::QuantMode>(GetModeVal(quantModeMap, quant_mode, "per_channel_symm", "quant_mode"));

    // This is a rank check on tensor_a; the message now says so (it used to mention batch size).
    TORCH_CHECK(tensorAShape.size() == 3, "tensor_a shape should be dim3");
    if (formatMode == TensorFormat::TENSOR_FORMAT_ND) {
        TORCH_CHECK(tensorBShape.size() == 3, "tensor shape should be dim3 in ND format");
        TORCH_CHECK(tensorAShape[2] == tensorBShape[1], "tensor shape is wrong");
        n = tensorBShape[2];
    } else {
        TORCH_CHECK(tensorBShape.size() == 4, "tensor shape should be dim4 in nz format");
        TORCH_CHECK(tensorAShape[2] == tensorBShape[2], "tensor shape is wrong");
        n = tensorBShape[1] * tensorBShape[3];
    }
    TORCH_CHECK(tensorAShape[1] == tensorBShape[0], "tensor shape is wrong");

    OpShape opShape = {.batchSize = static_cast<uint32_t>(tensorAShape[1]),
                       .m = static_cast<uint32_t>(tensorAShape[0]),
                       .k = static_cast<uint32_t>(tensorAShape[2]),
                       .n = n};
    pp_matmul::PpMatmulTilingData matmulTilingData = {
        .opShape = opShape,
    };

    // at() instead of operator[]: the dtype was validated above, and a lookup must never
    // insert a default entry into the shared table.
    auto dType = atType2tensorDType.at(aType);
    MatMulInfo mmInfo = {.batchSize = opShape.batchSize,
                         .m = opShape.m,
                         .k = opShape.k,
                         .n = opShape.n,
                         .dtypeA = dType,
                         .dtypeB = dType,
                         .dtypeC = dType,
                         .formatB = formatMode,
                         .mmType = MatMul::MatMulType::MATMUL_EIN_SUM,
                         .inDtype = dTypeMap.at(aType),
                         .outDtype = dTypeMap.at(cType),
                         .quantMode = quantMode};
    GetPpMatmulTiling(mmInfo, hwInfo, block_dim, matmulTilingData);
    host_utils::PpMatmulTilingCheck(matmulTilingData);

    // Stage the tiling struct in slot (m - 1) of a buffer that is allocated once, on the device
    // of the first tensor_a seen by this function.
    // NOTE(review): later calls with tensors on a different device reuse the same buffer - confirm
    // that this is intended.
    int32_t batchIdx = opShape.m - 1;
    uint32_t tilingSize = sizeof(pp_matmul::PpMatmulTilingData);
    static auto global_tiling_data = at::empty(
        {tilingSize * MAX_CAPTURE_NUM}, at::TensorOptions().dtype(at::kByte).device(tensor_a.options().device()));
    TORCH_CHECK(batchIdx >= 0 && static_cast<uint32_t>(batchIdx) < MAX_CAPTURE_NUM,
                "batchIdx is out of range: ", batchIdx);

    uint8_t *slotPtr = global_tiling_data.data_ptr<uint8_t>() + (tilingSize * batchIdx);
    // A failed copy would leave stale or uninitialized tiling in the slot, so surface the error.
    const aclError copyRet = aclrtMemcpy(slotPtr, tilingSize, &matmulTilingData, tilingSize, ACL_MEMCPY_HOST_TO_DEVICE);
    TORCH_CHECK(copyRet == ACL_SUCCESS, "aclrtMemcpy of tiling data failed, error code: ", copyRet);

    // NOTE(review): from_blob is called without device options, exactly as before - confirm that
    // consumers only use the raw data pointer of this view.
    at::Tensor tiling_tensor = at::from_blob(slotPtr, tilingSize, at::kByte);
    return std::make_tuple(tiling_tensor, block_dim);
}
}

View File

@@ -0,0 +1,57 @@
// Licensed under the BSD 3-Clause License (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef UTILS_COMMON_H
#define UTILS_COMMON_H
namespace host_utils {
// Element-count granularities used by the two rounding helpers below.
// Presumably the number of int64 / int32 elements per 32-byte block - TODO confirm.
constexpr uint32_t BLK_SIZE_ALIN_FOR_INT64 = 4;
constexpr uint32_t BLK_SIZE_ALIN_FOR_INT32 = 8;

// Rounds `count` up to the nearest multiple of BLK_SIZE_ALIN_FOR_INT64.
inline uint64_t alinInt64Count(uint64_t count)
{
    const uint64_t groups = (count + BLK_SIZE_ALIN_FOR_INT64 - 1) / BLK_SIZE_ALIN_FOR_INT64;
    return groups * BLK_SIZE_ALIN_FOR_INT64;
}

// Rounds `count` up to the nearest multiple of BLK_SIZE_ALIN_FOR_INT32.
inline uint64_t alinInt32Count(uint64_t count)
{
    const uint64_t groups = (count + BLK_SIZE_ALIN_FOR_INT32 - 1) / BLK_SIZE_ALIN_FOR_INT32;
    return groups * BLK_SIZE_ALIN_FOR_INT32;
}
// Ceiling division: smallest quotient q with q * divisor >= dividend (for non-overflowing
// unsigned inputs). A zero divisor does not trap; UINT32_MAX, converted to T, is returned.
template <typename T>
inline T CeilDiv(const T dividend, const T divisor)
{
    if (divisor != 0) {
        return (dividend + divisor - 1) / divisor;
    }
    return UINT32_MAX;
}
// Rounds `val` up to the nearest multiple of `align` (16 by default).
// Returns 0 when `align` is zero or when adding the padding makes the value wrap below `val`.
template <typename T>
inline T RoundUp(const T val, const T align = 16)
{
    if (align == 0) {
        return 0;
    }
    const auto padded = val + align - 1;
    if (padded < val) {
        return 0;
    }
    return padded / align * align;
}
// Rounds `val` down to the nearest multiple of `align` (16 by default).
// A zero `align` yields 0 instead of dividing by zero.
template <typename T>
inline T RoundDown(const T val, const T align = 16)
{
    if (align != 0) {
        return val / align * align;
    }
    return 0;
}
} // namespace host_utils
#endif // UTILS_COMMON_H

View File

@@ -0,0 +1,239 @@
/*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#ifndef COMMMON_TILING_H
#define COMMMON_TILING_H
#include <iostream>
#include <cmath>
#include "common.h"
#include "tiling/platform/platform_ascendc.h"
namespace host_utils {
// Constants shared by the host-side matmul tiling helpers in this header.
// Units are not stated here; the *_SIZE element sizes and *_BUFFER_LEN / UB_LIMIT values
// look like byte counts - TODO confirm.

// Per-element sizes; CostFunc multiplies element counts by FP16_SIZE before comparing with l2Size.
constexpr uint32_t FP16_SIZE = 2;
constexpr uint32_t FP32_SIZE = 4;
// Basic block edge; GetN0TilingInit / GetN0TilingLimit scale tilingN by BLOCK_SIZE.
constexpr uint32_t BLOCK_SIZE = 16;
constexpr uint32_t BLOCK_SIZE_INT8_K = 32;
constexpr uint32_t BASE_BLOCK_STEP = 2;
// Upper bounds returned by GetN0TilingLimit for the n0 tiling axis.
constexpr uint32_t AXES_ALIGN_SIZE = 512;
constexpr uint32_t AXES_ALIGN_SIZE_INT8 = 256;
// Presumably the number of dimensions of an ND / NZ shape - TODO confirm.
constexpr uint32_t ND_SHAPE_SIZE = 2;
constexpr uint32_t NZ_SHAPE_SIZE = 4;
constexpr uint32_t CUBE_BLOCK_SIZE = 256;
constexpr uint32_t CUBE_BLOCK_SIZE_INT8 = 512;
// Ping-pong buffer lengths (262144 = 256 * 1024).
constexpr uint32_t L1AB_PINGPONG_BUFFER_LEN = 262144;
constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_INT8 = 131072 * 2; // 256 KB
constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_FP16 = 131072; // 128 KB
constexpr uint32_t L1AB_PINGPONG_BUFFER_LEN_INT8_SPARSE = 160 * 1024;
// Basic-block size above which IsExceedTilingLimit rejects a tiling on ASCEND_910A.
constexpr uint32_t UB_LIMIT_SIZE_910A = 128 * 1024;
// SoC families distinguished by the tiling helpers; PLATFORM_INVALID is the sentinel value.
enum class PlatformType { ASCEND_310P, ASCEND_910A, ASCEND_910B, ASCEND_910C, PLATFORM_INVALID };
struct PlatformInfo {
public:
static const PlatformInfo &Instance()
{
static PlatformInfo platformInfo;
return platformInfo;
}
PlatformType socType;
uint32_t coreNum;
uint32_t coreNumAic;
uint32_t coreNumAiv;
uint64_t ubSize;
uint64_t l1Size;
uint64_t l2Size;
uint64_t l0aSize;
uint64_t l0bSize;
uint64_t l0cSize;
private:
PlatformInfo()
{
auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance();
// TODO Hard coding set to 910_93xx, parse using aclrtGetSocName is better
socType = PlatformType::ASCEND_910C;
coreNum = ascendcPlatform->GetCoreNum();
coreNumAic = ascendcPlatform->GetCoreNumAic();
coreNumAiv = ascendcPlatform->GetCoreNumAiv();
ascendcPlatform->GetCoreMemSize(platform_ascendc::CoreMemType::UB, ubSize);
ascendcPlatform->GetCoreMemSize(platform_ascendc::CoreMemType::L1, l1Size);
ascendcPlatform->GetCoreMemSize(platform_ascendc::CoreMemType::L2, l2Size);
ascendcPlatform->GetCoreMemSize(platform_ascendc::CoreMemType::L0_A, l0aSize);
ascendcPlatform->GetCoreMemSize(platform_ascendc::CoreMemType::L0_B, l0bSize);
ascendcPlatform->GetCoreMemSize(platform_ascendc::CoreMemType::L0_C, l0cSize);
}
PlatformInfo(const PlatformInfo &) = delete;
PlatformInfo &operator=(const PlatformInfo &) = delete;
PlatformInfo(PlatformInfo &&) = delete;
PlatformInfo &operator=(PlatformInfo &&) = delete;
};
// Upper bound for the n0 tiling axis.
//   - compressFlag set: tilingN * BLOCK_SIZE, capped at AXES_ALIGN_SIZE_INT8.
//   - otherwise: AXES_ALIGN_SIZE on ASCEND_310P / ASCEND_910A, AXES_ALIGN_SIZE_INT8 on every
//     other platform.
inline __attribute__((always_inline)) uint32_t GetN0TilingLimit(bool compressFlag, uint32_t tilingN,
const PlatformType &platformType)
{
    if (compressFlag) {
        return std::min(tilingN * BLOCK_SIZE, AXES_ALIGN_SIZE_INT8);
    }
    const bool usesWideLimit =
        platformType == PlatformType::ASCEND_310P || platformType == PlatformType::ASCEND_910A;
    if (usesWideLimit) {
        return AXES_ALIGN_SIZE;
    }
    return AXES_ALIGN_SIZE_INT8;
}
// Returns the first n0 candidate for the tiling search.
// Without compressFlag this is BLOCK_SIZE. With it, the start is tilingN * BLOCK_SIZE, unless that
// exceeds opShape.n, in which case opShape.n rounded up to a multiple of 16 is used instead.
template <typename OpShareType>
inline __attribute__((always_inline)) uint32_t GetN0TilingInit(const OpShareType &opShape, bool compressFlag,
                                                               uint32_t tilingN)
{
    if (!compressFlag) {
        return BLOCK_SIZE;
    }
    const uint32_t alignment = 16;
    if (tilingN * BLOCK_SIZE > opShape.n) {
        return RoundUp<uint32_t>(opShape.n, alignment);
    }
    return tilingN * BLOCK_SIZE;
}
// Tells whether a candidate tile must be rejected.
// The value compared against n0TilingLimit is axes0 when PRI_FLAG is true and priAxes0 otherwise.
// On ASCEND_910A the tile is additionally rejected when basicBlockSize exceeds UB_LIMIT_SIZE_910A.
template <bool PRI_FLAG>
inline __attribute__((always_inline)) bool IsExceedTilingLimit(uint32_t axes0, uint32_t priAxes0,
                                                               uint32_t n0TilingLimit, PlatformType platformType,
                                                               uint32_t basicBlockSize)
{
    const uint32_t limitedAxis = PRI_FLAG ? axes0 : priAxes0;
    if (limitedAxis > n0TilingLimit) {
        return true;
    }
    const bool is910A = (platformType == PlatformType::ASCEND_910A);
    return is910A && basicBlockSize > UB_LIMIT_SIZE_910A;
}
// Stores a (primary, secondary) tile pair into opShape.m0 / opShape.n0.
// PRI_FLAG == true  : m0 <- priAxes0, n0 <- axes0.
// PRI_FLAG == false : m0 <- axes0,    n0 <- priAxes0.
template <bool PRI_FLAG, typename OpShareType>
inline __attribute__((always_inline)) void SetOpShapeAxesInfo(OpShareType &opShape, uint32_t priAxes0, uint32_t axes0)
{
    if constexpr (PRI_FLAG) {
        opShape.m0 = priAxes0;
        opShape.n0 = axes0;
    } else {
        opShape.m0 = axes0;
        opShape.n0 = priAxes0;
    }
}
// Estimates the relative cost of the tile currently stored in shape.m0 / shape.n0; lower is better.
//
// hwInfor supplies coreNum, l2Size, l2BandWidth and hbmBandWidth; shape supplies batchSize, m, k, n, m0, n0.
// The result is 1 / (aCoef * n0) + 1 / (bCoef * m0). Each coefficient is 1 by default and is replaced by
// l2BandWidth / hbmBandWidth when the corresponding working set is larger than hwInfor.l2Size.
// Returns 1 when either loop count is zero.
template <typename HardwareType, typename OpShapeType>
inline __attribute__((always_inline)) float CostFunc(const HardwareType &hwInfor, OpShapeType &shape)
{
    float aCoef = 1;
    float bCoef = 1;
    float bwCoef = static_cast<float>(hwInfor.l2BandWidth) / static_cast<float>(hwInfor.hbmBandWidth);
    uint32_t mLoop = CeilDiv(shape.m, shape.m0);
    uint32_t nLoop = CeilDiv(shape.n, shape.n0);
    if (mLoop == 0 || nLoop == 0) {
        return 1;
    }
    uint32_t coreNeed = shape.batchSize * mLoop * nLoop;
    uint32_t blockDim = std::min(coreNeed, hwInfor.coreNum);
    uint32_t mOnce = blockDim < nLoop ? shape.m0 : blockDim / nLoop * shape.m0;
    uint32_t nOnce = blockDim < nLoop ? hwInfor.coreNum * shape.n0 : shape.n;
    // Working-set sizes are formed in 64 bits: rows * k * FP16_SIZE can exceed UINT32_MAX for large
    // n and k, and a wrapped value would wrongly compare as "fits in L2".
    const uint64_t aWorkingSet = static_cast<uint64_t>(mOnce) * shape.k * FP16_SIZE;
    const uint64_t bWorkingSet = static_cast<uint64_t>(nOnce) * shape.k * FP16_SIZE;
    if (aWorkingSet > hwInfor.l2Size) {
        aCoef = bwCoef;
    }
    if (bWorkingSet > hwInfor.l2Size) {
        bCoef = bwCoef;
    }
    return 1 / (aCoef * static_cast<float>(shape.n0)) + 1 / (bCoef * static_cast<float>(shape.m0));
}
// Searches candidate (m0, n0) tiles and commits every strict cost improvement to tilingParam via SetBaseOp.
//
// PRI_FLAG selects which dimension the outer loop walks: true -> opShape.m, false -> opShape.n.
// Candidates start at BLOCK_SIZE (or the GetN0TilingInit value for the n axis) and are multiplied by
// BASE_BLOCK_STEP each step. opShape.m0 / opShape.n0 are overwritten for every candidate that reaches CostFunc,
// so on return they hold the last evaluated candidate, not necessarily the best one.
// compressFlag / tilingN only feed GetN0TilingInit and GetN0TilingLimit.
template <bool PRI_FLAG, typename OpShareType, typename TilingType, typename HardwareType, typename MatMulInfoType>
void TilingFunc(OpShareType &opShape, TilingType &tilingParam, const HardwareType &hwInfor,
const MatMulInfoType &mmInfo, bool compressFlag = false, const uint32_t tilingN = 1)
{
// Only candidates whose cost is strictly below this starting value can be accepted.
float costMin = 1;
const float CONST_2 = 2.0;
const uint32_t ROUND_CONST_16 = 16;
// NOTE(review): `log` is the natural logarithm, so pow(2, ceil(log(x))) is not the next power of two
// above x - confirm whether log2 was intended before changing it, since it alters the chosen tiles.
uint32_t roundBase = static_cast<uint32_t>(
pow(2, ceil(log(CeilDiv(PRI_FLAG ? opShape.n : opShape.m, ROUND_CONST_16)))) * ROUND_CONST_16);
// Loop bounds: the primary axis rounded up to 16, the secondary axis rounded up to roundBase.
uint32_t priAxes = RoundUp<uint32_t>(PRI_FLAG ? opShape.m : opShape.n, ROUND_CONST_16);
uint32_t axes = RoundUp<uint32_t>(PRI_FLAG ? opShape.n : opShape.m, roundBase);
// Largest tile edge allowed on either axis; halved below for int8 on ASCEND_310P / ASCEND_910A.
float axes0Max = static_cast<float>(AXES_ALIGN_SIZE) / mmInfo.inDtype;
auto platformType = PlatformInfo::Instance().socType;
if (mmInfo.isInt8 && (platformType == PlatformType::ASCEND_310P || platformType == PlatformType::ASCEND_910A)) {
axes0Max /= CONST_2;
}
uint32_t n0TilingInit = GetN0TilingInit(opShape, compressFlag, tilingN);
uint32_t n0TilingLimit = GetN0TilingLimit(compressFlag, tilingN, platformType);
// Whichever loop walks the n axis starts from n0TilingInit; the other starts from BLOCK_SIZE.
uint32_t priAxes0Init = PRI_FLAG ? BLOCK_SIZE : n0TilingInit;
uint32_t axes0Init = PRI_FLAG ? n0TilingInit : BLOCK_SIZE;
for (uint32_t priAxes0 = priAxes0Init; priAxes0 <= priAxes && priAxes0 <= axes0Max; priAxes0 *= BASE_BLOCK_STEP) {
for (uint32_t axes0 = axes0Init; axes0 <= axes && axes0 <= axes0Max; axes0 *= BASE_BLOCK_STEP) {
// Skip tiles whose FP32_SIZE-per-element footprint exceeds hwInfor.l0cSize.
uint32_t basicBlockSize = priAxes0 * axes0 * FP32_SIZE;
if (basicBlockSize > hwInfor.l0cSize) {
continue;
}
// The n0 limit and the 910A block-size limit are enforced for int8 inputs only.
if (mmInfo.isInt8 &&
IsExceedTilingLimit<PRI_FLAG>(axes0, priAxes0, n0TilingLimit, platformType, basicBlockSize)) {
continue;
}
SetOpShapeAxesInfo<PRI_FLAG>(opShape, priAxes0, axes0);
float cost = CostFunc<HardwareType, OpShareType>(hwInfor, opShape);
// Ties keep the earlier candidate.
if (cost >= costMin) {
continue;
}
costMin = cost;
// PpMatmulTilingData::SetBaseOp takes the extra mmInfo argument; other tiling types do not.
if constexpr (std::is_same<TilingType, pp_matmul::PpMatmulTilingData>::value) {
tilingParam.SetBaseOp(hwInfor.coreNum, opShape.m0, opShape.n0, mmInfo);
} else {
tilingParam.SetBaseOp(hwInfor.coreNum, opShape.m0, opShape.n0);
}
}
}
}
// Chooses the swizzle direction and swizzle count for tilingData and writes them to
// tilingData.swizzlDirect / tilingData.swizzlCount.
//
// Every group width from 1 to blockDim is tried. For each width the direction is set to 1 (Nz) when
// width * n0 + m < m0 * c + n and to 0 (Zn) otherwise, where c = ceil(blockDim / width); the width with the
// lowest cost is remembered (ties replace the previous best only in the Nz case).
// The returned direction is the one computed for the last width tried, i.e. width == blockDim.
template <typename PpTilingDataType>
uint32_t Swizzl(PpTilingDataType &tilingData)
{
    const float tileM = tilingData.opShape.m0;
    const float tileN = tilingData.opShape.n0;
    const float fullM = tilingData.opShape.m;
    const float fullK = tilingData.opShape.k;
    const float fullN = tilingData.opShape.n;

    // Starting cost bound: m * k + k * n.
    float bestCost = fullM * fullK + fullK * fullN;
    uint32_t bestWidth = 1;
    uint32_t direction = 0;

    for (uint32_t width = 1; width <= tilingData.blockDim; ++width) {
        const int across = static_cast<int32_t>((tilingData.blockDim + width - 1) / width);
        // B0 + A < A0 + B
        const bool chooseNz = width * tileN + fullM < tileM * across + fullN;
        direction = chooseNz ? 1 : 0;
        const float cost = chooseNz ? (tileN * width + tileM * across) : (tileM * width + tileN * across);
        const bool accept = chooseNz ? (cost <= bestCost) : (cost < bestCost);
        if (accept) {
            bestCost = cost;
            bestWidth = width;
        }
    }

    tilingData.swizzlDirect = direction;
    tilingData.swizzlCount = bestWidth;
    return direction;
}
// Sanity-checks a finished tiling: every base tile size, every loop count and blockDim must be non-zero.
// Each violation is reported through TORCH_CHECK with a message naming the offending field.
template <typename PpTilingDataType>
inline __attribute__((always_inline)) void PpMatmulTilingCheck(const PpTilingDataType &tilingData)
{
    TORCH_CHECK(tilingData.opShape.m0 > 0, "m0 is invalid");
    TORCH_CHECK(tilingData.opShape.k0 > 0, "k0 is invalid");
    TORCH_CHECK(tilingData.opShape.n0 > 0, "n0 is invalid");
    TORCH_CHECK(tilingData.mLoop > 0, "mLoop is invalid");
    TORCH_CHECK(tilingData.kLoop > 0, "kLoop is invalid");
    TORCH_CHECK(tilingData.nLoop > 0, "nLoop is invalid");
    // The message previously said "nLoop", which pointed at the wrong field when blockDim was zero.
    TORCH_CHECK(tilingData.blockDim > 0, "blockDim is invalid");
}
} // namespace host_utils
#endif

View File

@@ -0,0 +1,155 @@
#include <map>
#include "tiling_data.h"
#include "common.h"
#include "common_tiling.h"
namespace pp_matmul {
// Amount subtracted from L1AB_PINGPONG_BUFFER_LEN for int8 inputs when PpMatmulTilingData::End computes k0.
constexpr uint32_t L1_DESCALE_BUFFER_LEN_MAX = 6144;
// Named numeric literals used by the tiling arithmetic in this file.
constexpr uint32_t CONST_3 = 3;
constexpr uint32_t CONST_4 = 4;
constexpr uint32_t CONST_16 = 16;
constexpr uint32_t CONST_32 = 32;
constexpr uint32_t CONST_256 = 256;
constexpr uint32_t CONST_512 = 512;
// Code written into each 3-bit dtype field of the tiling key (see SetTilingKey).
const std::map<TensorDType, uint32_t> G_DTYPE_MAP = {{TensorDType::TENSOR_DTYPE_FLOAT16, 1u},
{TensorDType::TENSOR_DTYPE_BF16, 2u}};
// Code written into each 1-bit format field of the tiling key (see SetTilingKey).
const std::map<TensorFormat, uint32_t> G_FORMAT_MAP = {{TensorFormat::TENSOR_FORMAT_ND, 0u},
{TensorFormat::TENSOR_FORMAT_NZ, 1u}};
// Short aliases for the nested MatMul enums.
using MmType = MatMul::MatMulType;
using QmType = MatMul::QuantMode;
// Brings the host_utils helpers used below (PlatformInfo, TilingFunc, Swizzl, ...) into scope.
using namespace host_utils;
// Returns true for int8 inputs whose output is BF16, or whose output is FLOAT16 with
// PER_TOKEN_SYMM quantisation. Any non-int8 configuration returns false.
bool IsI8Bf16Kernel(const MatMulInfo &mmInfo)
{
    if (!mmInfo.isInt8) {
        return false;
    }
    if (mmInfo.dtypeC == TensorDType::TENSOR_DTYPE_BF16) {
        return true;
    }
    return mmInfo.dtypeC == TensorDType::TENSOR_DTYPE_FLOAT16 && mmInfo.quantMode == QmType::PER_TOKEN_SYMM;
}
// Fills the hardware description from the shared PlatformInfo instance.
// coreNum is taken from coreNumAic; the memory sizes are copied from the matching PlatformInfo fields.
// The two bandwidth fields are fixed values: hbmBandWidth is 1 and l2BandWidth is 5.
HardwareInfo::HardwareInfo()
    : coreNum(PlatformInfo::Instance().coreNumAic),
      l2Size(PlatformInfo::Instance().l2Size),
      l1Size(PlatformInfo::Instance().l1Size),
      l0aSize(PlatformInfo::Instance().l0aSize),
      l0bSize(PlatformInfo::Instance().l0bSize),
      l0cSize(PlatformInfo::Instance().l0cSize),
      hbmBandWidth(1),
      l2BandWidth(5) // 5x faster than hbm.
{
}
void PpMatmulTilingData::SetBaseShape(uint32_t batchSize, uint32_t m, uint32_t k, uint32_t n)
{
opShape.batchSize = batchSize;
opShape.m = m;
opShape.k = k;
opShape.n = n;
}
// Commits a base tile (mBase x nBase) and derives mLoop, nLoop, coreLoop and blockDim from it.
// When the whole m dimension fits in one tile, B is transposed and the last wave of blocks would occupy
// fewer than coreNum / 4 * 3 cores, the tile is re-derived so that n is split more evenly across the cores.
// NOTE(review): coreNum == 0 makes `coreLoop % coreNum` a division by zero - confirm callers never pass 0.
void PpMatmulTilingData::SetBaseOp(uint32_t coreNum, uint32_t mBase, uint32_t nBase, const MatMulInfo &mmInfo)
{
opShape.m0 = mBase;
opShape.n0 = nBase;
mLoop = CeilDiv(opShape.m, opShape.m0);
nLoop = CeilDiv(opShape.n, opShape.n0);
coreLoop = opShape.batchSize * mLoop * nLoop;
if (mLoop == 1 && mmInfo.transB && coreLoop % coreNum < coreNum / CONST_4 * CONST_3) {
// m0 becomes m rounded up to a multiple of 16.
mBase = RoundUp<uint32_t>(opShape.m, CONST_16);
opShape.m0 = mBase;
// Largest n0 for which an mBase x n0 float tile fits in l0cSize.
uint32_t maxN0 = PlatformInfo::Instance().l0cSize / (mBase * sizeof(float));
// int8 and with-bias matmuls additionally cap n0 at 256.
if (mmInfo.isInt8 || mmInfo.mmType == MmType::MATMUL_WITH_BIAS) {
maxN0 = maxN0 < CONST_256 ? maxN0 : CONST_256;
}
// x: columns per core; y: tiles needed per core at maxN0; nBase: x / y rounded up to a multiple of 16.
uint32_t x = CeilDiv(opShape.n, coreNum);
uint32_t y = CeilDiv(x, maxN0);
nBase = RoundUp<uint32_t>(CeilDiv(x, y), CONST_16);
uint32_t rqdL0CSize = mBase * nBase * sizeof(float);
// Adopt the new n0 only if both size checks pass; m0 keeps the value assigned above either way.
if (rqdL0CSize < PlatformInfo::Instance().l0cSize &&
(mBase + nBase) * CONST_256 * sizeof(uint16_t) < L1AB_PINGPONG_BUFFER_LEN) {
opShape.n0 = nBase;
nLoop = CeilDiv(opShape.n, opShape.n0);
// mLoop is 1 on this path, so it is omitted from the product.
coreLoop = opShape.batchSize * nLoop;
}
}
blockDim = std::min(coreLoop, coreNum);
}
// transA transB quantMode [dtype] format
void PpMatmulTilingData::SetTilingKey(const MatMulInfo &mmInfo, uint32_t swizzleDirect, uint32_t enSplitK)
{
if (mmInfo.mmType == MmType::MATMUL_ACCUM_ATOMIC || mmInfo.mmType == MmType::MATMUL_WITH_BIAS ||
mmInfo.mmType == MmType::MATMUL_EIN_SUM || mmInfo.mmType == MmType::MATMUL_DEQUANT || IsI8Bf16Kernel(mmInfo)) {
// SwizzleDir[1] TransA[1] TransB[1] DtypeA[3] DtypeB[3] DtypeC[3] FormatA[1] FormatB[1] FormatC[1] WithBias[1]
tilingKey = swizzleDirect;
tilingKey = (tilingKey << 1) + static_cast<uint32_t>(mmInfo.transA);
tilingKey = (tilingKey << 1) + static_cast<uint32_t>(mmInfo.transB);
tilingKey = (tilingKey << 3) + G_DTYPE_MAP.at(mmInfo.dtypeA); // 3bit for dtypeA.
tilingKey = (tilingKey << 3) + G_DTYPE_MAP.at(mmInfo.dtypeB); // 3bit for dtypeB.
tilingKey = (tilingKey << 3) + G_DTYPE_MAP.at(mmInfo.dtypeC); // 3bit for dtypeC.
tilingKey = (tilingKey << 1) + G_FORMAT_MAP.at(mmInfo.formatA);
tilingKey = (tilingKey << 1) + G_FORMAT_MAP.at(mmInfo.formatB);
tilingKey = (tilingKey << 1) + G_FORMAT_MAP.at(mmInfo.formatC);
tilingKey = (tilingKey << 1) + static_cast<uint32_t>(mmInfo.biasFlag);
} else {
tilingKey = swizzleDirect;
tilingKey = (tilingKey << 1) + static_cast<uint32_t>(mmInfo.transA);
tilingKey = (tilingKey << 1) + static_cast<uint32_t>(mmInfo.transB);
tilingKey = (tilingKey << 1) + static_cast<uint32_t>(mmInfo.isInt8);
tilingKey = (tilingKey << 1) + static_cast<uint32_t>(mmInfo.biasFlag);
tilingKey = (tilingKey << 1) + enSplitK;
}
}
// Finalises the tiling: derives opShape.k0 from the L1 budget left after the chosen m0/n0, then kLoop.
//
// k0Max is (L1AB_PINGPONG_BUFFER_LEN - reserved bytes) / ((m0 + n0) * inDtype), where the reserved part is
// L1_DESCALE_BUFFER_LEN_MAX for int8 inputs or n0 * sizeof(float) for MATMUL_WITH_BIAS. k0 is k0Max rounded
// down to the block size and, above 512, to a multiple of 512.
// Returns blockDim.
uint32_t PpMatmulTilingData::End(const MatMulInfo &mmInfo)
{
    uint32_t cubeBlockSize = mmInfo.isInt8 ? CUBE_BLOCK_SIZE_INT8 : CUBE_BLOCK_SIZE;
    uint32_t kBlockSize = mmInfo.isInt8 ? BLOCK_SIZE_INT8_K : BLOCK_SIZE;
    uint32_t scaleBlockSize = mmInfo.isInt8 ? L1_DESCALE_BUFFER_LEN_MAX : 0;
    uint32_t shapeSum = opShape.m0 + opShape.n0;
    // For int8 with A transposed or B not transposed, both tile edges are rounded up to 32 first.
    if (mmInfo.isInt8 && (mmInfo.transA || !mmInfo.transB)) {
        shapeSum = RoundUp<uint32_t>(opShape.m0, CONST_32) + RoundUp<uint32_t>(opShape.n0, CONST_32);
    }
    uint32_t k0Max = shapeSum == 0
                         ? L1AB_PINGPONG_BUFFER_LEN
                         : static_cast<uint32_t>(static_cast<float>(L1AB_PINGPONG_BUFFER_LEN - scaleBlockSize) /
                                                 (shapeSum * mmInfo.inDtype));
    if (mmInfo.mmType == MatMul::MatMulType::MATMUL_WITH_BIAS) {
        uint32_t l1AbSize = L1AB_PINGPONG_BUFFER_LEN - opShape.n0 * sizeof(float);
        // Same zero guard as above: with m0 + n0 == 0 the division would be by zero and the float result
        // could not be converted back to uint32_t.
        k0Max = shapeSum == 0
                    ? l1AbSize
                    : static_cast<uint32_t>(static_cast<float>(l1AbSize) / (shapeSum * mmInfo.inDtype));
    }
    opShape.k0 =
        k0Max < cubeBlockSize ? RoundDown<uint32_t>(k0Max, kBlockSize) : RoundDown<uint32_t>(k0Max, cubeBlockSize);
    if (opShape.k0 > CONST_512) {
        opShape.k0 = RoundDown<uint32_t>(opShape.k0, CONST_512);
    }
    kLoop = CeilDiv(opShape.k, opShape.k0);
    return blockDim;
}
void GetPpMatmulTiling(const MatMulInfo &mmInfo, const HardwareInfo &hwInfo, uint32_t &blockDim,
PpMatmulTilingData &tilingData)
{
OpShape opShape;
opShape.batchSize = mmInfo.batchSize;
opShape.m = mmInfo.m;
opShape.n = mmInfo.n;
opShape.k = mmInfo.k;
tilingData.opShape = opShape;
tilingData.quantMode = static_cast<uint32_t>(mmInfo.quantMode);
tilingData.SetTilingKey(mmInfo, 0, 0); // init tilingkey with transA transB.
if (opShape.m < opShape.n) {
TilingFunc<false, OpShape, PpMatmulTilingData, HardwareInfo, MatMulInfo>(opShape, tilingData, hwInfo, mmInfo);
} else {
TilingFunc<true, OpShape, PpMatmulTilingData, HardwareInfo, MatMulInfo>(opShape, tilingData, hwInfo, mmInfo);
}
uint32_t direct = Swizzl<PpMatmulTilingData>(tilingData);
blockDim = tilingData.End(mmInfo);
tilingData.SetTilingKey(mmInfo, direct, 0);
}
} // namespace pp_matmul

View File

@@ -0,0 +1,90 @@
#ifndef PP_MATMUL_TILING_DATA
#define PP_MATMUL_TILING_DATA
#include <cstdint>
namespace pp_matmul {
// Scope holder for the matmul variant and quantisation-mode enums.
struct MatMul {
// Which matmul variant is being tiled.
enum class MatMulType : uint32_t {
MATMUL_DEFAULT = 0, // C = op(A) * op(B)
MATMUL_DEQUANT, // NOTE(review): presumably matmul followed by dequantisation - confirm
MATMUL_ACCUM_ATOMIC, // C += op(A) * op(B)
MATMUL_WITH_BIAS, // C = op(A) * op(B) + Bias, where Bias is a vector.
MATMUL_EIN_SUM
};
// Quantisation scheme; the numeric value is stored in PpMatmulTilingData::quantMode.
enum class QuantMode : uint32_t { PER_CHANNEL_SYMM = 0, PER_CHANNEL_ASYMM, PER_TOKEN_SYMM };
};
// Element types distinguished by the tiling key.
enum class TensorDType : uint32_t { TENSOR_DTYPE_FLOAT16 = 0, TENSOR_DTYPE_BF16 };
// Tensor memory layouts distinguished by the tiling key.
enum class TensorFormat : uint32_t { TENSOR_FORMAT_ND = 0, TENSOR_FORMAT_NZ };
// Caller-supplied description of one matmul: problem shape, dtypes, formats, variant and flags.
// All fields default to zero / false / the first enumerator.
struct MatMulInfo {
uint32_t batchSize{0};
uint32_t m{0}; // actual input m
uint32_t k{0}; // actual input k
uint32_t n{0}; // actual input n
TensorDType dtypeA{TensorDType::TENSOR_DTYPE_FLOAT16};
TensorDType dtypeB{TensorDType::TENSOR_DTYPE_FLOAT16};
TensorDType dtypeC{TensorDType::TENSOR_DTYPE_FLOAT16};
TensorFormat formatA{TensorFormat::TENSOR_FORMAT_ND};
TensorFormat formatB{TensorFormat::TENSOR_FORMAT_ND};
TensorFormat formatC{TensorFormat::TENSOR_FORMAT_ND};
MatMul::MatMulType mmType{MatMul::MatMulType::MATMUL_DEFAULT};
bool transA{0}; // false: 0, true: 1
bool transB{0}; // false: 0, true: 1
bool biasFlag{0}; // false: 0, true: 1
bool isInt8{0}; // false: 0, true: 1
// Used as a divisor when the tiling code turns byte budgets into element counts.
// NOTE(review): presumably the input element size in bytes; the default of 0 must be overwritten
// before tiling or those divisions are by zero - confirm against callers.
float inDtype{0};
float outDtype{0};
MatMul::QuantMode quantMode{MatMul::QuantMode::PER_CHANNEL_SYMM};
};
// Problem dimensions (batchSize, m, k, n) together with the tile sizes chosen for them (m0, k0, n0).
// This struct is the first member of PpMatmulTilingData, so its field order is part of that layout.
struct OpShape {
uint32_t batchSize{0};
uint32_t m{0};
uint32_t k{0};
uint32_t n{0};
uint32_t m0{0}; // tile size along m
uint32_t k0{0}; // tile size along k
uint32_t n0{0}; // tile size along n
};
// Hardware parameters consumed by the tiling search: core count, memory sizes and two bandwidth figures.
// The constructor (defined in the tiling source file) fills them from PlatformInfo.
// NOTE(review): the sizes are uint32_t here but uint64_t in PlatformInfo, so larger values would be truncated.
struct HardwareInfo {
uint32_t coreNum{0};
uint32_t l2Size{0};
uint32_t l1Size{0};
uint32_t l0aSize{0};
uint32_t l0bSize{0};
uint32_t l0cSize{0};
uint32_t hbmBandWidth{0};
uint32_t l2BandWidth{0};
HardwareInfo();
};
// Tiling result for one matmul. The struct is packed to 1-byte alignment and the kernel's Init reads
// the tiling buffer through a pointer to this type, so field order and sizes must not change.
#pragma pack(push, 1)
struct PpMatmulTilingData {
OpShape opShape{};
uint32_t mLoop{1}; // number of tiles along m
uint32_t kLoop{1}; // number of tiles along k
uint32_t nLoop{1}; // number of tiles along n
uint32_t coreLoop{1}; // batchSize * mLoop * nLoop (see SetBaseOp)
uint32_t swizzlCount{1}; // written by Swizzl
uint32_t tilingKey{0}; // written by SetTilingKey
uint32_t blockDim{1}; // min(coreLoop, coreNum) (see SetBaseOp)
uint32_t swizzlDirect{0}; // written by Swizzl: 0 = Zn, 1 = Nz
uint32_t splitk{0};
uint32_t enShuffleK{0};
uint32_t quantMode{0}; // numeric value of MatMul::QuantMode
// Stores batchSize / m / k / n into opShape.
void SetBaseShape(uint32_t batchSize, uint32_t m, uint32_t k, uint32_t n);
// Commits a base tile and derives mLoop, nLoop, coreLoop and blockDim.
void SetBaseOp(uint32_t coreNum, uint32_t mBase, uint32_t nBase, const MatMulInfo &mmInfo);
// Packs the kernel-selection key into tilingKey.
void SetTilingKey(const MatMulInfo &mmInfo, uint32_t swizzleDirect, uint32_t enSplitK);
// Derives opShape.k0 and kLoop; returns blockDim.
uint32_t End(const MatMulInfo &mmInfo);
};
#pragma pack(pop)
// Computes the tiling for the matmul described by mmInfo on the hardware described by hwInfo.
// tilingData receives the full result; blockDim receives the block count returned by PpMatmulTilingData::End.
void GetPpMatmulTiling(const MatMulInfo &mmInfo, const HardwareInfo &hwInfo, uint32_t &blockDim,
PpMatmulTilingData &tilingData);
} // namespace pp_matmul
#endif

View File

@@ -0,0 +1,825 @@
// Adapted from
// https://gitee.com/ascend/ascend-transformer-boost
//
// Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
// This file is a part of the CANN Open Software.
// Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
// Please refer to the License for details. You may not use this file except in compliance with the License.
// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
// See LICENSE in the root of the software repository for the full text of the License.
//
#define __aicore__ [aicore]
#include "kernel_operator.h"
#include "../op_host/tiling/tiling_data.h"
#include "../../mla_preprocess/op_kernel/kernel/common.h"
#include "../../mla_preprocess/op_kernel/kernel/hardware.h"
#include "../../mla_preprocess/op_kernel/kernel/mma.h"
#include "../../mla_preprocess/op_kernel/kernel/utils.h"
#include "../../mla_preprocess/op_kernel/kernel/iterator.h"
#include "../../kernels/math_utils.h"
// Offset between the two halves of the L0A / L0B ping-pong buffers; also the budget from which
// k_part_len is derived in Process.
// NOTE(review): used as a LocalTensor index, so presumably an element count rather than bytes - confirm.
constexpr uint32_t L0_PINGPONG_BUFFER_LEN = 16384;
// Offset between the two halves of the L1 ping-pong buffers (same unit caveat as above).
constexpr uint32_t L1_PINGPONG_BUFFER_LEN = 131072;
// Named numeric literals; the UL variants are uint64_t so they can be used with 64-bit RoundUp arguments.
constexpr uint32_t CONST_16 = 16;
constexpr uint32_t CONST_256 = 256;
constexpr uint64_t ND2NZ_STRIDE_LIMIT = 65536;
constexpr uint64_t BLOCK_SIZE_16 = 16;
constexpr uint64_t CONST_16UL = 16;
constexpr uint64_t CONST_256UL = 256;
// A triple of 64-bit values indexed by matmul axis (m, k, n); the kernel uses it both for tile
// counts and for tile coordinates. All components default to zero.
struct MatCoord {
uint64_t m{0};
uint64_t k{0};
uint64_t n{0};
};
using namespace device_utils;
template <uint32_t SwizzleDirect, bool TA, bool TB, typename InDtype = half, typename OutDtype = half,
DataFormat FormatB = DataFormat::ND>
class PpMatmulEinSum
{
using LocalTensor = AscendC::LocalTensor<InDtype>;
template <DataFormat srcFormat = DataFormat::ND, DataFormat dstFormat = DataFormat::ND>
using CopyGmToCbuf = gm_to_l1<ArchType::ASCEND_V220, InDtype, srcFormat, dstFormat>;
using LoadCbufToCa = l1_to_l0_a<ArchType::ASCEND_V220, InDtype, TA, DataFormat::ZN, DataFormat::ZZ>;
using LoadCbufToCb = l1_to_l0_b<ArchType::ASCEND_V220, InDtype, TB, DataFormat::ZN, DataFormat::NZ>;
using Mad = mmad<ArchType::ASCEND_V220, InDtype, InDtype, float, TA>;
using CopyCcToGm = l0c_to_gm<ArchType::ASCEND_V220, DataFormat::ND, OutDtype, float>;
public:
__aicore__ explicit PpMatmulEinSum(){};
// Prepares the kernel object for Process().
// a, b, c: global-memory addresses of the two inputs (read as InDtype) and the output (written as OutDtype).
// tiling_data: global-memory address read through a pp_matmul::PpMatmulTilingData pointer.
__aicore__ __force_inline__ void Init(__gm__ uint8_t *__restrict__ a, __gm__ uint8_t *__restrict__ b,
__gm__ uint8_t *__restrict__ c, __gm__ uint8_t *__restrict__ tiling_data)
{
gm_a.SetGlobalBuffer(reinterpret_cast<__gm__ InDtype *>(a));
gm_b.SetGlobalBuffer(reinterpret_cast<__gm__ InDtype *>(b));
gm_c.SetGlobalBuffer(reinterpret_cast<__gm__ OutDtype *>(c));
// Copy the tiling fields the kernel needs into members.
auto gm_tiling_data = reinterpret_cast<__gm__ pp_matmul::PpMatmulTilingData *>(tiling_data);
batch_size = gm_tiling_data->opShape.batchSize;
m = gm_tiling_data->opShape.m;
k = gm_tiling_data->opShape.k;
n = gm_tiling_data->opShape.n;
m0 = gm_tiling_data->opShape.m0;
k0 = gm_tiling_data->opShape.k0;
n0 = gm_tiling_data->opShape.n0;
// tdim holds the number of tiles along each axis.
tdim.m = gm_tiling_data->mLoop;
tdim.k = gm_tiling_data->kLoop;
tdim.n = gm_tiling_data->nLoop;
core_loop = gm_tiling_data->coreLoop;
swizzle_cnt = gm_tiling_data->swizzlCount;
en_shuffle_k = gm_tiling_data->enShuffleK;
AsdopsBuffer<ArchType::ASCEND_V220> buf;
// The A region starts at offset 0; the B region starts right after one m0 x k0 tile of InDtype,
// with that offset rounded up to a multiple of 256.
l1_base_a = buf.template GetBuffer<BufferType::ASCEND_CB, InDtype>(0);
l1_base_b = buf.template GetBuffer<BufferType::ASCEND_CB, InDtype>(
RoundUp<uint64_t>(m0 * k0 * sizeof(InDtype), CONST_256UL));
l0a_base = buf.template GetBuffer<BufferType::ASCEND_L0A, InDtype>(0);
l0b_base = buf.template GetBuffer<BufferType::ASCEND_L0B, InDtype>(0);
num_core = AscendC::GetBlockNum();
core_idx = AscendC::GetBlockIdx();
// Process() starts on the first half of each ping-pong buffer.
ping_flag = 1;
}
// Converts a linear block index into tile coordinates (tidx.m, tidx.n) using the swizzled traversal.
//
// Blocks inside a batch are grouped into stripes that are swizzle_cnt tiles wide along one axis
// (m for SwizzleDirect == 0 / Zn, n for SwizzleDirect == 1 / Nz) and span the whole other axis.
// The last stripe may be narrower. Odd-numbered stripes walk the spanning axis in reverse.
// tidx.k is not touched; for any other SwizzleDirect value tidx is left unchanged.
__aicore__ __force_inline__ void GetBlockIdx(uint64_t index, MatCoord &tidx)
{
    uint64_t in_batch_idx = index % (tdim.m * tdim.n);
    if constexpr (SwizzleDirect == 0 || SwizzleDirect == 1) {
        constexpr bool stripe_along_m = (SwizzleDirect == 0);
        const uint64_t stripe_axis_len = stripe_along_m ? tdim.m : tdim.n;
        const uint64_t span_axis_len = stripe_along_m ? tdim.n : tdim.m;
        const uint64_t stripe_width = swizzle_cnt;

        const uint64_t blocks_per_stripe = stripe_width * span_axis_len;
        const uint64_t stripe_total = (stripe_axis_len + stripe_width - 1) / stripe_width;
        const uint64_t stripe_idx = in_batch_idx / blocks_per_stripe;
        const uint64_t idx_in_stripe = in_batch_idx % blocks_per_stripe;

        // The final stripe only covers what is left of the striped axis.
        uint64_t width_here = stripe_width;
        if (stripe_idx == stripe_total - 1) {
            width_here = stripe_axis_len - stripe_width * stripe_idx;
        }

        const uint64_t stripe_coord = stripe_idx * stripe_width + idx_in_stripe % width_here;
        uint64_t span_coord = idx_in_stripe / width_here;
        if (stripe_idx % 2 != 0) {
            span_coord = span_axis_len - span_coord - 1;
        }

        tidx.m = stripe_along_m ? stripe_coord : span_coord;
        tidx.n = stripe_along_m ? span_coord : stripe_coord;
    }
}
__aicore__ __force_inline__ void Process()
{
set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID3);
set_flag(PIPE_FIX, PIPE_M, EVENT_ID0);
set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
for (uint64_t loop_idx = core_idx; loop_idx < core_loop; loop_idx += num_core) {
uint64_t batch_idx = loop_idx / tdim.n / tdim.m;
MatCoord tidx{0};
GetBlockIdx(loop_idx, tidx);
uint64_t offset_a = 0, offset_b = 0, offset_a_next = 0, offset_b_next = 0;
uint64_t offset_c = tidx.m * m0 * batch_size * n + batch_idx * n + tidx.n * n0;
uint64_t m_actual = (tidx.m == (tdim.m - 1)) ? (m - tidx.m * m0) : m0;
uint64_t n_actual = (tidx.n == (tdim.n - 1)) ? (n - tidx.n * n0) : n0;
uint64_t m_round = RoundUp<uint64_t, CONST_16UL>(m_actual);
uint64_t n_round = RoundUp<uint64_t, CONST_16UL>(n_actual);
uint64_t mn_max = m_round > n_round ? m_round : n_round;
uint64_t k_part_len = L0_PINGPONG_BUFFER_LEN / mn_max / CONST_16 * CONST_16;
uint64_t shuffle_k = en_shuffle_k ? (core_idx % tdim.k) : 0;
if (TA) {
offset_a = shuffle_k * k0 * m * batch_size + batch_idx * m + tidx.m * m0;
} else {
offset_a = tidx.m * m0 * batch_size * k + batch_idx * k + shuffle_k * k0;
}
if (TB) {
if constexpr (FormatB != DataFormat::NZ) {
offset_b = batch_idx * k * n + tidx.n * n0 * k + shuffle_k * k0;
} else {
offset_b = batch_idx * RoundUp<uint64_t, CONST_16UL>(k) * RoundUp<uint64_t, CONST_16UL>(n) +
tidx.n * n0 * BLOCK_SIZE_16 + shuffle_k * k0 * RoundUp<uint64_t, CONST_16UL>(n);
}
} else {
if constexpr (FormatB != DataFormat::NZ) {
offset_b = batch_idx * k * n + shuffle_k * k0 * n + tidx.n * n0;
} else {
offset_b = batch_idx * RoundUp<uint64_t, CONST_16UL>(k) * RoundUp<uint64_t, CONST_16UL>(n) +
shuffle_k * k0 * BLOCK_SIZE_16 + tidx.n * n0 * RoundUp<uint64_t, CONST_16UL>(k);
}
}
uint64_t k_actual = (shuffle_k == tdim.k - 1) ? k - shuffle_k * k0 : k0;
uint64_t k_round = (k_actual + CONST_16 - 1) / CONST_16 * CONST_16;
LocalTensor l1_buf_a = ping_flag ? l1_base_a : l1_base_a[L1_PINGPONG_BUFFER_LEN];
LocalTensor l1_buf_b = ping_flag ? l1_base_b : l1_base_b[L1_PINGPONG_BUFFER_LEN];
LocalTensor l0a_buf = ping_flag ? l0a_base : l0a_base[L0_PINGPONG_BUFFER_LEN];
LocalTensor l0b_buf = ping_flag ? l0b_base : l0b_base[L0_PINGPONG_BUFFER_LEN];
event_t event_id = ping_flag ? EVENT_ID0 : EVENT_ID1;
if (loop_idx == core_idx) {
WAIT_FLAG(MTE1, MTE2, event_id);
// *** load matrix A to L1
if ((m == 1) || (m_actual == 1 && !TA)) {
CopyGmToCbuf<DataFormat::ND, DataFormat::ND>(l1_buf_a, // dst
gm_a[offset_a], // src
1, // nTileActual
16, // nTileCeil
1, // nVal
k_actual, // kTileActual
k_round, // kTileCeil
k); // dVal
} else {
if (TA) {
CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_a, // dst
gm_a[offset_a], // src
k_actual, // nTileActual
k_round, // nTileCeil
k, // nVal
m_actual, // dTileActual
m_round, // dTileCeil
m * batch_size); // dVal
} else {
CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_a, // dst
gm_a[offset_a], // src
m_actual, // nTileActual
m_round, // nTileCeil
m, // nVal
k_actual, // dTileActual
k_round, // dTileCeil
k * batch_size); // dVal
}
}
SET_FLAG(MTE2, MTE1, event_id);
// *** load matrix B to L1
wait_flag(PIPE_MTE1, PIPE_MTE2, event_id + 2);
if constexpr (FormatB != DataFormat::NZ) {
if (TB) {
CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_b, // dst
gm_b[offset_b], // src
n_actual, // nTileActual
n_round, // nTileCeil
n, // nVal
k_actual, // dTileActual
k_round, // dTileCeil
k); // dVal
} else {
CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_b, // dst
gm_b[offset_b], // src
k_actual, // nTileActual
k_round, // nTileCeil
k, // nVal
n_actual, // dTileActual
n_round, // dTileCeil
n); // dVal
}
} else {
if (TB) {
CopyGmToCbuf<DataFormat::NZ, DataFormat::NZ>(l1_buf_b, // dst
gm_b[offset_b], // src
n_actual, // nTileActual
n_round, // nTileCeil
RoundUp<uint64_t, CONST_16UL>(n), // nVal
k_actual, // dTileActual
k_round, // dTileCeil
RoundUp<uint64_t, CONST_16UL>(k)); // dVal
} else {
CopyGmToCbuf<DataFormat::NZ, DataFormat::NZ>(l1_buf_b, // dst
gm_b[offset_b], // src
k_actual, // nTileActual
k_round, // nTileCeil
RoundUp<uint64_t, CONST_16UL>(k), // nVal
n_actual, // dTileActual
n_round, // dTileCeil
RoundUp<uint64_t, CONST_16UL>(n)); // dVal
}
}
SET_FLAG(MTE2, MTE1, event_id + 2);
}
for (tidx.k = 0; tidx.k < tdim.k; ++tidx.k) {
shuffle_k = en_shuffle_k ? (tidx.k + core_idx) % tdim.k : tidx.k;
uint64_t k_actual = (shuffle_k == (tdim.k - 1)) ? (k - shuffle_k * k0) : k0;
uint64_t k_round = (k_actual + CONST_16 - 1) / CONST_16 * CONST_16;
fdim.k = (k_actual + k_part_len - 1) / k_part_len;
LocalTensor l1_buf_a = ping_flag ? l1_base_a : l1_base_a[L1_PINGPONG_BUFFER_LEN];
LocalTensor l1_buf_b = ping_flag ? l1_base_b : l1_base_b[L1_PINGPONG_BUFFER_LEN];
auto event_id = ping_flag ? EVENT_ID0 : EVENT_ID1;
if (tidx.k < tdim.k - 1) {
uint64_t shuffle_k_next = en_shuffle_k ? (core_idx + tidx.k + 1) % tdim.k : (tidx.k + 1);
if (TA) {
offset_a_next = shuffle_k_next * k0 * m * batch_size + batch_idx * m + tidx.m * m0;
} else {
offset_a_next = tidx.m * m0 * batch_size * k + batch_idx * k + shuffle_k_next * k0;
}
if (TB) {
if constexpr (FormatB != DataFormat::NZ) {
offset_b_next = batch_idx * k * n + tidx.n * n0 * k + shuffle_k_next * k0;
} else {
offset_b_next =
batch_idx * RoundUp<uint64_t, CONST_16UL>(k) * RoundUp<uint64_t, CONST_16UL>(n) +
tidx.n * n0 * BLOCK_SIZE_16 + shuffle_k_next * k0 * RoundUp<uint64_t, CONST_16UL>(n);
}
} else {
if constexpr (FormatB != DataFormat::NZ) {
offset_b_next = batch_idx * k * n + shuffle_k_next * k0 * n + tidx.n * n0;
} else {
offset_b_next =
batch_idx * RoundUp<uint64_t, CONST_16UL>(k) * RoundUp<uint64_t, CONST_16UL>(n) +
shuffle_k_next * k0 * BLOCK_SIZE_16 + tidx.n * n0 * RoundUp<uint64_t, CONST_16UL>(k);
}
}
uint64_t k_actual_next = (shuffle_k_next == (tdim.k - 1)) ? (k - shuffle_k_next * k0) : k0;
uint64_t k_round_next = (k_actual_next + CONST_16 - 1) / CONST_16 * CONST_16;
LocalTensor l1_buf_a_next = (1 - ping_flag) ? l1_base_a : l1_base_a[L1_PINGPONG_BUFFER_LEN];
LocalTensor l1_buf_b_next = (1 - ping_flag) ? l1_base_b : l1_base_b[L1_PINGPONG_BUFFER_LEN];
event_t event_id_next = (1 - ping_flag) ? EVENT_ID0 : EVENT_ID1;
WAIT_FLAG(MTE1, MTE2, event_id_next);
// *** load matrix A to L1
if ((m == 1) || (m_actual == 1 && !TA)) {
CopyGmToCbuf<DataFormat::ND, DataFormat::ND>(l1_buf_a_next, // dst
gm_a[offset_a_next], // src
m_actual, // nTileActual
m_round, // nTileCeil
m, // nVal
k_actual_next, // kTileActual
k_round_next, // kTileCeil
k); // dVal
} else {
if (TA) {
CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_a_next, // dst
gm_a[offset_a_next], // src
k_actual_next, // nTileActual
k_round_next, // nTileCeil
k, // nVal
m_actual, // dTileActual
m_round, // dTileCeil
m * batch_size); // dVal
} else {
CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_a_next, // dst
gm_a[offset_a_next], // src
m_actual, // nTileActual
m_round, // nTileCeil
m, // nVal
k_actual_next, // dTileActual
k_round_next, // dTileCeil
k * batch_size); // dVal
}
}
SET_FLAG(MTE2, MTE1, event_id_next);
// *** load matrix B to L1
wait_flag(PIPE_MTE1, PIPE_MTE2, event_id_next + 2);
if constexpr (FormatB != DataFormat::NZ) {
if (TB) {
CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_b_next, // dst
gm_b[offset_b_next], // src
n_actual, // nTileActual
n_round, // nTileCeil
n, // nVal
k_actual_next, // dTileActual
k_round_next, // dTileCeil
k); // dVal
} else {
CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_b_next, // dst
gm_b[offset_b_next], // src
k_actual_next, // nTileActual
k_round_next, // nTileCeil
k, // nVal
n_actual, // dTileActual
n_round, // dTileCeil
n); // dVal
}
} else {
if (TB) {
CopyGmToCbuf<DataFormat::NZ, DataFormat::NZ>(l1_buf_b_next, // dst
gm_b[offset_b_next], // src
n_actual, // nTileActual
n_round, // nTileCeil
RoundUp<uint64_t, CONST_16UL>(n), // nVal
k_actual_next, // dTileActual
k_round_next, // dTileCeil
RoundUp<uint64_t, CONST_16UL>(k)); // dVal
} else {
CopyGmToCbuf<DataFormat::NZ, DataFormat::NZ>(l1_buf_b_next, // dst
gm_b[offset_b_next], // src
k_actual_next, // nTileActual
k_round_next, // nTileCeil
RoundUp<uint64_t, CONST_16UL>(k), // nVal
n_actual, // dTileActual
n_round, // dTileCeil
RoundUp<uint64_t, CONST_16UL>(n)); // dVal
}
}
SET_FLAG(MTE2, MTE1, event_id_next + 2);
}
if (tidx.k == tdim.k - 1 && loop_idx + num_core < core_loop) {
uint64_t b_idx_next = (loop_idx + num_core) / tdim.n / tdim.m;
MatCoord tidx{0};
GetBlockIdx(loop_idx + num_core, tidx);
uint64_t shuffle_k_next = en_shuffle_k ? (core_idx % tdim.k) : 0;
uint64_t m_actual_next = (tidx.m == (tdim.m - 1)) ? (m - tidx.m * m0) : m0;
uint64_t n_actual_next = (tidx.n == (tdim.n - 1)) ? (n - tidx.n * n0) : n0;
uint64_t m_round_next = (m_actual_next + CONST_16 - 1) / CONST_16 * CONST_16;
uint64_t n_round_next = (n_actual_next + CONST_16 - 1) / CONST_16 * CONST_16;
uint64_t k_actual_next = (shuffle_k_next == (tdim.k - 1)) ? (k - shuffle_k_next * k0) : k0;
uint64_t k_round_next = (k_actual_next + CONST_16 - 1) / CONST_16 * CONST_16;
if (TA) {
offset_a_next = shuffle_k_next * k0 * m * batch_size + b_idx_next * m + tidx.m * m0;
} else {
offset_a_next = tidx.m * m0 * batch_size * k + b_idx_next * k + shuffle_k_next * k0;
}
if (TB) {
if constexpr (FormatB != DataFormat::NZ) {
offset_b_next = b_idx_next * k * n + tidx.n * n0 * k + shuffle_k_next * k0;
} else {
offset_b_next =
b_idx_next * RoundUp<uint64_t, CONST_16UL>(k) * RoundUp<uint64_t, CONST_16UL>(n) +
tidx.n * n0 * BLOCK_SIZE_16 + shuffle_k_next * k0 * RoundUp<uint64_t, CONST_16UL>(n);
}
} else {
if constexpr (FormatB != DataFormat::NZ) {
offset_b_next = b_idx_next * k * n + shuffle_k_next * k0 * n + tidx.n * n0;
} else {
offset_b_next =
b_idx_next * RoundUp<uint64_t, CONST_16UL>(k) * RoundUp<uint64_t, CONST_16UL>(n) +
shuffle_k_next * k0 * BLOCK_SIZE_16 + tidx.n * n0 * RoundUp<uint64_t, CONST_16UL>(k);
}
}
LocalTensor l1_buf_a_next = (1 - ping_flag) ? l1_base_a : l1_base_a[L1_PINGPONG_BUFFER_LEN];
LocalTensor l1_buf_b_next = (1 - ping_flag) ? l1_base_b : l1_base_b[L1_PINGPONG_BUFFER_LEN];
event_t event_id_next = (1 - ping_flag) ? EVENT_ID0 : EVENT_ID1;
WAIT_FLAG(MTE1, MTE2, event_id_next);
// *** load matrix A to L1
if (m == 1 || m_actual_next == 1 && !TA) {
CopyGmToCbuf<DataFormat::ND, DataFormat::ND>(l1_buf_a_next, // dst
gm_a[offset_a_next], // src
m_actual_next, // nTileActual
m_round_next, // nTileCeil
m, // nVal
k_actual_next, // kTileActual
k_round_next, // kTileCeil
k); // dVal
} else {
if (TA) {
CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_a_next, // dst
gm_a[offset_a_next], // src
k_actual_next, // nTileActual
k_round_next, // nTileCeil
k, // nVal
m_actual_next, // dTileActual
m_round_next, // dTileCeil
m * batch_size); // dVal
} else {
CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_a_next, // dst
gm_a[offset_a_next], // src
m_actual_next, // nTileActual
m_round_next, // nTileCeil
m, // nVal
k_actual_next, // dTileActual
k_round_next, // dTileCeil
k * batch_size); // dVal
}
}
SET_FLAG(MTE2, MTE1, event_id_next);
// *** load matrix B to L1
wait_flag(PIPE_MTE1, PIPE_MTE2, event_id_next + 2);
if constexpr (FormatB != DataFormat::NZ) {
if (TB) {
CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_b_next, // dst
gm_b[offset_b_next], // src
n_actual_next, // nTileActual
n_round_next, // nTileCeil
n, // nVal
k_actual_next, // dTileActual
k_round_next, // dTileCeil
k); // dVal
} else {
CopyGmToCbuf<DataFormat::ND, DataFormat::NZ>(l1_buf_b_next, // dst
gm_b[offset_b_next], // src
k_actual_next, // nTileActual
k_round_next, // nTileCeil
k, // nVal
n_actual_next, // dTileActual
n_round_next, // dTileCeil
n); // dVal
}
} else {
if (TB) {
CopyGmToCbuf<DataFormat::NZ, DataFormat::NZ>(l1_buf_b_next, // dst
gm_b[offset_b_next], // src
n_actual_next, // nTileActual
n_round_next, // nTileCeil
RoundUp<uint64_t, CONST_16UL>(n), // nVal
k_actual_next, // dTileActual
k_round_next, // dTileCeil
RoundUp<uint64_t, CONST_16UL>(k)); // dVal
} else {
CopyGmToCbuf<DataFormat::NZ, DataFormat::NZ>(l1_buf_b_next, // dst
gm_b[offset_b_next], // src
k_actual_next, // nTileActual
k_round_next, // nTileCeil
RoundUp<uint64_t, CONST_16UL>(k), // nVal
n_actual_next, // dTileActual
n_round_next, // dTileCeil
RoundUp<uint64_t, CONST_16UL>(n)); // dVal
}
}
SET_FLAG(MTE2, MTE1, event_id_next + 2);
}
MatCoord fidx{0};
for (fidx.k = 0; fidx.k < fdim.k; ++fidx.k) {
uint32_t k0_round = (fidx.k < fdim.k - 1) ? k_part_len : k_round - fidx.k * k_part_len;
uint32_t k0_actual = (fidx.k < fdim.k - 1) ? k_part_len : k_actual - fidx.k * k_part_len;
auto mte1_mad_ping_flag = 1 - fidx.k % 2;
auto mte1_mad_event_id = mte1_mad_ping_flag ? EVENT_ID0 : EVENT_ID1;
auto l0a_buf = l0a_base[(fidx.k % 2) * L0_PINGPONG_BUFFER_LEN];
auto l0b_buf = l0b_base[(fidx.k % 2) * L0_PINGPONG_BUFFER_LEN];
// *** load matrix A from L1 to L0A
if (fidx.k == 0) {
WAIT_FLAG(MTE2, MTE1, event_id);
}
WAIT_FLAG(M, MTE1, mte1_mad_event_id);
if ((m == 1) || (m_actual == 1 && !TA)) {
l1_to_l0_a<ArchType::ASCEND_V220, InDtype, false, DataFormat::VECTOR, DataFormat::VECTOR>(
l0a_buf, // dst
l1_buf_a[fidx.k * k_part_len], // src
0, // mTileCeil
CeilDiv<CONST_256>(k0_round), // kPartCeil
0, // mSrcStride
1, // kSrcStride
0, // mDstStride
0); // kDstStride
} else {
if (TA) {
LoadCbufToCa(l0a_buf, // l0Tensor
l1_buf_a[fidx.k * k_part_len * CONST_16], // l1Tensor
m_round, // mTileCeil
k0_round, // kPartCeil
k_round / CONST_16, // mSrcStride
1, // kSrcStride
k0_round / CONST_16, // mDstStride
1); // kDstStride
} else {
LoadCbufToCa(l0a_buf, // l0Tensor
l1_buf_a[fidx.k * k_part_len * m_round], // l1Tensor
m_round, // mTileCeil
k0_round, // kPartCeil
1, // mSrcStride
m_round / CONST_16, // kSrcStride
k0_round / CONST_16, // mDstStride
1); // kDstStride
}
}
if (fidx.k == fdim.k - 1) {
SET_FLAG(MTE1, MTE2, event_id);
}
// *** load matrix B from L1 to L0B
if (fidx.k == 0) {
WAIT_FLAG(MTE2, MTE1, event_id + 2);
}
if (TB) {
LoadCbufToCb(l0b_buf, // l0Tensor
l1_buf_b[fidx.k * k_part_len * n_round], // l1Tensor
n_round, // nTileCeil
k0_round, // kPartCeil
1, // nSrcStride
n_round / CONST_16, // kSrcStride
1, // nDstStride
k0_round / CONST_16); // kDstStride
} else {
LoadCbufToCb(l0b_buf, // l0Tensor
l1_buf_b[fidx.k * k_part_len * CONST_16], // l1Tensor
n_round, // nTileCeil
k0_round, // kPartCeil
k_round / CONST_16, // nSrcStride
1, // kSrcStride
1, // nDstStride
n_round / CONST_16); // kDstStride
}
if (fidx.k == fdim.k - 1) {
SET_FLAG(MTE1, MTE2, event_id + 2);
}
SET_FLAG(MTE1, M, mte1_mad_event_id);
WAIT_FLAG(MTE1, M, mte1_mad_event_id);
bool init_c = (tidx.k == 0 && fidx.k == 0);
if (init_c) {
WAIT_FLAG(FIX, M, EVENT_ID0);
}
if (m != 1 && m_actual == 1 && TA) {
Mad(l0c_buf, // c
l0a_buf, // a
l0b_buf, // b
CONST_16, // mTileActual
n_actual, // nTileActual
k0_actual, // kTileActual
init_c); // initC
} else {
Mad(l0c_buf, // c
l0a_buf, // a
l0b_buf, // b
m_actual, // mTileActual
n_actual, // nTileActual
k0_actual, // kTileActual
init_c); // initC
}
PIPE_BARRIER(M);
SET_FLAG(M, MTE1, mte1_mad_event_id);
}
ping_flag = 1 - ping_flag;
}
SET_FLAG(M, FIX, EVENT_ID0);
WAIT_FLAG(M, FIX, EVENT_ID0);
// copy from L0C to gm
CopyCcToGm(gm_c[offset_c], // dst
l0c_buf, // src
m_actual, // mTileActual
n_actual, // nTileActual
m_round, // mTileCeil
n * batch_size); // nActual
SET_FLAG(FIX, M, EVENT_ID0);
}
WAIT_FLAG(M, MTE1, EVENT_ID0);
WAIT_FLAG(M, MTE1, EVENT_ID1);
WAIT_FLAG(MTE1, MTE2, EVENT_ID0);
WAIT_FLAG(MTE1, MTE2, EVENT_ID1);
WAIT_FLAG(MTE1, MTE2, EVENT_ID2);
WAIT_FLAG(MTE1, MTE2, EVENT_ID3);
WAIT_FLAG(FIX, M, EVENT_ID0);
PIPE_BARRIER(ALL);
}
private:
AscendC::GlobalTensor<InDtype> gm_a;
AscendC::GlobalTensor<InDtype> gm_b;
AscendC::GlobalTensor<OutDtype> gm_c;
AscendC::LocalTensor<InDtype> l1_base_a;
AscendC::LocalTensor<InDtype> l1_base_b;
AscendC::LocalTensor<InDtype> l0a_base;
AscendC::LocalTensor<InDtype> l0b_base;
AscendC::LocalTensor<float> l0c_buf;
uint32_t num_core{0};
uint32_t batch_size{0};
uint32_t m{0};
uint32_t k{0};
uint32_t n{0};
uint32_t m0{0};
uint32_t k0{0};
uint32_t n0{0};
MatCoord tdim{0};
MatCoord fdim{0};
uint32_t core_loop{0};
uint32_t swizzle_cnt{1};
uint32_t core_idx{0};
uint32_t en_shuffle_k{0};
uint32_t ping_flag{0};
};
// Kernel entry point for the batched matmul ("einsum") operator.
//
// One PpMatmulEinSum instance is declared for every supported combination of
// swizzle direction, B transposition, element type (half / __bf16) and B data
// format (ND / NZ). The tiling key stored in gm_tiling_data selects one of
// them; that instance is initialised with the four GM addresses and run.
//
// Key layout after the two low bits are shifted away (MSB first), as implied
// by the per-instance comments and the case labels below:
//   swizzleDir[1] transA[1] transB[1] DtypeA[3] DtypeB[3] DtypeC[3]
//   DataFormatA[1] DataFormatB[1]
// Every pair of case labels differs only in the transA bit, so both transA
// values are routed to the same instance.
// NOTE(review): the template arguments presumably are
// <swizzleDir, transA, transB, input dtype, output dtype, B format>; the class
// header is not visible from here - confirm.
//
// An unrecognised key hits `default` and the kernel returns without doing any
// work.
extern "C" __global__ __aicore__ void batch_matmul_transpose(GM_ADDR gm_a, GM_ADDR gm_b, GM_ADDR gm_c,
GM_ADDR gm_tiling_data)
{
KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIC_ONLY);
// --- fp16, B in ND format ---
PpMatmulEinSum<0, false, false, half, half, DataFormat::ND>
einsum_0_n_fp16_nd; // swizzleDir[0] transA[0] transB[0] DtypeA[001] DtypeB[001] DtypeC[001] DataFormatA[0]
// DataFormatB[0]
PpMatmulEinSum<1, false, false, half, half, DataFormat::ND>
einsum_1_n_fp16_nd; // swizzleDir[1] transA[0] transB[0] DtypeA[001] DtypeB[001] DtypeC[001] DataFormatA[0]
// DataFormatB[0]
PpMatmulEinSum<0, false, true, half, half, DataFormat::ND>
einsum_0_t_fp16_nd; // swizzleDir[0] transA[0] transB[1] DtypeA[001] DtypeB[001] DtypeC[001] DataFormatA[0]
// DataFormatB[0]
PpMatmulEinSum<1, false, true, half, half, DataFormat::ND>
einsum_1_t_fp16_nd; // swizzleDir[1] transA[0] transB[1] DtypeA[001] DtypeB[001] DtypeC[001] DataFormatA[0]
// DataFormatB[0]
// --- bf16, B in ND format ---
PpMatmulEinSum<0, false, false, __bf16, __bf16, DataFormat::ND>
einsum_0_n_bf16_nd; // swizzleDir[0] transA[0] transB[0] DtypeA[010] DtypeB[010] DtypeC[010] DataFormatA[0]
// DataFormatB[0]
PpMatmulEinSum<1, false, false, __bf16, __bf16, DataFormat::ND>
einsum_1_n_bf16_nd; // swizzleDir[1] transA[0] transB[0] DtypeA[010] DtypeB[010] DtypeC[010] DataFormatA[0]
// DataFormatB[0]
PpMatmulEinSum<0, false, true, __bf16, __bf16, DataFormat::ND>
einsum_0_t_bf16_nd; // swizzleDir[0] transA[0] transB[1] DtypeA[010] DtypeB[010] DtypeC[010] DataFormatA[0]
// DataFormatB[0]
PpMatmulEinSum<1, false, true, __bf16, __bf16, DataFormat::ND>
einsum_1_t_bf16_nd; // swizzleDir[1] transA[0] transB[1] DtypeA[010] DtypeB[010] DtypeC[010] DataFormatA[0]
// DataFormatB[0]
// --- fp16, B in NZ format ---
PpMatmulEinSum<0, false, false, half, half, DataFormat::NZ>
einsum_0_n_fp16_nz; // swizzleDir[0] transA[0] transB[0] DtypeA[001] DtypeB[001] DtypeC[001] DataFormatA[0]
// DataFormatB[1]
PpMatmulEinSum<1, false, false, half, half, DataFormat::NZ>
einsum_1_n_fp16_nz; // swizzleDir[1] transA[0] transB[0] DtypeA[001] DtypeB[001] DtypeC[001] DataFormatA[0]
// DataFormatB[1]
PpMatmulEinSum<0, false, true, half, half, DataFormat::NZ>
einsum_0_t_fp16_nz; // swizzleDir[0] transA[0] transB[1] DtypeA[001] DtypeB[001] DtypeC[001] DataFormatA[0]
// DataFormatB[1]
PpMatmulEinSum<1, false, true, half, half, DataFormat::NZ>
einsum_1_t_fp16_nz; // swizzleDir[1] transA[0] transB[1] DtypeA[001] DtypeB[001] DtypeC[001] DataFormatA[0]
// DataFormatB[1]
// --- bf16, B in NZ format ---
PpMatmulEinSum<0, false, false, __bf16, __bf16, DataFormat::NZ>
einsum_0_n_bf16_nz; // swizzleDir[0] transA[0] transB[0] DtypeA[010] DtypeB[010] DtypeC[010] DataFormatA[0]
// DataFormatB[1]
PpMatmulEinSum<1, false, false, __bf16, __bf16, DataFormat::NZ>
einsum_1_n_bf16_nz; // swizzleDir[1] transA[0] transB[0] DtypeA[010] DtypeB[010] DtypeC[010] DataFormatA[0]
// DataFormatB[1]
PpMatmulEinSum<0, false, true, __bf16, __bf16, DataFormat::NZ>
einsum_0_t_bf16_nz; // swizzleDir[0] transA[0] transB[1] DtypeA[010] DtypeB[010] DtypeC[010] DataFormatA[0]
// DataFormatB[1]
PpMatmulEinSum<1, false, true, __bf16, __bf16, DataFormat::NZ>
einsum_1_t_bf16_nz; // swizzleDir[1] transA[0] transB[1] DtypeA[010] DtypeB[010] DtypeC[010] DataFormatA[0]
// DataFormatB[1]
// One-time hardware configuration performed before any instance runs.
SetPadding<uint64_t>((uint64_t)0);
SetNdpara(1, 0, 0);
SetAtomicnone();
// get tiling args
auto tiling_data = reinterpret_cast<__gm__ pp_matmul::PpMatmulTilingData *>(gm_tiling_data);
// The two least significant bits of the tiling key are not used for dispatch.
uint32_t masked_key = tiling_data->tilingKey >> 2;
switch (masked_key) {
// fp16 / ND
case 0b00000100100100:
case 0b01000100100100:
einsum_0_n_fp16_nd.Init(gm_a, gm_b, gm_c, gm_tiling_data);
einsum_0_n_fp16_nd.Process();
break;
case 0b00100100100100:
case 0b01100100100100:
einsum_0_t_fp16_nd.Init(gm_a, gm_b, gm_c, gm_tiling_data);
einsum_0_t_fp16_nd.Process();
break;
case 0b10000100100100:
case 0b11000100100100:
einsum_1_n_fp16_nd.Init(gm_a, gm_b, gm_c, gm_tiling_data);
einsum_1_n_fp16_nd.Process();
break;
case 0b10100100100100:
case 0b11100100100100:
einsum_1_t_fp16_nd.Init(gm_a, gm_b, gm_c, gm_tiling_data);
einsum_1_t_fp16_nd.Process();
break;
// bf16 / ND
case 0b00001001001000:
case 0b01001001001000:
einsum_0_n_bf16_nd.Init(gm_a, gm_b, gm_c, gm_tiling_data);
einsum_0_n_bf16_nd.Process();
break;
case 0b00101001001000:
case 0b01101001001000:
einsum_0_t_bf16_nd.Init(gm_a, gm_b, gm_c, gm_tiling_data);
einsum_0_t_bf16_nd.Process();
break;
case 0b10001001001000:
case 0b11001001001000:
einsum_1_n_bf16_nd.Init(gm_a, gm_b, gm_c, gm_tiling_data);
einsum_1_n_bf16_nd.Process();
break;
case 0b10101001001000:
case 0b11101001001000:
einsum_1_t_bf16_nd.Init(gm_a, gm_b, gm_c, gm_tiling_data);
einsum_1_t_bf16_nd.Process();
break;
// fp16 / NZ
case 0b00000100100101:
case 0b01000100100101:
einsum_0_n_fp16_nz.Init(gm_a, gm_b, gm_c, gm_tiling_data);
einsum_0_n_fp16_nz.Process();
break;
case 0b00100100100101:
case 0b01100100100101:
einsum_0_t_fp16_nz.Init(gm_a, gm_b, gm_c, gm_tiling_data);
einsum_0_t_fp16_nz.Process();
break;
case 0b10000100100101:
case 0b11000100100101:
einsum_1_n_fp16_nz.Init(gm_a, gm_b, gm_c, gm_tiling_data);
einsum_1_n_fp16_nz.Process();
break;
case 0b10100100100101:
case 0b11100100100101:
einsum_1_t_fp16_nz.Init(gm_a, gm_b, gm_c, gm_tiling_data);
einsum_1_t_fp16_nz.Process();
break;
// bf16 / NZ
case 0b00001001001001:
case 0b01001001001001:
einsum_0_n_bf16_nz.Init(gm_a, gm_b, gm_c, gm_tiling_data);
einsum_0_n_bf16_nz.Process();
break;
case 0b00101001001001:
case 0b01101001001001:
einsum_0_t_bf16_nz.Init(gm_a, gm_b, gm_c, gm_tiling_data);
einsum_0_t_bf16_nz.Process();
break;
case 0b10001001001001:
case 0b11001001001001:
einsum_1_n_bf16_nz.Init(gm_a, gm_b, gm_c, gm_tiling_data);
einsum_1_n_bf16_nz.Process();
break;
case 0b10101001001001:
case 0b11101001001001:
einsum_1_t_bf16_nz.Init(gm_a, gm_b, gm_c, gm_tiling_data);
einsum_1_t_bf16_nz.Process();
break;
// Unknown key: no instance is run.
default:
break;
}
}
namespace vllm_ascend {
// Host-side launcher for the batch_matmul_transpose kernel.
//
// stream          - stream the kernel launch is issued on.
// gm_a/gm_b/gm_c  - device addresses passed through to the kernel unchanged.
// gm_tiling_data  - device address of the tiling data read by the kernel.
// block_dim       - first launch parameter (number of blocks to launch).
//
// NOTE(review): the second launch parameter is always nullptr here; its
// meaning is defined by the AscendC launch syntax - confirm before changing.
extern void batch_matmul_transpose_impl(
void* stream,
void* gm_a,
void* gm_b,
void* gm_c,
void* gm_tiling_data,
const uint32_t block_dim)
{
batch_matmul_transpose<<<block_dim, nullptr, stream>>>(
gm_a,
gm_b,
gm_c,
gm_tiling_data);
}
}

View File

@@ -15,6 +15,11 @@
*/
#include <iostream>
#include <stdexcept>
#include <string>
#include <atomic>
#include "idle_offload/shm_worker.h"
extern "C" {
@@ -24,6 +29,13 @@ extern "C" {
#include <sys/types.h>
#include "acl/acl.h"
// idle offload
static std::atomic<bool> g_initialized(false);
static void *g_d_mem = nullptr;
static size_t g_size = 0;
static std::atomic_uint_fast64_t g_allocated_offset(0);
ShmWorker *shm_worker = nullptr;
// Global references to Python callables
// NOTE: this is borrowed reference, so we don't need to DECREF them.
// This brings the limitation that the allocator needs to be singleton.
@@ -49,7 +61,7 @@ void create_and_map(unsigned long long device, ssize_t size, void* d_mem,
ensure_context(device);
// Define memory allocation properties
aclrtPhysicalMemProp prop = {};
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE ;
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
prop.memAttr = ACL_HBM_MEM_HUGE;
prop.location.id = device;
@@ -59,15 +71,21 @@ void create_and_map(unsigned long long device, ssize_t size, void* d_mem,
// Allocate memory using aclrtMallocPhysical
aclError error_code = aclrtMallocPhysical(p_memHandle, size, &prop, 0);
if (error_code != 0) {
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
<< __LINE__ << std::endl;
return;
if (error_code == ACL_ERROR_RT_MEMORY_ALLOCATION) {
throw std::runtime_error("aclrtMallocPhysical failed with acl error code: " +
std::to_string(error_code) + "(OOM: Out of Memory, allocation failed) " +
__FILE__ + ":" + std::to_string(__LINE__));
} else {
throw std::runtime_error("aclrtMallocPhysical failed with acl error code: " +
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
}
}
// Map memory
error_code = aclrtMapMem(d_mem, size, 0, *p_memHandle, 0);
if (error_code != 0) {
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
<< __LINE__ << std::endl;
return;
throw std::runtime_error("aclrtMapMem failed with acl error code: " +
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
}
}
@@ -79,15 +97,13 @@ void unmap_and_release(unsigned long long device, ssize_t size,
ensure_context(device);
aclError error_code = aclrtUnmapMem(d_mem);
if (error_code != 0) {
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
<< __LINE__ << std::endl;
return;
throw std::runtime_error("aclrtUnmapMem failed with acl error code: " +
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
}
error_code = aclrtFreePhysical(*p_memHandle);
if (error_code != 0) {
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
<< __LINE__ << std::endl;
return;
throw std::runtime_error("aclrtFreePhysical failed with acl error code: " +
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
}
}
@@ -139,25 +155,29 @@ __attribute__ ((visibility("default"))) void* my_malloc(ssize_t size, int device
ACL_RT_MEM_ALLOC_GRANULARITY_MINIMUM,
&granularity);
if (error_code != 0) {
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
<< __LINE__ << std::endl;
return nullptr;
throw std::runtime_error("aclrtMemGetAllocationGranularity failed with acl error code: " +
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
}
size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;
void *d_mem;
error_code = aclrtReserveMemAddress(&d_mem, alignedSize, 0, nullptr, 0);
if (error_code != 0) {
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
<< __LINE__ << std::endl;
return nullptr;
if (error_code == ACL_ERROR_RT_MEMORY_ALLOCATION) {
throw std::runtime_error("aclrtReserveMemAddress failed with acl error code: " +
std::to_string(error_code) + "(OOM: Out of Memory, allocation failed) " +
__FILE__ + ":" + std::to_string(__LINE__));
} else {
throw std::runtime_error("aclrtReserveMemAddress failed with acl error code: " +
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
}
}
// allocate the aclrtDrvMemHandle
aclrtDrvMemHandle* p_memHandle =
(aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle));
if (!g_python_malloc_callback) {
std::cerr << "ERROR: g_python_malloc_callback not set.\n";
return nullptr;
throw std::runtime_error("my_malloc ERROR: g_python_malloc_callback not set." +
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
}
// Acquire GIL (not in stable ABI officially, but often works)
@@ -189,8 +209,8 @@ __attribute__ ((visibility("default"))) void* my_malloc(ssize_t size, int device
__attribute__ ((visibility("default"))) void my_free(void* ptr, ssize_t size, int device, aclrtStream stream) {
// get memory handle from the pointer
if (!g_python_free_callback) {
std::cerr << "ERROR: g_python_free_callback not set.\n";
return;
throw std::runtime_error("aclrtDrvMemHandle ERROR: g_python_malloc_callback not set." +
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
}
// Acquire GIL (not in stable ABI officially, but often works)
@@ -232,13 +252,150 @@ __attribute__ ((visibility("default"))) void my_free(void* ptr, ssize_t size, in
// free address and the handle
aclError error_code = aclrtReleaseMemAddress(d_mem);
if (error_code != 0) {
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
<< __LINE__ << std::endl;
return;
throw std::runtime_error("aclrtReleaseMemAddress failed with acl error code: " +
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
}
free(p_memHandle);
}
__attribute__((visibility("default"))) void *
my_malloc_offload(ssize_t size, int device, aclrtStream stream) {
ensure_context(device);
// first allocation, align the size, and reserve an address, and also allocate
// a aclrtDrvMemHandle
// Define memory allocation properties
aclrtPhysicalMemProp prop = {};
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE ;
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
prop.memAttr = ACL_HBM_MEM_HUGE;
prop.location.id = device;
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
prop.reserve = 0;
// Check if the allocation is supported
size_t granularity;
aclError error_code = aclrtMemGetAllocationGranularity(&prop,
ACL_RT_MEM_ALLOC_GRANULARITY_MINIMUM,
&granularity);
if (error_code != 0) {
throw std::runtime_error("aclrtMemGetAllocationGranularity failed with acl error code: " +
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
}
size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;
void *d_mem;
// error_code = aclrtReserveMemAddress(&d_mem, alignedSize, 0, nullptr, 0);
// if (error_code != 0) {
// if (error_code == ACL_ERROR_RT_MEMORY_ALLOCATION) {
// throw std::runtime_error("aclrtReserveMemAddress failed with acl error code: " +
// std::to_string(error_code) + "(OOM: Out of Memory, allocation failed) " +
// __FILE__ + ":" + std::to_string(__LINE__));
// } else {
// throw std::runtime_error("aclrtReserveMemAddress failed with acl error code: " +
// std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
// }
// }
// allocate from the reserved pool
size_t alloc_offset = g_allocated_offset.fetch_add(alignedSize);
if (alloc_offset + alignedSize > g_size) {
throw std::runtime_error(
"my_malloc ERROR: Out of memory in the reserved pool." +
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
}
d_mem = (void *)((char *)g_d_mem + alloc_offset);
// allocate the aclrtDrvMemHandle
aclrtDrvMemHandle* p_memHandle =
(aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle));
if (!g_python_malloc_callback) {
throw std::runtime_error("my_malloc ERROR: g_python_malloc_callback not set." +
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
}
// Acquire GIL (not in stable ABI officially, but often works)
PyGILState_STATE gstate = PyGILState_Ensure();
PyObject* arg_tuple = create_tuple_from_c_integers(
(unsigned long long)device, (unsigned long long)alignedSize,
(unsigned long long)d_mem, (unsigned long long)p_memHandle);
// Call g_python_malloc_callback
PyObject* py_result =
PyObject_CallFunctionObjArgs(g_python_malloc_callback, arg_tuple, NULL);
Py_DECREF(arg_tuple);
if (!py_result) {
PyErr_Print();
PyGILState_Release(gstate);
return nullptr;
}
PyGILState_Release(gstate);
// // do the final mapping
// create_and_map(device, alignedSize, d_mem, p_memHandle);
return (void*)d_mem;
}
__attribute__((visibility("default"))) void
my_free_offload(void *ptr, ssize_t size, int device, aclrtStream stream) {
// get memory handle from the pointer
if (!g_python_free_callback) {
throw std::runtime_error("aclrtDrvMemHandle ERROR: g_python_malloc_callback not set." +
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
}
// Acquire GIL (not in stable ABI officially, but often works)
PyGILState_STATE gstate = PyGILState_Ensure();
PyObject* py_ptr =
PyLong_FromUnsignedLongLong(reinterpret_cast<unsigned long long>(ptr));
PyObject* py_result =
PyObject_CallFunctionObjArgs(g_python_free_callback, py_ptr, NULL);
if (!py_result || !PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) {
PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
return;
}
unsigned long long recv_device, recv_size;
unsigned long long recv_d_mem, recv_p_memHandle;
// Unpack the tuple into four C integers
if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size,
&recv_d_mem, &recv_p_memHandle)) {
// PyArg_ParseTuple sets an error if it fails
return;
}
PyGILState_Release(gstate);
// recv_size == size
// recv_device == device
// Free memory
// nothing to do
// void *d_mem = (void*)recv_d_mem;
// // allocate the aclrtDrvMemHandle
// aclrtDrvMemHandle* p_memHandle =
// (aclrtDrvMemHandle*)recv_p_memHandle;
// unmap_and_release(device, size, d_mem, p_memHandle);
// // free address and the handle
// aclError error_code = aclrtReleaseMemAddress(d_mem);
// if (error_code != 0) {
// throw std::runtime_error("aclrtReleaseMemAddress failed with acl error code: " +
// std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
// }
// free(p_memHandle);
}
// ---------------------------------------------------------------------------
// Python extension boilerplate:
@@ -313,6 +470,116 @@ static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
Py_RETURN_NONE;
}
// Python: init_module_offload(malloc_callback, free_callback) -> None
//
// Stores the two Python callables used by my_malloc_offload /
// my_free_offload and, on the first call only, sets up the shared memory
// pool: creates the ShmWorker, registers this process with it, imports the
// shareable memory handle it returns, reserves a virtual address range of the
// reported size and maps the imported memory into it.
//
// The callables are borrowed references; the caller must keep them alive.
//
// Errors are reported as Python exceptions (TypeError for bad arguments,
// RuntimeError for any failure during pool set-up). C++ exceptions must not
// propagate out of this function because its caller is the CPython
// interpreter, which is C code.
//
// g_d_mem / g_size are published only after the mapping succeeded. The
// initialised flag stays set after a failed set-up, so a later call does not
// retry; the pool then has size 0 and every my_malloc_offload call fails
// with an out-of-pool error.
static PyObject* py_init_module_offload(PyObject* self, PyObject* args) {
  PyObject* malloc_callback = nullptr;
  PyObject* free_callback = nullptr;

  if (!PyArg_ParseTuple(args, "OO", &malloc_callback, &free_callback)) {
    return nullptr;
  }

  if (!PyCallable_Check(malloc_callback) || !PyCallable_Check(free_callback)) {
    PyErr_SetString(PyExc_TypeError, "Both arguments must be callables");
    return nullptr;
  }

  // Save the Python callables
  // This module does not handle GC of these objects, so they must be kept alive
  // outside of this module.
  g_python_malloc_callback = malloc_callback;
  g_python_free_callback = free_callback;

  // exchange() tests and sets the flag in one atomic step.
  if (g_initialized.exchange(true)) {
    printf("Module already initialized.\n");
    Py_RETURN_NONE;
  }

  try {
    shm_worker = new ShmWorker();

    // get pid
    aclError error_code;
    int32_t pid;
    error_code = aclrtDeviceGetBareTgid(&pid);
    if (error_code != 0) {
      throw std::runtime_error(
          "aclrtDeviceGetBareTgid failed with acl error code: " +
          std::to_string(error_code) + " " + __FILE__ + ":" +
          std::to_string(__LINE__));
    }

    uint64_t shareable_handle;
    size_t pool_size = 0;
    shm_worker->register_worker(pid, &shareable_handle, &pool_size);

    // import shareable handle
    // NOTE(review): the device id is hard-coded to 0 here - confirm this is
    // intended for processes that run on another device.
    uint32_t device = 0;
    aclrtDrvMemHandle memHandle;
    error_code =
        aclrtMemImportFromShareableHandle(shareable_handle, device, &memHandle);
    if (error_code != 0) {
      throw std::runtime_error(
          "aclrtMemImportFromShareableHandle failed with acl error code: " +
          std::to_string(error_code) + " " + __FILE__ + ":" +
          std::to_string(__LINE__));
    }

    // reserve virtual address
    void *pool_base = nullptr;
    error_code = aclrtReserveMemAddress(&pool_base, pool_size, 0, nullptr, 0);
    if (error_code != 0) {
      throw std::runtime_error(
          "aclrtReserveMemAddress failed with acl error code: " +
          std::to_string(error_code) + " " + __FILE__ + ":" +
          std::to_string(__LINE__));
    }

    // map
    error_code = aclrtMapMem(pool_base, pool_size, 0, memHandle, 0);
    if (error_code != 0) {
      // Best effort: give the unused address range back before reporting.
      aclrtReleaseMemAddress(pool_base);
      throw std::runtime_error("aclrtMapMem failed with acl error code: " +
                               std::to_string(error_code) + " " + __FILE__ + ":" +
                               std::to_string(__LINE__));
    }

    // Publish the pool only now that it is fully usable.
    g_d_mem = pool_base;
    g_size = pool_size;
  } catch (const std::exception& e) {
    PyErr_SetString(PyExc_RuntimeError, e.what());
    return nullptr;
  }

  Py_RETURN_NONE;
}
// Python-callable stub for the offload mode: performs no unmapping or
// releasing, ignores its arguments and returns None.
static PyObject *python_unmap_and_release_offload(PyObject *self,
PyObject *args) {
// nothing to do
Py_RETURN_NONE;
}
// Python-callable stub for the offload mode: performs no allocation or
// mapping, ignores its arguments and returns None.
static PyObject *python_create_and_map_offload(PyObject *self, PyObject *args) {
// nothing to do
Py_RETURN_NONE;
}
// Python: python_get_mem_info_offload() -> (free_bytes, total_bytes)
//
// Reports how much of the reserved pool is still unallocated. free_bytes is
// g_size minus the current bump offset, clamped at 0; total_bytes is g_size.
// Returns NULL with a Python exception set if the result tuple cannot be
// built.
static PyObject* python_get_mem_info_offload(PyObject* self, PyObject* args) {
  const size_t allocated_bytes = g_allocated_offset.load();
  const size_t free_mem =
      (allocated_bytes >= g_size) ? 0 : g_size - allocated_bytes;

  // Py_BuildValue creates the tuple and both integers in one step and returns
  // NULL on any allocation failure, so a partially filled tuple can never be
  // handed back to the interpreter.
  return Py_BuildValue("KK", static_cast<unsigned long long>(free_mem),
                       static_cast<unsigned long long>(g_size));
}
// Python: python_lock_gpu_offload() -> bool
//
// Calls ShmWorker::lock_gpu() on the worker created by init_module_offload
// and returns its result as a Python bool.
// Raises RuntimeError when init_module_offload has not been called yet, since
// there is no worker to lock through.
static PyObject* python_lock_gpu_offload(PyObject* self, PyObject* args) {
  if (!shm_worker) {
    PyErr_SetString(PyExc_RuntimeError,
                    "python_lock_gpu_offload called before init_module_offload");
    return nullptr;
  }
  bool prev_is_self = shm_worker->lock_gpu();
  return PyBool_FromLong(prev_is_self);
}
// Python: python_unlock_gpu_offload() -> None
//
// Calls ShmWorker::unlock_gpu() on the worker created by init_module_offload.
// Raises RuntimeError when init_module_offload has not been called yet, since
// there is no worker to unlock through.
static PyObject* python_unlock_gpu_offload(PyObject* self, PyObject* args) {
  if (!shm_worker) {
    PyErr_SetString(PyExc_RuntimeError,
                    "python_unlock_gpu_offload called before init_module_offload");
    return nullptr;
  }
  shm_worker->unlock_gpu();
  Py_RETURN_NONE;
}
static PyMethodDef module_methods[] = {
{"init_module", (PyCFunction)py_init_module, METH_VARARGS,
"Initialize module with python_malloc and python_free callables."},
@@ -320,7 +587,21 @@ static PyMethodDef module_methods[] = {
"Create and map memory on the device."},
{"python_unmap_and_release", (PyCFunction)python_unmap_and_release,
METH_VARARGS, "Unmap and release memory on the device."},
{NULL, NULL, 0, NULL} // sentinel
{"init_module_offload", (PyCFunction)py_init_module_offload, METH_VARARGS,
"Initialize module with python_malloc and python_free callables."},
{"python_create_and_map_offload",
(PyCFunction)python_create_and_map_offload, METH_VARARGS,
"Create and map memory on the device."},
{"python_unmap_and_release_offload",
(PyCFunction)python_unmap_and_release_offload, METH_VARARGS,
"Unmap and release memory on the device."},
{"python_get_mem_info_offload", (PyCFunction)python_get_mem_info_offload,
METH_NOARGS, "Get mem info in the reserved pool."},
{"python_lock_gpu_offload", (PyCFunction)python_lock_gpu_offload,
METH_NOARGS, "Lock GPU."},
{"python_unlock_gpu_offload", (PyCFunction)python_unlock_gpu_offload,
METH_NOARGS, "Unlock GPU."},
{NULL, NULL, 0, NULL} // sentinel
};
static struct PyModuleDef camem_allocator_module = {

1
csrc/idle_offload/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
vllm_vnpu_daemon

View File

@@ -0,0 +1,30 @@
# Builds and installs the vllm_vnpu_daemon binary.
#
# Overridable from the environment or the command line:
#   CXX          C++ compiler (make's default is g++)
#   ASCEND_HOME  root of the Ascend toolkit installation
#   PREFIX       installation prefix
#   BINDIR       directory the binary is installed into
#   DESTDIR      staging root prepended to BINDIR on install/uninstall
CXX ?= g++
TARGET := vllm_vnpu_daemon
SRCS := offload_daemon.cpp shm_manager.cpp

ASCEND_HOME ?= /usr/local/Ascend/ascend-toolkit/latest

INCLUDES := -I$(ASCEND_HOME)/include -Iinclude
LIBS := -L$(ASCEND_HOME)/lib64 -lascendcl

CXXFLAGS := $(INCLUDES)
LDFLAGS := $(LIBS)

PREFIX ?= /usr/local
BINDIR ?= $(PREFIX)/bin

.PHONY: all clean install uninstall

all: $(TARGET)

# Compile and link all sources in one step; libraries follow the sources so
# the linker can resolve their symbols.
$(TARGET): $(SRCS)
	$(CXX) -o $@ $^ $(CXXFLAGS) $(LDFLAGS)

install: $(TARGET)
	install -d $(DESTDIR)$(BINDIR)
	install -m 0755 $(TARGET) $(DESTDIR)$(BINDIR)/$(TARGET)

uninstall:
	rm -f $(DESTDIR)$(BINDIR)/$(TARGET)

clean:
	rm -f $(TARGET)

View File

@@ -0,0 +1,99 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
//
// Async logging using global thread pool
// All loggers created here share same global thread pool.
// Each log message is pushed to a queue along with a shared pointer to the
// logger.
// If a logger deleted while having pending messages in the queue, it's actual
// destruction will defer
// until all its messages are processed by the thread pool.
// This is because each message in the queue holds a shared_ptr to the
// originating logger.
#include <spdlog/async_logger.h>
#include <spdlog/details/registry.h>
#include <spdlog/details/thread_pool.h>
#include <functional>
#include <memory>
#include <mutex>
namespace spdlog {
namespace details {
static const size_t default_async_q_size = 8192;
}
// Factory for async loggers that share the registry-wide thread pool.
// When the registry holds no pool yet, one is created with a queue of
// details::default_async_q_size items and a single worker thread.
template <async_overflow_policy OverflowPolicy = async_overflow_policy::block>
struct async_factory_impl {
    // Builds a Sink from `args`, wraps it in an async_logger named
    // `logger_name` that uses OverflowPolicy, registers the logger through
    // the registry and returns it.
    template <typename Sink, typename... SinkArgs>
    static std::shared_ptr<async_logger> create(std::string logger_name, SinkArgs &&...args) {
        auto &reg = details::registry::instance();

        // The registry's pool mutex is held for the whole call, covering the
        // pool lookup/creation as well as the logger registration.
        std::lock_guard<std::recursive_mutex> pool_guard(reg.tp_mutex());

        auto pool = reg.get_tp();
        if (!pool) {
            pool = std::make_shared<details::thread_pool>(details::default_async_q_size, 1U);
            reg.set_tp(pool);
        }

        auto sink_obj = std::make_shared<Sink>(std::forward<SinkArgs>(args)...);
        auto result = std::make_shared<async_logger>(std::move(logger_name),
                                                     std::move(sink_obj),
                                                     std::move(pool),
                                                     OverflowPolicy);
        reg.initialize_logger(result);
        return result;
    }
};
using async_factory = async_factory_impl<async_overflow_policy::block>;
using async_factory_nonblock = async_factory_impl<async_overflow_policy::overrun_oldest>;
// Creates an async logger named `logger_name` with a single Sink built from
// `sink_args`, using the blocking overflow policy (async_factory).
template <typename Sink, typename... SinkArgs>
inline std::shared_ptr<spdlog::logger> create_async(std::string logger_name,
SinkArgs &&...sink_args) {
return async_factory::create<Sink>(std::move(logger_name),
std::forward<SinkArgs>(sink_args)...);
}
// Creates an async logger named `logger_name` with a single Sink built from
// `sink_args`, using the overrun_oldest overflow policy
// (async_factory_nonblock).
template <typename Sink, typename... SinkArgs>
inline std::shared_ptr<spdlog::logger> create_async_nb(std::string logger_name,
SinkArgs &&...sink_args) {
return async_factory_nonblock::create<Sink>(std::move(logger_name),
std::forward<SinkArgs>(sink_args)...);
}
// Creates a thread pool with a queue of `q_size` items and `thread_count`
// threads, passing it the two callbacks, and installs it in the registry as
// the global pool.
inline void init_thread_pool(size_t q_size,
size_t thread_count,
std::function<void()> on_thread_start,
std::function<void()> on_thread_stop) {
auto tp = std::make_shared<details::thread_pool>(q_size, thread_count, on_thread_start,
on_thread_stop);
details::registry::instance().set_tp(std::move(tp));
}
// Overload without a stop callback: forwards to the four-argument version
// with an empty on_thread_stop.
inline void init_thread_pool(size_t q_size,
size_t thread_count,
std::function<void()> on_thread_start) {
init_thread_pool(q_size, thread_count, on_thread_start, [] {});
}
// Overload without callbacks: forwards to the four-argument version with
// empty start and stop callbacks.
inline void init_thread_pool(size_t q_size, size_t thread_count) {
init_thread_pool(q_size, thread_count, [] {}, [] {});
}
// Returns the thread pool currently stored in the registry; this is whatever
// registry::get_tp() yields, which create() above treats as possibly null.
inline std::shared_ptr<spdlog::details::thread_pool> thread_pool() {
return details::registry::instance().get_tp();
}
} // namespace spdlog

View File

@@ -0,0 +1,84 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#ifndef SPDLOG_HEADER_ONLY
#include <spdlog/async_logger.h>
#endif
#include <spdlog/details/thread_pool.h>
#include <spdlog/sinks/sink.h>
#include <memory>
#include <string>
// Constructs an async logger from an initializer list of sinks by delegating
// to the iterator-range constructor with the list's begin()/end().
SPDLOG_INLINE spdlog::async_logger::async_logger(std::string logger_name,
sinks_init_list sinks_list,
std::weak_ptr<details::thread_pool> tp,
async_overflow_policy overflow_policy)
: async_logger(std::move(logger_name),
sinks_list.begin(),
sinks_list.end(),
std::move(tp),
overflow_policy) {}
// Constructs an async logger with exactly one sink by wrapping it in a
// one-element list and delegating to the initializer-list constructor.
SPDLOG_INLINE spdlog::async_logger::async_logger(std::string logger_name,
sink_ptr single_sink,
std::weak_ptr<details::thread_pool> tp,
async_overflow_policy overflow_policy)
: async_logger(
std::move(logger_name), {std::move(single_sink)}, std::move(tp), overflow_policy) {}
// Front-end half of logging: hands `msg` to the thread pool together with a
// shared pointer to this logger. If the pool has already been destroyed the
// failure is raised through throw_spdlog_ex and handled by the logger's
// catch macro.
SPDLOG_INLINE void spdlog::async_logger::sink_it_(const details::log_msg &msg) {
    SPDLOG_TRY {
        auto pool = thread_pool_.lock();
        if (!pool) {
            throw_spdlog_ex("async log: thread pool doesn't exist anymore");
        }
        pool->post_log(shared_from_this(), msg, overflow_policy_);
    }
    SPDLOG_LOGGER_CATCH(msg.source)
}
// Front-end half of flushing: posts a flush request for this logger to the
// thread pool. If the pool has already been destroyed the failure is raised
// through throw_spdlog_ex and handled by the logger's catch macro.
SPDLOG_INLINE void spdlog::async_logger::flush_() {
    SPDLOG_TRY {
        auto pool = thread_pool_.lock();
        if (!pool) {
            throw_spdlog_ex("async flush: thread pool doesn't exist anymore");
        }
        pool->post_flush(shared_from_this(), overflow_policy_);
    }
    SPDLOG_LOGGER_CATCH(source_loc())
}
//
// backend functions - called from the thread pool to do the actual job
//
// Back-end half of logging: writes `msg` to every sink whose level admits it,
// isolating each sink's failures with the logger's catch macro, then flushes
// all sinks if the message meets the flush criteria.
SPDLOG_INLINE void spdlog::async_logger::backend_sink_it_(const details::log_msg &msg) {
    for (auto &target : sinks_) {
        if (!target->should_log(msg.level)) {
            continue;
        }
        SPDLOG_TRY { target->log(msg); }
        SPDLOG_LOGGER_CATCH(msg.source)
    }

    if (!should_flush_(msg)) {
        return;
    }
    backend_flush_();
}
// Back-end half of flushing: flushes every sink in turn. Each flush sits in
// its own try/catch macro pair, so a failing sink does not stop the others
// from being flushed.
SPDLOG_INLINE void spdlog::async_logger::backend_flush_() {
for (auto &sink : sinks_) {
SPDLOG_TRY { sink->flush(); }
SPDLOG_LOGGER_CATCH(source_loc())
}
}
// Returns a copy of this logger (copy-constructed from *this) whose name is
// replaced by `new_name`.
SPDLOG_INLINE std::shared_ptr<spdlog::logger> spdlog::async_logger::clone(std::string new_name) {
auto cloned = std::make_shared<spdlog::async_logger>(*this);
cloned->name_ = std::move(new_name);
return cloned;
}

View File

@@ -0,0 +1,74 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
// Fast asynchronous logger.
// Uses pre allocated queue.
// Creates a single back thread to pop messages from the queue and log them.
//
// Upon each log write the logger:
// 1. Checks if its log level is enough to log the message
// 2. Push a new copy of the message to a queue (or block the caller until
// space is available in the queue)
// Upon destruction, logs all remaining messages in the queue before
// destructing..
#include <spdlog/logger.h>
namespace spdlog {
// What an async logger does when the message queue is full.
// `block` is the default used by the async_logger constructors.
enum class async_overflow_policy {
block, // Block until message can be enqueued
overrun_oldest, // Discard oldest message in the queue if full when trying to
// add new item.
discard_new // Discard new message if the queue is full when trying to add new item.
};
namespace details {
class thread_pool;
}
// Logger whose sink_it_/flush_ overrides hand work to a thread pool instead
// of writing to the sinks directly. The pool is referenced through a
// weak_ptr, so the logger does not keep it alive.
// details::thread_pool is a friend so that it can call the protected
// backend_* functions.
class SPDLOG_API async_logger final : public std::enable_shared_from_this<async_logger>,
public logger {
friend class details::thread_pool;
public:
// Builds the logger from the sink range [begin, end).
template <typename It>
async_logger(std::string logger_name,
It begin,
It end,
std::weak_ptr<details::thread_pool> tp,
async_overflow_policy overflow_policy = async_overflow_policy::block)
: logger(std::move(logger_name), begin, end),
thread_pool_(std::move(tp)),
overflow_policy_(overflow_policy) {}
// Builds the logger from an initializer list of sinks.
async_logger(std::string logger_name,
sinks_init_list sinks_list,
std::weak_ptr<details::thread_pool> tp,
async_overflow_policy overflow_policy = async_overflow_policy::block);
// Builds the logger from a single sink.
async_logger(std::string logger_name,
sink_ptr single_sink,
std::weak_ptr<details::thread_pool> tp,
async_overflow_policy overflow_policy = async_overflow_policy::block);
// Returns a copy of this logger under a new name.
std::shared_ptr<logger> clone(std::string new_name) override;
protected:
// Front end: overrides of the base logger hooks.
void sink_it_(const details::log_msg &msg) override;
void flush_() override;
// Back end: the functions that actually write to / flush the sinks.
void backend_sink_it_(const details::log_msg &incoming_log_msg);
void backend_flush_();
private:
std::weak_ptr<details::thread_pool> thread_pool_; // non-owning reference to the pool
async_overflow_policy overflow_policy_; // behaviour when the queue is full
};
} // namespace spdlog
#ifdef SPDLOG_HEADER_ONLY
#include "async_logger-inl.h"
#endif

View File

@@ -0,0 +1,40 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include <spdlog/cfg/helpers.h>
#include <spdlog/details/registry.h>
//
// Init log levels using each argv entry that starts with "SPDLOG_LEVEL="
//
// set all loggers to debug level:
// example.exe "SPDLOG_LEVEL=debug"
// set logger1 to trace level
// example.exe "SPDLOG_LEVEL=logger1=trace"
// turn off all logging except for logger1 and logger2:
// example.exe "SPDLOG_LEVEL=off,logger1=debug,logger2=info"
namespace spdlog {
namespace cfg {
// Scan argv[1] .. argv[argc - 1] for entries that begin with "SPDLOG_LEVEL=" and pass the
// text following the prefix to helpers::load_levels(). Every matching entry is applied,
// in argument order.
inline void load_argv_levels(int argc, const char **argv) {
    const std::string prefix = "SPDLOG_LEVEL=";
    for (int idx = 1; idx < argc; ++idx) {
        const std::string current = argv[idx];
        // comparing the leading prefix.size() characters is a "starts with" check
        if (current.compare(0, prefix.size(), prefix) != 0) {
            continue;
        }
        helpers::load_levels(current.substr(prefix.size()));
    }
}
// Convenience overload for a main()-style `char **argv`; forwards to the const overload.
inline void load_argv_levels(int argc, char **argv) {
    const char **const_argv = const_cast<const char **>(argv);
    load_argv_levels(argc, const_argv);
}
} // namespace cfg
} // namespace spdlog

View File

@@ -0,0 +1,36 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include <spdlog/cfg/helpers.h>
#include <spdlog/details/os.h>
#include <spdlog/details/registry.h>
//
// Init levels and patterns from env variables SPDLOG_LEVEL
// Inspired from Rust's "env_logger" crate (https://crates.io/crates/env_logger).
// Note - entries with an unrecognized level name are ignored
//
// Examples:
//
// set global level to debug:
// export SPDLOG_LEVEL=debug
//
// turn off all logging except for logger1:
// export SPDLOG_LEVEL="*=off,logger1=debug"
//
// turn off all logging except for logger1 and logger2:
// export SPDLOG_LEVEL="off,logger1=debug,logger2=info"
namespace spdlog {
namespace cfg {
// Read the environment variable named by `var` (default "SPDLOG_LEVEL") and, when its
// value is non-empty, apply it through helpers::load_levels().
inline void load_env_levels(const char* var = "SPDLOG_LEVEL") {
    const auto levels_spec = details::os::getenv(var);
    if (levels_spec.empty()) {
        return;
    }
    helpers::load_levels(levels_spec);
}
} // namespace cfg
} // namespace spdlog

View File

@@ -0,0 +1,106 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#ifndef SPDLOG_HEADER_ONLY
#include <spdlog/cfg/helpers.h>
#endif
#include <spdlog/details/os.h>
#include <spdlog/details/registry.h>
#include <algorithm>
#include <sstream>
#include <string>
#include <utility>
namespace spdlog {
namespace cfg {
namespace helpers {
// Lower-case the ASCII letters 'A'..'Z' of str in place (all other bytes are left
// untouched) and return a reference to the same string.
inline std::string &to_lower_(std::string &str) {
    for (char &c : str) {
        if (c >= 'A' && c <= 'Z') {
            c = static_cast<char>(c + ('a' - 'A'));
        }
    }
    return str;
}
// Strip leading and trailing blanks (space, \n, \r, \t) from str in place and return a
// reference to the same string.
inline std::string &trim_(std::string &str) {
    const char *const blanks = " \n\r\t";
    const std::string::size_type last_kept = str.find_last_not_of(blanks);
    // npos + 1 wraps around to 0, so an all-blank string is emptied here
    str.erase(last_kept + 1);
    const std::string::size_type first_kept = str.find_first_not_of(blanks);
    // first_kept is npos only for an empty string; erase(0, npos) is then a no-op
    str.erase(0, first_kept);
    return str;
}
// Split a "name <sep> value" string at the first occurrence of sep and return the two
// parts with surrounding blanks removed. A missing part is returned as an empty string;
// when sep does not occur at all, the whole input is treated as the value.
// "key=val" => ("key", "val")
// " key = val " => ("key", "val")
// "key=" => ("key", "")
// "val" => ("", "val")
inline std::pair<std::string, std::string> extract_kv_(char sep, const std::string &str) {
    const std::string::size_type sep_pos = str.find(sep);
    std::string key;
    std::string value;
    if (sep_pos != std::string::npos) {
        key = str.substr(0, sep_pos);
        value = str.substr(sep_pos + 1);
    } else {
        value = str;
    }
    trim_(key);
    trim_(value);
    return std::make_pair(key, value);
}
// Parse a comma separated sequence "K1=V1,K2=V2,.." into a key -> value map.
// "a=AAA,b=BBB,c=CCC,.." => {("a","AAA"),("b","BBB"),("c", "CCC"),...}
// Empty items (e.g. from ",,") are skipped; when a key repeats, the last value wins.
inline std::unordered_map<std::string, std::string> extract_key_vals_(const std::string &str) {
    std::unordered_map<std::string, std::string> result{};
    std::istringstream items(str);
    for (std::string item; std::getline(items, item, ',');) {
        if (!item.empty()) {
            const auto parsed = extract_kv_('=', item);
            result[parsed.first] = parsed.second;
        }
    }
    return result;
}
// Parse a levels specification such as "off,logger1=debug,logger2=info" and apply it to
// the logger registry.
//  - Input that is empty or 32768+ characters long is ignored entirely.
//  - Level names are matched case-insensitively; entries with an unknown level name are
//    skipped.
//  - An entry without a logger name sets the global level; the others set per-logger levels.
SPDLOG_INLINE void load_levels(const std::string &input) {
    const bool too_long = input.size() >= 32768;
    if (input.empty() || too_long) {
        return;
    }

    auto entries = extract_key_vals_(input);
    std::unordered_map<std::string, level::level_enum> per_logger;
    level::level_enum global_level = level::info;
    level::level_enum *global_level_ptr = nullptr;  // stays null unless a global entry is seen

    for (auto &entry : entries) {
        const std::string &level_text = to_lower_(entry.second);
        const level::level_enum parsed = level::from_str(level_text);
        // from_str() yields level::off for unknown names, so "off" must be told apart
        const bool recognized = (parsed != level::off) || (level_text == "off");
        if (!recognized) {
            continue;
        }
        if (entry.first.empty()) {
            // no logger name indicates global level
            global_level = parsed;
            global_level_ptr = &global_level;
        } else {
            per_logger[entry.first] = parsed;
        }
    }

    details::registry::instance().set_levels(std::move(per_logger), global_level_ptr);
}
} // namespace helpers
} // namespace cfg
} // namespace spdlog

View File

@@ -0,0 +1,29 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include <spdlog/common.h>
#include <unordered_map>
namespace spdlog {
namespace cfg {
namespace helpers {
//
// Init levels from given string
//
// The string is a comma separated list of entries. An entry of the form "name=level" sets
// the level of logger "name"; an entry that is just "level" sets the global level.
// Blanks around names and levels are trimmed and level names are matched
// case-insensitively. Entries with an unrecognized level name are skipped, and a string
// that is empty or 32768+ characters long is ignored.
//
// Examples:
//
// set global level to debug: "debug"
// turn off all logging except for logger1: "off,logger1=debug"
// turn off all logging except for logger1 and logger2: "off,logger1=debug,logger2=info"
//
SPDLOG_API void load_levels(const std::string &txt);
} // namespace helpers
} // namespace cfg
} // namespace spdlog
#ifdef SPDLOG_HEADER_ONLY
#include "helpers-inl.h"
#endif // SPDLOG_HEADER_ONLY

View File

@@ -0,0 +1,68 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#ifndef SPDLOG_HEADER_ONLY
#include <spdlog/common.h>
#endif
#include <algorithm>
#include <iterator>
namespace spdlog {
namespace level {
// Lookup tables indexed by level_enum value. Their contents come from the
// SPDLOG_LEVEL_NAMES / SPDLOG_SHORT_LEVEL_NAMES macros.
// The long-name table is constexpr only when compiling as C++17 or newer.
#if __cplusplus >= 201703L
constexpr
#endif
static string_view_t level_string_views[] SPDLOG_LEVEL_NAMES;
static const char *short_level_names[] SPDLOG_SHORT_LEVEL_NAMES;
// Return the long name of level l. The value of l is used directly as an index into
// level_string_views (no range check).
SPDLOG_INLINE const string_view_t &to_string_view(spdlog::level::level_enum l) SPDLOG_NOEXCEPT {
    const string_view_t &long_name = level_string_views[l];
    return long_name;
}
// Return the short name of level l. The value of l is used directly as an index into
// short_level_names (no range check).
SPDLOG_INLINE const char *to_short_c_str(spdlog::level::level_enum l) SPDLOG_NOEXCEPT {
    const char *short_name = short_level_names[l];
    return short_name;
}
// Map a level name to its level_enum value.
// The long names in level_string_views are tried first, then the aliases "warn" and
// "err". Any other name yields level::off.
SPDLOG_INLINE spdlog::level::level_enum from_str(const std::string &name) SPDLOG_NOEXCEPT {
    const auto first = std::begin(level_string_views);
    const auto last = std::end(level_string_views);
    const auto match = std::find(first, last, name);
    if (match != last) {
        // position in the table == numeric value of the level
        return static_cast<level::level_enum>(std::distance(first, match));
    }
    // check also for "warn" and "err" before giving up..
    if (name == "warn") return level::warn;
    if (name == "err") return level::err;
    return level::off;
}
} // namespace level
// Build an exception carrying msg verbatim.
SPDLOG_INLINE spdlog_ex::spdlog_ex(std::string msg)
    : msg_(std::move(msg)) {}

// Build an exception whose text combines msg with the system error described by
// last_errno. The formatting comes from std::system_error when SPDLOG_USE_STD_FORMAT is
// defined, otherwise from fmt::format_system_error.
SPDLOG_INLINE spdlog_ex::spdlog_ex(const std::string &msg, int last_errno) {
#ifdef SPDLOG_USE_STD_FORMAT
    const std::error_code code(last_errno, std::generic_category());
    msg_ = std::system_error(code, msg).what();
#else
    memory_buf_t formatted;
    fmt::format_system_error(formatted, last_errno, msg.c_str());
    msg_ = fmt::to_string(formatted);
#endif
}

// Return the stored message as a C string.
SPDLOG_INLINE const char *spdlog_ex::what() const SPDLOG_NOEXCEPT {
    return msg_.c_str();
}
// Raise a spdlog_ex built from msg and the system error code last_errno.
// The raise goes through SPDLOG_THROW, which either throws or, when
// SPDLOG_NO_EXCEPTIONS is defined, prints the message and aborts.
SPDLOG_INLINE void throw_spdlog_ex(const std::string &msg, int last_errno) {
SPDLOG_THROW(spdlog_ex(msg, last_errno));
}
// Raise a spdlog_ex carrying msg verbatim (same SPDLOG_THROW mechanism as above).
SPDLOG_INLINE void throw_spdlog_ex(std::string msg) { SPDLOG_THROW(spdlog_ex(std::move(msg))); }
} // namespace spdlog

View File

@@ -0,0 +1,406 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include <spdlog/details/null_mutex.h>
#include <spdlog/tweakme.h>
#include <atomic>
#include <chrono>
#include <cstdio>
#include <exception>
#include <functional>
#include <initializer_list>
#include <memory>
#include <string>
#include <type_traits>
#ifdef SPDLOG_USE_STD_FORMAT
#include <version>
#if __cpp_lib_format >= 202207L
#include <format>
#else
#include <string_view>
#endif
#endif
// Build-mode selection.
//  - SPDLOG_COMPILED_LIB defined: compiled-library mode. SPDLOG_INLINE is empty and
//    SPDLOG_API carries the export/import/visibility attribute when SPDLOG_SHARED_LIB is
//    also defined (dllexport while building spdlog itself, i.e. spdlog_EXPORTS is set).
//  - otherwise: header-only mode. SPDLOG_HEADER_ONLY is defined, SPDLOG_API is empty and
//    SPDLOG_INLINE expands to `inline`.
#ifdef SPDLOG_COMPILED_LIB
#undef SPDLOG_HEADER_ONLY
#if defined(SPDLOG_SHARED_LIB)
#if defined(_WIN32)
#ifdef spdlog_EXPORTS
#define SPDLOG_API __declspec(dllexport)
#else // !spdlog_EXPORTS
#define SPDLOG_API __declspec(dllimport)
#endif
#else // !defined(_WIN32)
#define SPDLOG_API __attribute__((visibility("default")))
#endif
#else // !defined(SPDLOG_SHARED_LIB)
#define SPDLOG_API
#endif
#define SPDLOG_INLINE
#else // !defined(SPDLOG_COMPILED_LIB)
#define SPDLOG_API
#define SPDLOG_HEADER_ONLY
#define SPDLOG_INLINE inline
#endif // #ifdef SPDLOG_COMPILED_LIB
#include <spdlog/fmt/fmt.h>
// SPDLOG_FMT_RUNTIME / SPDLOG_FMT_STRING wrap a format string for fmt >= 8
// (fmt::runtime / FMT_STRING). With std::format or an older fmt they expand to the
// format string unchanged.
#if !defined(SPDLOG_USE_STD_FORMAT) && \
FMT_VERSION >= 80000 // backward compatibility with fmt versions older than 8
#define SPDLOG_FMT_RUNTIME(format_string) fmt::runtime(format_string)
#define SPDLOG_FMT_STRING(format_string) FMT_STRING(format_string)
// wide-character support needs fmt's xchar header
#if defined(SPDLOG_WCHAR_FILENAMES) || defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT)
#include <spdlog/fmt/xchar.h>
#endif
#else
#define SPDLOG_FMT_RUNTIME(format_string) format_string
#define SPDLOG_FMT_STRING(format_string) format_string
#endif
// visual studio up to 2013 does not support noexcept nor constexpr
#if defined(_MSC_VER) && (_MSC_VER < 1900)
#define SPDLOG_NOEXCEPT _NOEXCEPT
#define SPDLOG_CONSTEXPR
#else
#define SPDLOG_NOEXCEPT noexcept
#define SPDLOG_CONSTEXPR constexpr
#endif
// If building with std::format, can just use constexpr, otherwise if building with fmt
// SPDLOG_CONSTEXPR_FUNC needs to be set the same as FMT_CONSTEXPR to avoid situations where
// a constexpr function in spdlog could end up calling a non-constexpr function in fmt
// depending on the compiler
// If fmt determines it can't use constexpr, we should inline the function instead
#ifdef SPDLOG_USE_STD_FORMAT
#define SPDLOG_CONSTEXPR_FUNC constexpr
#else // Being built with fmt
#if FMT_USE_CONSTEXPR
#define SPDLOG_CONSTEXPR_FUNC FMT_CONSTEXPR
#else
#define SPDLOG_CONSTEXPR_FUNC inline
#endif
#endif
// SPDLOG_DEPRECATED: compiler specific "deprecated" attribute (empty on other compilers)
#if defined(__GNUC__) || defined(__clang__)
#define SPDLOG_DEPRECATED __attribute__((deprecated))
#elif defined(_MSC_VER)
#define SPDLOG_DEPRECATED __declspec(deprecated)
#else
#define SPDLOG_DEPRECATED
#endif
// disable thread local on msvc 2013 and on C++/WinRT builds (unless the user already
// defined SPDLOG_NO_TLS)
#ifndef SPDLOG_NO_TLS
#if (defined(_MSC_VER) && (_MSC_VER < 1900)) || defined(__cplusplus_winrt)
#define SPDLOG_NO_TLS 1
#endif
#endif
// SPDLOG_FUNCTION: expression naming the enclosing function; may be predefined by the
// user to override the __FUNCTION__ default
#ifndef SPDLOG_FUNCTION
#define SPDLOG_FUNCTION static_cast<const char *>(__FUNCTION__)
#endif
// Exception abstraction.
//  - SPDLOG_NO_EXCEPTIONS defined: SPDLOG_TRY / SPDLOG_CATCH_STD expand to nothing and
//    SPDLOG_THROW(ex) prints "spdlog fatal error: <what>" with printf and calls std::abort().
//  - otherwise: SPDLOG_TRY is `try`, SPDLOG_THROW(ex) is `throw(ex)` and SPDLOG_CATCH_STD
//    is a catch block that swallows any std::exception.
// NOTE: the no-exceptions SPDLOG_THROW uses `ex.what()` without parenthesising the macro
// argument, so the argument should be a simple (postfix) expression.
#ifdef SPDLOG_NO_EXCEPTIONS
#define SPDLOG_TRY
#define SPDLOG_THROW(ex) \
do { \
printf("spdlog fatal error: %s\n", ex.what()); \
std::abort(); \
} while (0)
#define SPDLOG_CATCH_STD
#else
#define SPDLOG_TRY try
#define SPDLOG_THROW(ex) throw(ex)
#define SPDLOG_CATCH_STD \
catch (const std::exception &) { \
}
#endif
namespace spdlog {
// Forward declarations, so this header does not need the full definitions.
class formatter;
namespace sinks {
class sink;
}
// filename_t is the string type used for file names: std::wstring on Windows when
// SPDLOG_WCHAR_FILENAMES is defined, std::string otherwise. SPDLOG_FILENAME_T(s) turns a
// narrow string literal into a literal of the matching character type.
#if defined(_WIN32) && defined(SPDLOG_WCHAR_FILENAMES)
using filename_t = std::wstring;
// allow macro expansion to occur in SPDLOG_FILENAME_T
#define SPDLOG_FILENAME_T_INNER(s) L##s
#define SPDLOG_FILENAME_T(s) SPDLOG_FILENAME_T_INNER(s)
#else
using filename_t = std::string;
#define SPDLOG_FILENAME_T(s) s
#endif
// Clock used for log timestamps.
using log_clock = std::chrono::system_clock;
// Shared ownership handle to a sink, and a brace-list of such handles.
using sink_ptr = std::shared_ptr<sinks::sink>;
using sinks_init_list = std::initializer_list<sink_ptr>;
// Callback type receiving an error message.
using err_handler = std::function<void(const std::string &err_msg)>;
// Formatting backend selection. Both branches define the same set of names
// (fmt_lib, string_view_t, memory_buf_t, format_string_t,
// is_convertible_to_basic_format_string, SPDLOG_BUF_TO_STRING and, when wide-character
// support is enabled, wstring_view_t / wmemory_buf_t / wformat_string_t), backed either by
// the standard library or by the fmt library.
#ifdef SPDLOG_USE_STD_FORMAT
namespace fmt_lib = std;
using string_view_t = std::string_view;
using memory_buf_t = std::string;
// std::format_string is only used when the library reports __cpp_lib_format >= 202207L;
// otherwise a plain string_view stands in for it
template <typename... Args>
#if __cpp_lib_format >= 202207L
using format_string_t = std::format_string<Args...>;
#else
using format_string_t = std::string_view;
#endif
// true when T converts to std::basic_string_view<Char>
template <class T, class Char = char>
struct is_convertible_to_basic_format_string
: std::integral_constant<bool, std::is_convertible<T, std::basic_string_view<Char>>::value> {};
#if defined(SPDLOG_WCHAR_FILENAMES) || defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT)
using wstring_view_t = std::wstring_view;
using wmemory_buf_t = std::wstring;
template <typename... Args>
#if __cpp_lib_format >= 202207L
using wformat_string_t = std::wformat_string<Args...>;
#else
using wformat_string_t = std::wstring_view;
#endif
#endif
// memory_buf_t already is a std::string here
#define SPDLOG_BUF_TO_STRING(x) x
#else // use fmt lib instead of std::format
namespace fmt_lib = fmt;
using string_view_t = fmt::basic_string_view<char>;
// memory buffer with 250 chars of inline storage
using memory_buf_t = fmt::basic_memory_buffer<char, 250>;
template <typename... Args>
using format_string_t = fmt::format_string<Args...>;
template <class T>
using remove_cvref_t = typename std::remove_cv<typename std::remove_reference<T>::type>::type;
// the runtime format string wrapper type was renamed in fmt 9.1.1
template <typename Char>
#if FMT_VERSION >= 90101
using fmt_runtime_string = fmt::runtime_format_string<Char>;
#else
using fmt_runtime_string = fmt::basic_runtime<Char>;
#endif
// clang doesn't like SFINAE disabled constructor in std::is_convertible<> so have to repeat the
// condition from basic_format_string here, in addition, fmt::basic_runtime<Char> is only
// convertible to basic_format_string<Char> but not basic_string_view<Char>
template <class T, class Char = char>
struct is_convertible_to_basic_format_string
: std::integral_constant<bool,
std::is_convertible<T, fmt::basic_string_view<Char>>::value ||
std::is_same<remove_cvref_t<T>, fmt_runtime_string<Char>>::value> {
};
#if defined(SPDLOG_WCHAR_FILENAMES) || defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT)
using wstring_view_t = fmt::basic_string_view<wchar_t>;
using wmemory_buf_t = fmt::basic_memory_buffer<wchar_t, 250>;
template <typename... Args>
using wformat_string_t = fmt::wformat_string<Args...>;
#endif
#define SPDLOG_BUF_TO_STRING(x) fmt::to_string(x)
#endif
// Reject SPDLOG_WCHAR_TO_UTF8_SUPPORT at compile time on non-Windows targets.
#ifdef SPDLOG_WCHAR_TO_UTF8_SUPPORT
#ifndef _WIN32
#error SPDLOG_WCHAR_TO_UTF8_SUPPORT only supported on windows
#endif // _WIN32
#endif // SPDLOG_WCHAR_TO_UTF8_SUPPORT
// true when T is usable as either a char or a wchar_t format string
template <class T>
struct is_convertible_to_any_format_string
: std::integral_constant<bool,
is_convertible_to_basic_format_string<T, char>::value ||
is_convertible_to_basic_format_string<T, wchar_t>::value> {};
// Storage type for a log level: std::atomic<int> unless SPDLOG_NO_ATOMIC_LEVELS is
// defined, in which case details::null_atomic_int is used instead.
#if defined(SPDLOG_NO_ATOMIC_LEVELS)
using level_t = details::null_atomic_int;
#else
using level_t = std::atomic<int>;
#endif
// Numeric values of the log levels, usable in preprocessor conditions.
// They are also the values of the level::level_enum enumerators.
#define SPDLOG_LEVEL_TRACE 0
#define SPDLOG_LEVEL_DEBUG 1
#define SPDLOG_LEVEL_INFO 2
#define SPDLOG_LEVEL_WARN 3
#define SPDLOG_LEVEL_ERROR 4
#define SPDLOG_LEVEL_CRITICAL 5
#define SPDLOG_LEVEL_OFF 6
// SPDLOG_ACTIVE_LEVEL defaults to SPDLOG_LEVEL_INFO unless defined beforehand.
#if !defined(SPDLOG_ACTIVE_LEVEL)
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO
#endif
// Log level enum
namespace level {
// Enumerator values equal the SPDLOG_LEVEL_* constants; n_levels is the number of levels
// (one past `off`).
enum level_enum : int {
trace = SPDLOG_LEVEL_TRACE,
debug = SPDLOG_LEVEL_DEBUG,
info = SPDLOG_LEVEL_INFO,
warn = SPDLOG_LEVEL_WARN,
err = SPDLOG_LEVEL_ERROR,
critical = SPDLOG_LEVEL_CRITICAL,
off = SPDLOG_LEVEL_OFF,
n_levels
};
// Long level names as string views with explicit lengths.
#define SPDLOG_LEVEL_NAME_TRACE spdlog::string_view_t("trace", 5)
#define SPDLOG_LEVEL_NAME_DEBUG spdlog::string_view_t("debug", 5)
#define SPDLOG_LEVEL_NAME_INFO spdlog::string_view_t("info", 4)
#define SPDLOG_LEVEL_NAME_WARNING spdlog::string_view_t("warning", 7)
#define SPDLOG_LEVEL_NAME_ERROR spdlog::string_view_t("error", 5)
#define SPDLOG_LEVEL_NAME_CRITICAL spdlog::string_view_t("critical", 8)
#define SPDLOG_LEVEL_NAME_OFF spdlog::string_view_t("off", 3)
// Initializer for the long-name table, listed in level_enum order. Can be predefined by
// the user to customise the names.
#if !defined(SPDLOG_LEVEL_NAMES)
#define SPDLOG_LEVEL_NAMES \
{ \
SPDLOG_LEVEL_NAME_TRACE, SPDLOG_LEVEL_NAME_DEBUG, SPDLOG_LEVEL_NAME_INFO, \
SPDLOG_LEVEL_NAME_WARNING, SPDLOG_LEVEL_NAME_ERROR, SPDLOG_LEVEL_NAME_CRITICAL, \
SPDLOG_LEVEL_NAME_OFF \
}
#endif
// Initializer for the one-letter name table, same order; also user overridable.
#if !defined(SPDLOG_SHORT_LEVEL_NAMES)
#define SPDLOG_SHORT_LEVEL_NAMES \
{ "T", "D", "I", "W", "E", "C", "O" }
#endif
// Level <-> name conversions (defined in common-inl.h).
SPDLOG_API const string_view_t &to_string_view(spdlog::level::level_enum l) SPDLOG_NOEXCEPT;
SPDLOG_API const char *to_short_c_str(spdlog::level::level_enum l) SPDLOG_NOEXCEPT;
SPDLOG_API spdlog::level::level_enum from_str(const std::string &name) SPDLOG_NOEXCEPT;
} // namespace level
//
// Color mode used by sinks with color support.
// NOTE(review): `automatic` presumably lets the sink decide based on the output target -
// confirm in the color sink implementations.
//
enum class color_mode { always, automatic, never };
//
// Pattern time - specific time getting to use for pattern_formatter.
// local time by default
//
enum class pattern_time_type {
local, // log localtime
utc // log utc
};
//
// Log exception
//
// Exception type carrying a message string; what() returns that message.
class SPDLOG_API spdlog_ex : public std::exception {
public:
// Use msg as the exception text.
explicit spdlog_ex(std::string msg);
// Combine msg with the description of the system error code last_errno.
spdlog_ex(const std::string &msg, int last_errno);
const char *what() const SPDLOG_NOEXCEPT override;
private:
std::string msg_;
};
// Helpers that raise a spdlog_ex through SPDLOG_THROW; they never return.
[[noreturn]] SPDLOG_API void throw_spdlog_ex(const std::string &msg, int last_errno);
[[noreturn]] SPDLOG_API void throw_spdlog_ex(std::string msg);
// Source location (file name, line, function name) attached to a log call.
// The struct stores the pointers as given; it does not copy the strings.
struct source_loc {
SPDLOG_CONSTEXPR source_loc() = default;
SPDLOG_CONSTEXPR source_loc(const char *filename_in, int line_in, const char *funcname_in)
: filename{filename_in},
line{line_in},
funcname{funcname_in} {}
// A location counts as empty when its line is not positive (the default-constructed state).
SPDLOG_CONSTEXPR bool empty() const SPDLOG_NOEXCEPT { return line <= 0; }
const char *filename{nullptr};
int line{0};
const char *funcname{nullptr};
};
// Optional callbacks around file open/close. All four start out empty (null); an empty
// handler is simply not called by details::file_helper.
struct file_event_handlers {
file_event_handlers()
: before_open(nullptr),
after_open(nullptr),
before_close(nullptr),
after_close(nullptr) {}
// called with the file name before the file is opened
std::function<void(const filename_t &filename)> before_open;
// called with the file name and the stream right after a successful open
std::function<void(const filename_t &filename, std::FILE *file_stream)> after_open;
// called with the file name and the still-open stream before it is closed
std::function<void(const filename_t &filename, std::FILE *file_stream)> before_close;
// called with the file name after the stream was closed
std::function<void(const filename_t &filename)> after_close;
};
namespace details {
// to_string_view
// Overload set converting buffers / views / format strings to a string view.
// view over the contents of a memory buffer (no copy)
SPDLOG_CONSTEXPR_FUNC spdlog::string_view_t to_string_view(const memory_buf_t &buf)
SPDLOG_NOEXCEPT {
return spdlog::string_view_t{buf.data(), buf.size()};
}
// identity overload, so generic code can call to_string_view on a view as well
SPDLOG_CONSTEXPR_FUNC spdlog::string_view_t to_string_view(spdlog::string_view_t str)
SPDLOG_NOEXCEPT {
return str;
}
// wide-character counterparts, only with wide-character support enabled
#if defined(SPDLOG_WCHAR_FILENAMES) || defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT)
SPDLOG_CONSTEXPR_FUNC spdlog::wstring_view_t to_string_view(const wmemory_buf_t &buf)
SPDLOG_NOEXCEPT {
return spdlog::wstring_view_t{buf.data(), buf.size()};
}
SPDLOG_CONSTEXPR_FUNC spdlog::wstring_view_t to_string_view(spdlog::wstring_view_t str)
SPDLOG_NOEXCEPT {
return str;
}
#endif
// std::basic_format_string -> view of its underlying string (needs a recent <format>)
#if defined(SPDLOG_USE_STD_FORMAT) && __cpp_lib_format >= 202207L
template <typename T, typename... Args>
SPDLOG_CONSTEXPR_FUNC std::basic_string_view<T> to_string_view(
std::basic_format_string<T, Args...> fmt) SPDLOG_NOEXCEPT {
return fmt.get();
}
#endif
// make_unique support for pre c++14
// With C++14 or newer the standard enable_if_t / make_unique are imported into this
// namespace; otherwise minimal replacements are defined here.
#if __cplusplus >= 201402L // C++14 and beyond
using std::enable_if_t;
using std::make_unique;
#else
template <bool B, class T = void>
using enable_if_t = typename std::enable_if<B, T>::type;
// Replacement for std::make_unique; single objects only (array types are rejected at
// compile time).
template <typename T, typename... Args>
std::unique_ptr<T> make_unique(Args &&...args) {
static_assert(!std::is_array<T>::value, "arrays not supported");
return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
}
#endif
// to avoid useless casts (see https://github.com/nlohmann/json/issues/2893#issuecomment-889152324)
// Exactly one of the two overloads is enabled for a given <T, U>:
// this one, when T and U differ, performs the static_cast ...
template <typename T, typename U, enable_if_t<!std::is_same<T, U>::value, int> = 0>
constexpr T conditional_static_cast(U value) {
return static_cast<T>(value);
}
// ... and this one, when T and U are the same type, returns the value without a cast.
template <typename T, typename U, enable_if_t<std::is_same<T, U>::value, int> = 0>
constexpr T conditional_static_cast(U value) {
return value;
}
} // namespace details
} // namespace spdlog
#ifdef SPDLOG_HEADER_ONLY
#include "common-inl.h"
#endif

View File

@@ -0,0 +1,63 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#ifndef SPDLOG_HEADER_ONLY
#include <spdlog/details/backtracer.h>
#endif
namespace spdlog {
namespace details {
// Copy constructor: snapshots the enabled flag and the stored messages of `other` while
// holding other's mutex.
SPDLOG_INLINE backtracer::backtracer(const backtracer &other) {
    std::lock_guard<std::mutex> guard{other.mutex_};
    enabled_.store(other.enabled());
    messages_ = other.messages_;
}
// Move constructor: copies the enabled flag and takes over the stored messages of
// `other` while holding other's mutex.
SPDLOG_INLINE backtracer::backtracer(backtracer &&other) SPDLOG_NOEXCEPT {
    std::lock_guard<std::mutex> guard{other.mutex_};
    enabled_.store(other.enabled());
    messages_ = std::move(other.messages_);
}
// Unified assignment: `other` arrives by value (already copied or moved by the caller),
// so only this object's mutex has to be held while its state is taken over.
SPDLOG_INLINE backtracer &backtracer::operator=(backtracer other) {
    std::lock_guard<std::mutex> guard{mutex_};
    enabled_.store(other.enabled());
    messages_ = std::move(other.messages_);
    return *this;
}
// Turn the backtracer on and replace the message store with a fresh circular queue
// constructed with `size`. Previously stored messages are dropped.
SPDLOG_INLINE void backtracer::enable(size_t size) {
    std::lock_guard<std::mutex> guard{mutex_};
    enabled_.store(true, std::memory_order_relaxed);
    circular_q<log_msg_buffer> fresh{size};
    messages_ = std::move(fresh);
}
// Turn the backtracer off. The stored messages are left as they are.
SPDLOG_INLINE void backtracer::disable() {
    std::lock_guard<std::mutex> guard{mutex_};
    const bool off = false;
    enabled_.store(off, std::memory_order_relaxed);
}
// Report whether the backtracer is currently on (relaxed atomic read, no mutex taken).
SPDLOG_INLINE bool backtracer::enabled() const {
    const bool is_on = enabled_.load(std::memory_order_relaxed);
    return is_on;
}
// Store a buffered copy of msg in the circular queue (under the mutex).
SPDLOG_INLINE void backtracer::push_back(const log_msg &msg) {
    std::lock_guard<std::mutex> guard{mutex_};
    log_msg_buffer owned_copy{msg};
    messages_.push_back(std::move(owned_copy));
}
// Report whether no messages are currently stored (checked under the mutex).
SPDLOG_INLINE bool backtracer::empty() const {
    std::lock_guard<std::mutex> guard{mutex_};
    const bool nothing_stored = messages_.empty();
    return nothing_stored;
}
// Drain the queue front to back: call `fun` on each stored message, then remove it.
// The mutex is held for the whole drain, including the calls to `fun`.
SPDLOG_INLINE void backtracer::foreach_pop(std::function<void(const details::log_msg &)> fun) {
    std::lock_guard<std::mutex> guard{mutex_};
    for (; !messages_.empty(); messages_.pop_front()) {
        fun(messages_.front());
    }
}
} // namespace details
} // namespace spdlog

View File

@@ -0,0 +1,45 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include <spdlog/details/circular_q.h>
#include <spdlog/details/log_msg_buffer.h>
#include <atomic>
#include <functional>
#include <mutex>
// Store log messages in a circular buffer.
// Useful for keeping recent debug data available in case an error/warning happens.
namespace spdlog {
namespace details {
// Mutex-protected store of log messages backed by a circular queue.
// The enabled flag is an atomic that can be read without taking the mutex; the message
// queue is only touched while the mutex is held.
class SPDLOG_API backtracer {
mutable std::mutex mutex_; // mutable so that const members (empty, copy source) can lock
std::atomic<bool> enabled_{false};
circular_q<log_msg_buffer> messages_;
public:
backtracer() = default;
backtracer(const backtracer &other);
backtracer(backtracer &&other) SPDLOG_NOEXCEPT;
// takes its argument by value, so it serves as both copy and move assignment
backtracer &operator=(backtracer other);
// switch on and start with a fresh queue constructed with `size`
void enable(size_t size);
// switch off (stored messages are kept)
void disable();
bool enabled() const;
// store a copy of msg
void push_back(const log_msg &msg);
bool empty() const;
// pop all items in the q and apply the given fun on each of them.
void foreach_pop(std::function<void(const details::log_msg &)> fun);
};
} // namespace details
} // namespace spdlog
#ifdef SPDLOG_HEADER_ONLY
#include "backtracer-inl.h"
#endif

View File

@@ -0,0 +1,115 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
// circular q view of std::vector.
#pragma once
#include <cassert>
#include <vector>
#include "spdlog/common.h"
namespace spdlog {
namespace details {
// Fixed-capacity circular queue stored in a std::vector<T>.
//
// The vector holds capacity + 1 slots; the extra slot is never occupied and serves as the
// marker that distinguishes "full" from "empty". When an item is pushed into a full queue
// the oldest item is overwritten and overrun_counter() is incremented.
// A default-constructed (or moved-from) queue is "disabled": it owns no storage and
// push_back() is a no-op on it.
template <typename T>
class circular_q {
    size_t max_items_ = 0;                          // slots in v_ (capacity + 1); 0 == disabled
    typename std::vector<T>::size_type head_ = 0;   // index of the oldest item
    typename std::vector<T>::size_type tail_ = 0;   // index where the next item is written
    size_t overrun_counter_ = 0;                    // items overwritten because the q was full
    std::vector<T> v_;

public:
    using value_type = T;

    // empty ctor - create a disabled queue with no elements allocated at all
    circular_q() = default;

    // Create a queue able to hold max_items items.
    explicit circular_q(size_t max_items)
        : max_items_(max_items + 1),  // one item is reserved as marker for full q
          v_(max_items_) {}

    circular_q(const circular_q &) = default;
    circular_q &operator=(const circular_q &) = default;

    // move cannot be default,
    // since we need to reset head_, tail_, etc to zero in the moved object
    circular_q(circular_q &&other) SPDLOG_NOEXCEPT { copy_moveable(std::move(other)); }

    circular_q &operator=(circular_q &&other) SPDLOG_NOEXCEPT {
        // Self-move must be a no-op: copy_moveable() resets its source to the disabled
        // state, which would otherwise wipe this very queue.
        if (this != &other) {
            copy_moveable(std::move(other));
        }
        return *this;
    }

    // push back, overrun (oldest) item if no room left
    void push_back(T &&item) {
        if (max_items_ > 0) {
            v_[tail_] = std::move(item);
            tail_ = (tail_ + 1) % max_items_;

            if (tail_ == head_)  // overrun last item if full
            {
                head_ = (head_ + 1) % max_items_;
                ++overrun_counter_;
            }
        }
    }

    // Return reference to the front item.
    // If there are no elements in the container, the behavior is undefined.
    const T &front() const { return v_[head_]; }

    T &front() { return v_[head_]; }

    // Return number of elements actually stored
    size_t size() const {
        if (tail_ >= head_) {
            return tail_ - head_;
        } else {
            // tail_ has wrapped around behind head_
            return max_items_ - (head_ - tail_);
        }
    }

    // Return const reference to item by index.
    // If index is out of range 0…size()-1, the behavior is undefined.
    const T &at(size_t i) const {
        assert(i < size());
        return v_[(head_ + i) % max_items_];
    }

    // Pop item from front.
    // If there are no elements in the container, the behavior is undefined.
    void pop_front() { head_ = (head_ + 1) % max_items_; }

    bool empty() const { return tail_ == head_; }

    bool full() const {
        // head is ahead of the tail by 1
        if (max_items_ > 0) {
            return ((tail_ + 1) % max_items_) == head_;
        }
        // a disabled queue is never reported as full
        return false;
    }

    size_t overrun_counter() const { return overrun_counter_; }

    void reset_overrun_counter() { overrun_counter_ = 0; }

private:
    // copy from other&& and reset it to disabled state
    // (callers must make sure that &other != this)
    void copy_moveable(circular_q &&other) SPDLOG_NOEXCEPT {
        max_items_ = other.max_items_;
        head_ = other.head_;
        tail_ = other.tail_;
        overrun_counter_ = other.overrun_counter_;
        v_ = std::move(other.v_);

        // put &&other in disabled, but valid state
        other.max_items_ = 0;
        other.head_ = other.tail_ = 0;
        other.overrun_counter_ = 0;
    }
};
} // namespace details
} // namespace spdlog

View File

@@ -0,0 +1,28 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include <mutex>
#include <spdlog/details/null_mutex.h>
namespace spdlog {
namespace details {
// Provides one shared std::mutex instance (a function-local static) through mutex().
struct console_mutex {
    using mutex_t = std::mutex;

    static mutex_t &mutex() {
        static mutex_t shared_instance;
        return shared_instance;
    }
};
// Same shape as console_mutex, but the shared instance is a null_mutex.
struct console_nullmutex {
    using mutex_t = null_mutex;

    static mutex_t &mutex() {
        static mutex_t shared_instance;
        return shared_instance;
    }
};
} // namespace details
} // namespace spdlog

View File

@@ -0,0 +1,151 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#ifndef SPDLOG_HEADER_ONLY
#include <spdlog/details/file_helper.h>
#endif
#include <spdlog/common.h>
#include <spdlog/details/os.h>
#include <cerrno>
#include <cstdio>
#include <string>
#include <tuple>
namespace spdlog {
namespace details {
// Store a copy of the given event handlers; no file is opened here.
SPDLOG_INLINE file_helper::file_helper(const file_event_handlers &event_handlers)
: event_handlers_(event_handlers) {}
// Close the file (if one is open) on destruction.
SPDLOG_INLINE file_helper::~file_helper() { close(); }
// Open fname for writing, closing any file that is currently open first.
// The log file itself is always opened in append mode ("ab"); when truncate is true the
// file is emptied beforehand by opening and closing it once in "wb" mode.
// Up to open_tries_ attempts are made; each attempt first (re)creates the containing
// directory. The before_open handler runs once before the first attempt, the after_open
// handler runs after a successful open.
// Raises spdlog_ex (through throw_spdlog_ex, with the current errno) when every attempt
// failed.
SPDLOG_INLINE void file_helper::open(const filename_t &fname, bool truncate) {
close();
filename_ = fname;
auto *mode = SPDLOG_FILENAME_T("ab");
auto *trunc_mode = SPDLOG_FILENAME_T("wb");
if (event_handlers_.before_open) {
event_handlers_.before_open(filename_);
}
for (int tries = 0; tries < open_tries_; ++tries) {
// create containing folder if not exists already.
os::create_dir(os::dir_name(fname));
if (truncate) {
// Truncate by opening-and-closing a tmp file in "wb" mode, always
// opening the actual log-we-write-to in "ab" mode, since that
// interacts more politely with external processes that might
// rotate/truncate the file underneath us.
std::FILE *tmp;
if (os::fopen_s(&tmp, fname, trunc_mode)) {
// truncation failed: go straight to the next attempt (without the sleep below)
continue;
}
std::fclose(tmp);
}
// os::fopen_s is treated as successful when it returns a false-like value
if (!os::fopen_s(&fd_, fname, mode)) {
if (event_handlers_.after_open) {
event_handlers_.after_open(filename_, fd_);
}
return;
}
// wait before the next attempt
details::os::sleep_for_millis(open_interval_);
}
throw_spdlog_ex("Failed opening file " + os::filename_to_str(filename_) + " for writing",
errno);
}
// Open the previously used file name again (optionally truncating it).
// Raises spdlog_ex when open() was never called with a file name before.
SPDLOG_INLINE void file_helper::reopen(bool truncate) {
    const bool never_opened = filename_.empty();
    if (never_opened) {
        throw_spdlog_ex("Failed re opening file - was not opened before");
    }
    open(filename_, truncate);
}
// Flush buffered data of the open file to the OS.
// Does nothing when no file is open (mirrors write()). Raises spdlog_ex with the
// current errno when std::fflush reports a failure.
SPDLOG_INLINE void file_helper::flush() {
    // Without this guard std::fflush(nullptr) would flush *every* open output stream of
    // the process instead of this helper's file.
    if (fd_ == nullptr) {
        return;
    }
    if (std::fflush(fd_) != 0) {
        throw_spdlog_ex("Failed flush to file " + os::filename_to_str(filename_), errno);
    }
}
// Ask the OS layer (os::fsync) to sync the file; raises spdlog_ex with the current errno
// when that call reports failure.
SPDLOG_INLINE void file_helper::sync() {
    const bool synced = os::fsync(fd_);
    if (synced) {
        return;
    }
    throw_spdlog_ex("Failed to fsync file " + os::filename_to_str(filename_), errno);
}
// Close the file if one is open; otherwise do nothing.
// The before_close handler sees the still-open stream, the after_close handler runs once
// the stream has been closed and fd_ reset to null.
SPDLOG_INLINE void file_helper::close() {
    if (fd_ == nullptr) {
        return;
    }

    if (event_handlers_.before_close) {
        event_handlers_.before_close(filename_, fd_);
    }

    std::fclose(fd_);
    fd_ = nullptr;

    if (event_handlers_.after_close) {
        event_handlers_.after_close(filename_);
    }
}
// Write the whole buffer to the open file. Silently does nothing when no file is open.
// Raises spdlog_ex with the current errno when the write fails.
SPDLOG_INLINE void file_helper::write(const memory_buf_t &buf) {
    if (fd_ == nullptr) {
        return;
    }
    const bool written = details::os::fwrite_bytes(buf.data(), buf.size(), fd_);
    if (!written) {
        throw_spdlog_ex("Failed writing to file " + os::filename_to_str(filename_), errno);
    }
}
// Return the size of the open file as reported by os::filesize.
// Raises spdlog_ex when no file is open.
SPDLOG_INLINE size_t file_helper::size() const {
    if (fd_ != nullptr) {
        return os::filesize(fd_);
    }
    throw_spdlog_ex("Cannot use size() on closed file " + os::filename_to_str(filename_));
}
// Return the file name passed to the most recent open() (empty if open() was never called).
SPDLOG_INLINE const filename_t &file_helper::filename() const { return filename_; }
//
// return file path and its extension:
//
// "mylog.txt" => ("mylog", ".txt")
// "mylog" => ("mylog", "")
// "mylog." => ("mylog.", "")
// "/dir1/dir2/mylog.txt" => ("/dir1/dir2/mylog", ".txt")
//
// the starting dot in filenames is ignored (hidden files):
//
// ".mylog" => (".mylog", "")
// "my_folder/.mylog" => ("my_folder/.mylog", "")
// "my_folder/.mylog.txt" => ("my_folder/.mylog", ".txt")
SPDLOG_INLINE std::tuple<filename_t, filename_t> file_helper::split_by_extension(
    const filename_t &fname) {
    // result used whenever there is no valid extension: whole path + empty extension
    const auto whole_path = [&fname] { return std::make_tuple(fname, filename_t()); };

    const auto dot_pos = fname.rfind('.');
    const bool dot_missing = (dot_pos == filename_t::npos);
    // no dot, a leading dot or a trailing dot do not form an extension
    if (dot_missing || dot_pos == 0 || dot_pos == fname.size() - 1) {
        return whole_path();
    }

    // treat cases like "/etc/rc.d/somelogfile" or "/abc/.hiddenfile": the last dot lies
    // before the last folder separator, or directly after it
    const auto sep_pos = fname.find_last_of(details::os::folder_seps_filename);
    const bool has_separator = (sep_pos != filename_t::npos);
    if (has_separator && sep_pos >= dot_pos - 1) {
        return whole_path();
    }

    // finally - return a valid base and extension tuple
    return std::make_tuple(fname.substr(0, dot_pos), fname.substr(dot_pos));
}
} // namespace details
} // namespace spdlog

View File

@@ -0,0 +1,61 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include <spdlog/common.h>
#include <tuple>
namespace spdlog {
namespace details {
// Helper class for file sinks.
// When failing to open a file, retry several times(5) with a delay interval(10 ms).
// Throw spdlog_ex exception on errors.
// Owns a single std::FILE stream; the class is non-copyable and closes the stream in its
// destructor.
class SPDLOG_API file_helper {
public:
file_helper() = default;
// keep a copy of the handlers to call around open/close
explicit file_helper(const file_event_handlers &event_handlers);
file_helper(const file_helper &) = delete;
file_helper &operator=(const file_helper &) = delete;
~file_helper();
// open fname for appending (emptying it first when truncate is true)
void open(const filename_t &fname, bool truncate = false);
// open the previously opened file name again
void reopen(bool truncate);
void flush();
void sync();
// close the file if one is open
void close();
// write the buffer to the file; does nothing when no file is open
void write(const memory_buf_t &buf);
// size of the open file; raises spdlog_ex when no file is open
size_t size() const;
const filename_t &filename() const;
//
// return file path and its extension:
//
// "mylog.txt" => ("mylog", ".txt")
// "mylog" => ("mylog", "")
// "mylog." => ("mylog.", "")
// "/dir1/dir2/mylog.txt" => ("/dir1/dir2/mylog", ".txt")
//
// the starting dot in filenames is ignored (hidden files):
//
// ".mylog" => (".mylog", "")
// "my_folder/.mylog" => ("my_folder/.mylog", "")
// "my_folder/.mylog.txt" => ("my_folder/.mylog", ".txt")
static std::tuple<filename_t, filename_t> split_by_extension(const filename_t &fname);
private:
const int open_tries_ = 5; // number of open attempts
const unsigned int open_interval_ = 10; // delay between attempts, in milliseconds
std::FILE *fd_{nullptr}; // null while no file is open
filename_t filename_;
file_event_handlers event_handlers_;
};
} // namespace details
} // namespace spdlog
#ifdef SPDLOG_HEADER_ONLY
#include "file_helper-inl.h"
#endif

View File

@@ -0,0 +1,141 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include <chrono>
#include <iterator>
#include <spdlog/common.h>
#include <spdlog/fmt/fmt.h>
#include <type_traits>
#ifdef SPDLOG_USE_STD_FORMAT
#include <charconv>
#include <limits>
#endif
// Some fmt helpers to efficiently format and pad ints and strings
namespace spdlog {
namespace details {
namespace fmt_helper {
// Append every character referenced by `view` to the end of `dest`.
inline void append_string_view(spdlog::string_view_t view, memory_buf_t &dest) {
    const auto *first = view.data();
    const auto *last = first + view.size();
    dest.append(first, last);
}
#ifdef SPDLOG_USE_STD_FORMAT
template <typename T>
inline void append_int(T n, memory_buf_t &dest) {
// Buffer should be large enough to hold all digits (digits10 + 1) and a sign
SPDLOG_CONSTEXPR const auto BUF_SIZE = std::numeric_limits<T>::digits10 + 2;
char buf[BUF_SIZE];
auto [ptr, ec] = std::to_chars(buf, buf + BUF_SIZE, n, 10);
if (ec == std::errc()) {
dest.append(buf, ptr);
} else {
throw_spdlog_ex("Failed to format int", static_cast<int>(ec));
}
}
#else
template <typename T>
inline void append_int(T n, memory_buf_t &dest) {
fmt::format_int i(n);
dest.append(i.data(), i.data() + i.size());
}
#endif
// Count the decimal digits of `n` (a value below 10 yields 1).
// Adapted from fmt:
// https://github.com/fmtlib/fmt/blob/8.0.1/include/fmt/format.h#L899-L912
template <typename T>
SPDLOG_CONSTEXPR_FUNC unsigned int count_digits_fallback(T n) {
    unsigned int digits = 1;
    // Integer division is slow, so strip four digits per division rather than
    // one (idea from Alexandrescu's talk "Three Optimization Tips for C++").
    while (!(n < 10000)) {
        n /= 10000u;
        digits += 4;
    }
    // Fewer than five digits remain; classify the remainder directly.
    if (n < 10) return digits;
    if (n < 100) return digits + 1;
    if (n < 1000) return digits + 2;
    return digits + 3;
}
// Return the number of decimal digits of n.
// n is first cast to an unsigned type: uint64_t when T is wider than uint32_t,
// uint32_t otherwise.
template <typename T>
inline unsigned int count_digits(T n) {
using count_type =
typename std::conditional<(sizeof(T) > sizeof(uint32_t)), uint64_t, uint32_t>::type;
#ifdef SPDLOG_USE_STD_FORMAT
// std::format build: use the local fallback rather than fmt's internal helper.
return count_digits_fallback(static_cast<count_type>(n));
#else
// The namespace token is chosen by the preprocessor in the middle of the expression.
return static_cast<unsigned int>(fmt::
// fmt 7.0.0 renamed the internal namespace to detail.
// See: https://github.com/fmtlib/fmt/issues/1538
#if FMT_VERSION < 70000
internal
#else
detail
#endif
::count_digits(static_cast<count_type>(n)));
#endif
}
// Append `n` to `dest` as at least two characters, zero-padded on the left.
inline void pad2(int n, memory_buf_t &dest) {
    const bool fits_two_digits = (n >= 0) && (n < 100);
    if (!fits_two_digits) {
        // unlikely, but just in case, let fmt deal with it
        fmt_lib::format_to(std::back_inserter(dest), SPDLOG_FMT_STRING("{:02}"), n);
        return;
    }
    // 0-99: emit the tens and ones digits directly
    const char tens = static_cast<char>('0' + n / 10);
    const char ones = static_cast<char>('0' + n % 10);
    dest.push_back(tens);
    dest.push_back(ones);
}
// Append unsigned `n` to `dest`, left-padded with '0' up to `width` characters.
// Values that already have `width` or more digits are appended without padding.
template <typename T>
inline void pad_uint(T n, unsigned int width, memory_buf_t &dest) {
    static_assert(std::is_unsigned<T>::value, "pad_uint must get unsigned T");
    auto emitted = count_digits(n);
    while (emitted < width) {
        dest.push_back('0');
        emitted++;
    }
    append_int(n, dest);
}
// Append unsigned `n` to `dest` as three zero-padded digits.
// Values of 1000 and above are appended as-is.
template <typename T>
inline void pad3(T n, memory_buf_t &dest) {
    static_assert(std::is_unsigned<T>::value, "pad3 must get unsigned T");
    if (!(n < 1000)) {
        append_int(n, dest);
        return;
    }
    const auto hundreds = n / 100;
    const auto below_hundred = n % 100;
    dest.push_back(static_cast<char>(hundreds + '0'));
    dest.push_back(static_cast<char>((below_hundred / 10) + '0'));
    dest.push_back(static_cast<char>((below_hundred % 10) + '0'));
}
// Append `n` to `dest`, zero-padded to six characters (see pad_uint).
template <typename T>
inline void pad6(T n, memory_buf_t &dest) {
    const unsigned int width = 6;
    pad_uint(n, width, dest);
}
// Append `n` to `dest`, zero-padded to nine characters (see pad_uint).
template <typename T>
inline void pad9(T n, memory_buf_t &dest) {
    const unsigned int width = 9;
    pad_uint(n, width, dest);
}
// Return the sub-second part of the given time_point, expressed in ToDuration.
// e.g.
// time_fraction<std::chrono::milliseconds>(tp) -> the millisecond part of the second
template <typename ToDuration>
inline ToDuration time_fraction(log_clock::time_point tp) {
    namespace chr = std::chrono;
    const auto since_epoch = tp.time_since_epoch();
    // truncate to whole seconds, then subtract them from the full duration
    const auto whole_seconds = chr::duration_cast<chr::seconds>(since_epoch);
    return chr::duration_cast<ToDuration>(since_epoch) -
           chr::duration_cast<ToDuration>(whole_seconds);
}
} // namespace fmt_helper
} // namespace details
} // namespace spdlog

View File

@@ -0,0 +1,44 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#ifndef SPDLOG_HEADER_ONLY
#include <spdlog/details/log_msg.h>
#endif
#include <spdlog/details/os.h>
namespace spdlog {
namespace details {
// Full constructor: stores the given timestamp, source location, logger name,
// level and message. The thread id is captured via os::thread_id() unless
// SPDLOG_NO_THREAD_ID is defined.
SPDLOG_INLINE log_msg::log_msg(spdlog::log_clock::time_point log_time,
spdlog::source_loc loc,
string_view_t a_logger_name,
spdlog::level::level_enum lvl,
spdlog::string_view_t msg)
: logger_name(a_logger_name),
level(lvl),
time(log_time)
#ifndef SPDLOG_NO_THREAD_ID
,
thread_id(os::thread_id())
#endif
,
source(loc),
payload(msg) {
}
// Same as above, timestamped with os::now().
SPDLOG_INLINE log_msg::log_msg(spdlog::source_loc loc,
string_view_t a_logger_name,
spdlog::level::level_enum lvl,
spdlog::string_view_t msg)
: log_msg(os::now(), loc, a_logger_name, lvl, msg) {}
// Same as above, timestamped with os::now() and with a default-constructed source_loc.
SPDLOG_INLINE log_msg::log_msg(string_view_t a_logger_name,
spdlog::level::level_enum lvl,
spdlog::string_view_t msg)
: log_msg(os::now(), source_loc{}, a_logger_name, lvl, msg) {}
} // namespace details
} // namespace spdlog

View File

@@ -0,0 +1,40 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include <spdlog/common.h>
#include <string>
namespace spdlog {
namespace details {
// A single log record: logger name, level, timestamp, thread id, source location
// and message payload. logger_name and payload are string views, so this struct
// holds views of the text rather than a buffer of its own.
struct SPDLOG_API log_msg {
log_msg() = default;
log_msg(log_clock::time_point log_time,
source_loc loc,
string_view_t logger_name,
level::level_enum lvl,
string_view_t msg);
log_msg(source_loc loc, string_view_t logger_name, level::level_enum lvl, string_view_t msg);
log_msg(string_view_t logger_name, level::level_enum lvl, string_view_t msg);
log_msg(const log_msg &other) = default;
log_msg &operator=(const log_msg &other) = default;
string_view_t logger_name;
// defaults to level::off
level::level_enum level{level::off};
log_clock::time_point time;
// defaults to 0
size_t thread_id{0};
// wrapping the formatted text with color (updated by pattern_formatter).
// mutable so they can be updated through a const log_msg.
mutable size_t color_range_start{0};
mutable size_t color_range_end{0};
source_loc source;
string_view_t payload;
};
} // namespace details
} // namespace spdlog
#ifdef SPDLOG_HEADER_ONLY
#include "log_msg-inl.h"
#endif

View File

@@ -0,0 +1,54 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#ifndef SPDLOG_HEADER_ONLY
#include <spdlog/details/log_msg_buffer.h>
#endif
namespace spdlog {
namespace details {
// Construct from a log_msg: copy the logger name and payload characters into the
// internal buffer (name first, payload right after), then repoint the views at it.
SPDLOG_INLINE log_msg_buffer::log_msg_buffer(const log_msg &orig_msg)
: log_msg{orig_msg} {
buffer.append(logger_name.begin(), logger_name.end());
buffer.append(payload.begin(), payload.end());
update_string_views();
}
// Copy constructor: same procedure, reading the text through the views copied from `other`.
SPDLOG_INLINE log_msg_buffer::log_msg_buffer(const log_msg_buffer &other)
: log_msg{other} {
buffer.append(logger_name.begin(), logger_name.end());
buffer.append(payload.begin(), payload.end());
update_string_views();
}
// Move constructor: the log_msg fields are copied, the buffer is moved from `other`,
// and the views are repointed at this object's buffer.
SPDLOG_INLINE log_msg_buffer::log_msg_buffer(log_msg_buffer &&other) SPDLOG_NOEXCEPT
: log_msg{other},
buffer{std::move(other.buffer)} {
update_string_views();
}
// Copy assignment: copy the log_msg fields and the buffered characters of `other`,
// then repoint this object's views at its own buffer.
// Returns *this.
SPDLOG_INLINE log_msg_buffer &log_msg_buffer::operator=(const log_msg_buffer &other) {
    // Self-assignment must be a no-op: clearing the buffer first would leave nothing
    // to copy from, and the views would then refer to bytes past the buffer's size.
    if (this == &other) {
        return *this;
    }
    log_msg::operator=(other);
    buffer.clear();
    buffer.append(other.buffer.data(), other.buffer.data() + other.buffer.size());
    // the views copied from `other` still point into other's buffer; repoint them
    update_string_views();
    return *this;
}
// Move assignment: copy the log_msg fields, take over other's buffer, then repoint
// the string views at this object's buffer. Returns *this.
SPDLOG_INLINE log_msg_buffer &log_msg_buffer::operator=(log_msg_buffer &&other) SPDLOG_NOEXCEPT {
log_msg::operator=(other);
buffer = std::move(other.buffer);
update_string_views();
return *this;
}
// Repoint logger_name and payload at the internal buffer, keeping their current
// lengths: the name occupies the start of the buffer and the payload follows it.
SPDLOG_INLINE void log_msg_buffer::update_string_views() {
    const auto name_len = logger_name.size();
    const auto payload_len = payload.size();
    auto *base = buffer.data();
    logger_name = string_view_t{base, name_len};
    payload = string_view_t{base + name_len, payload_len};
}
} // namespace details
} // namespace spdlog

View File

@@ -0,0 +1,32 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include <spdlog/details/log_msg.h>
namespace spdlog {
namespace details {
// Extend log_msg with an internal buffer to store its logger name and payload.
// This is needed since log_msg holds string_views that point to stack data.
class SPDLOG_API log_msg_buffer : public log_msg {
// storage for the logger name and payload characters
memory_buf_t buffer;
// repoint logger_name and payload at `buffer`
void update_string_views();
public:
log_msg_buffer() = default;
explicit log_msg_buffer(const log_msg &orig_msg);
log_msg_buffer(const log_msg_buffer &other);
log_msg_buffer(log_msg_buffer &&other) SPDLOG_NOEXCEPT;
log_msg_buffer &operator=(const log_msg_buffer &other);
log_msg_buffer &operator=(log_msg_buffer &&other) SPDLOG_NOEXCEPT;
};
} // namespace details
} // namespace spdlog
#ifdef SPDLOG_HEADER_ONLY
#include "log_msg_buffer-inl.h"
#endif

View File

@@ -0,0 +1,177 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
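// multi producer-multi consumer blocking queue.
// enqueue(..) - will block until room is found to put the new message.
// enqueue_nowait(..) - enqueue immediately. overruns the oldest message if no
// room is left.
// enqueue_if_have_room(..) - enqueue only if there is room; otherwise the message
// is dropped and the discard counter is incremented.
// dequeue_for(..) - will block until the queue is not empty or the timeout has
// passed.
// dequeue(..) - will block until the queue is not empty.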
#include <spdlog/details/circular_q.h>
#include <atomic>
#include <condition_variable>
#include <mutex>
namespace spdlog {
namespace details {
// Blocking queue built on circular_q<T>, guarded by a single mutex.
// push_cv_ is notified after an item is pushed (consumers wait on it);
// pop_cv_ is notified after an item is popped (blocking producers wait on it).
template <typename T>
class mpmc_blocking_queue {
public:
using item_type = T;
// max_items is forwarded to the underlying circular_q.
explicit mpmc_blocking_queue(size_t max_items)
: q_(max_items) {}
#ifndef __MINGW32__
// Non-mingw variant: the mutex is released (inner scope ends) before notifying.
// try to enqueue and block if no room left
void enqueue(T &&item) {
{
std::unique_lock<std::mutex> lock(queue_mutex_);
pop_cv_.wait(lock, [this] { return !this->q_.full(); });
q_.push_back(std::move(item));
}
push_cv_.notify_one();
}
// enqueue immediately. overrun oldest message in the queue if no room left.
void enqueue_nowait(T &&item) {
{
std::unique_lock<std::mutex> lock(queue_mutex_);
q_.push_back(std::move(item));
}
push_cv_.notify_one();
}
// enqueue only if the queue is not full; otherwise drop the item and
// increment the discard counter.
void enqueue_if_have_room(T &&item) {
bool pushed = false;
{
std::unique_lock<std::mutex> lock(queue_mutex_);
if (!q_.full()) {
q_.push_back(std::move(item));
pushed = true;
}
}
if (pushed) {
push_cv_.notify_one();
} else {
++discard_counter_;
}
}
// dequeue with a timeout.
// Return true, if succeeded dequeue item, false otherwise
bool dequeue_for(T &popped_item, std::chrono::milliseconds wait_duration) {
{
std::unique_lock<std::mutex> lock(queue_mutex_);
if (!push_cv_.wait_for(lock, wait_duration, [this] { return !this->q_.empty(); })) {
return false;
}
popped_item = std::move(q_.front());
q_.pop_front();
}
pop_cv_.notify_one();
return true;
}
// blocking dequeue without a timeout.
void dequeue(T &popped_item) {
{
std::unique_lock<std::mutex> lock(queue_mutex_);
push_cv_.wait(lock, [this] { return !this->q_.empty(); });
popped_item = std::move(q_.front());
q_.pop_front();
}
pop_cv_.notify_one();
}
#else
// apparently mingw deadlocks if the mutex is released before cv.notify_one(),
// so release the mutex at the very end each function.
// try to enqueue and block if no room left
void enqueue(T &&item) {
std::unique_lock<std::mutex> lock(queue_mutex_);
pop_cv_.wait(lock, [this] { return !this->q_.full(); });
q_.push_back(std::move(item));
push_cv_.notify_one();
}
// enqueue immediately. overrun oldest message in the queue if no room left.
void enqueue_nowait(T &&item) {
std::unique_lock<std::mutex> lock(queue_mutex_);
q_.push_back(std::move(item));
push_cv_.notify_one();
}
// enqueue only if the queue is not full; otherwise drop the item and
// increment the discard counter.
void enqueue_if_have_room(T &&item) {
bool pushed = false;
std::unique_lock<std::mutex> lock(queue_mutex_);
if (!q_.full()) {
q_.push_back(std::move(item));
pushed = true;
}
if (pushed) {
push_cv_.notify_one();
} else {
++discard_counter_;
}
}
// dequeue with a timeout.
// Return true, if succeeded dequeue item, false otherwise
bool dequeue_for(T &popped_item, std::chrono::milliseconds wait_duration) {
std::unique_lock<std::mutex> lock(queue_mutex_);
if (!push_cv_.wait_for(lock, wait_duration, [this] { return !this->q_.empty(); })) {
return false;
}
popped_item = std::move(q_.front());
q_.pop_front();
pop_cv_.notify_one();
return true;
}
// blocking dequeue without a timeout.
void dequeue(T &popped_item) {
std::unique_lock<std::mutex> lock(queue_mutex_);
push_cv_.wait(lock, [this] { return !this->q_.empty(); });
popped_item = std::move(q_.front());
q_.pop_front();
pop_cv_.notify_one();
}
#endif
// overrun counter of the underlying circular_q, read under the queue mutex
size_t overrun_counter() {
std::lock_guard<std::mutex> lock(queue_mutex_);
return q_.overrun_counter();
}
// number of items dropped by enqueue_if_have_room (atomic, read without the mutex)
size_t discard_counter() { return discard_counter_.load(std::memory_order_relaxed); }
// current number of queued items, read under the queue mutex
size_t size() {
std::lock_guard<std::mutex> lock(queue_mutex_);
return q_.size();
}
void reset_overrun_counter() {
std::lock_guard<std::mutex> lock(queue_mutex_);
q_.reset_overrun_counter();
}
void reset_discard_counter() { discard_counter_.store(0, std::memory_order_relaxed); }
private:
// guards q_
std::mutex queue_mutex_;
// signalled after a push
std::condition_variable push_cv_;
// signalled after a pop
std::condition_variable pop_cv_;
spdlog::details::circular_q<T> q_;
// incremented when enqueue_if_have_room finds the queue full
std::atomic<size_t> discard_counter_{0};
};
} // namespace details
} // namespace spdlog

View File

@@ -0,0 +1,35 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include <atomic>
#include <utility>
// null, no cost dummy "mutex" and dummy "atomic" int
namespace spdlog {
namespace details {
// Dummy "mutex": lock() and unlock() do nothing, so guarding with it costs nothing.
struct null_mutex {
    // no-op
    void lock() const {
    }

    // no-op
    void unlock() const {
    }
};
// Dummy "atomic" int: same load/store/exchange surface as std::atomic<int>, but
// backed by a plain int with no synchronization. The memory_order arguments are
// accepted and ignored.
struct null_atomic_int {
    // Zero-initialized so that load() on a default-constructed object is well
    // defined instead of reading an indeterminate value.
    int value{0};

    null_atomic_int() = default;

    explicit null_atomic_int(int new_value)
        : value(new_value) {}

    // Return the current value.
    int load(std::memory_order = std::memory_order_relaxed) const { return value; }

    // Replace the current value with new_value.
    void store(int new_value, std::memory_order = std::memory_order_relaxed) { value = new_value; }

    // Replace the current value with new_value and return the previous value.
    int exchange(int new_value, std::memory_order = std::memory_order_relaxed) {
        std::swap(new_value, value);
        return new_value;  // return value before the call
    }
};
} // namespace details
} // namespace spdlog

View File

@@ -0,0 +1,572 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#ifndef SPDLOG_HEADER_ONLY
#include <spdlog/details/os.h>
#endif
#include <spdlog/common.h>
#include <algorithm>
#include <array>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <string>
#include <sys/stat.h>
#include <sys/types.h>
#include <thread>
#ifdef _WIN32
#include <spdlog/details/windows_include.h>
#include <io.h> // for _get_osfhandle, _isatty, _fileno
#include <process.h> // for _get_pid
#ifdef __MINGW32__
#include <share.h>
#endif
#if defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT) || defined(SPDLOG_WCHAR_FILENAMES)
#include <cassert>
#include <limits>
#endif
#include <direct.h> // for _mkdir/_wmkdir
#else // unix
#include <fcntl.h>
#include <unistd.h>
#ifdef __linux__
#include <sys/syscall.h> //Use gettid() syscall under linux to get thread id
#elif defined(_AIX)
#include <pthread.h> // for pthread_getthrds_np
#elif defined(__DragonFly__) || defined(__FreeBSD__)
#include <pthread_np.h> // for pthread_getthreadid_np
#elif defined(__NetBSD__)
#include <lwp.h> // for _lwp_self
#elif defined(__sun)
#include <thread.h> // for thr_self
#endif
#endif // unix
#if defined __APPLE__
#include <AvailabilityMacros.h>
#endif
#ifndef __has_feature // Clang - feature checking macros.
#define __has_feature(x) 0 // Compatibility with non-clang compilers.
#endif
namespace spdlog {
namespace details {
namespace os {
// Return the current time as a log_clock time_point.
// On linux with SPDLOG_CLOCK_COARSE defined, the time is read with
// clock_gettime(CLOCK_REALTIME_COARSE); otherwise log_clock::now() is used.
SPDLOG_INLINE spdlog::log_clock::time_point now() SPDLOG_NOEXCEPT {
#if defined __linux__ && defined SPDLOG_CLOCK_COARSE
    using clock_duration = typename log_clock::duration;
    timespec ts;
    ::clock_gettime(CLOCK_REALTIME_COARSE, &ts);
    const auto since_epoch =
        std::chrono::seconds(ts.tv_sec) + std::chrono::nanoseconds(ts.tv_nsec);
    return std::chrono::time_point<log_clock, clock_duration>(
        std::chrono::duration_cast<clock_duration>(since_epoch));
#else
    return log_clock::now();
#endif
}
// Convert time_tt to broken-down local time using the reentrant platform call
// (localtime_s on Windows, localtime_r elsewhere). The call's result is not checked.
SPDLOG_INLINE std::tm localtime(const std::time_t &time_tt) SPDLOG_NOEXCEPT {
    std::tm broken_down;
#ifdef _WIN32
    ::localtime_s(&broken_down, &time_tt);
#else
    ::localtime_r(&time_tt, &broken_down);
#endif
    return broken_down;
}
// Broken-down local time for the current moment (::time(nullptr)).
SPDLOG_INLINE std::tm localtime() SPDLOG_NOEXCEPT {
    const std::time_t current = ::time(nullptr);
    return localtime(current);
}
// Convert time_tt to broken-down UTC time using the reentrant platform call
// (gmtime_s on Windows, gmtime_r elsewhere). The call's result is not checked.
SPDLOG_INLINE std::tm gmtime(const std::time_t &time_tt) SPDLOG_NOEXCEPT {
    std::tm broken_down;
#ifdef _WIN32
    ::gmtime_s(&broken_down, &time_tt);
#else
    ::gmtime_r(&time_tt, &broken_down);
#endif
    return broken_down;
}
// Broken-down UTC time for the current moment (::time(nullptr)).
SPDLOG_INLINE std::tm gmtime() SPDLOG_NOEXCEPT {
    const std::time_t current = ::time(nullptr);
    return gmtime(current);
}
// Portable fopen_s-style helper: open `filename` with stdio `mode` and store the
// resulting stream in *fp.
// Returns true on FAILURE (no stream was opened; *fp is nullptr) and false on success.
// When SPDLOG_PREVENT_CHILD_FD is defined, the file is opened so that the handle is
// not inherited by child processes.
SPDLOG_INLINE bool fopen_s(FILE **fp, const filename_t &filename, const filename_t &mode) {
#ifdef _WIN32
    #ifdef SPDLOG_WCHAR_FILENAMES
    *fp = ::_wfsopen((filename.c_str()), mode.c_str(), _SH_DENYNO);
    #else
    *fp = ::_fsopen((filename.c_str()), mode.c_str(), _SH_DENYNO);
    #endif
    #if defined(SPDLOG_PREVENT_CHILD_FD)
    if (*fp != nullptr) {
        auto file_handle = reinterpret_cast<HANDLE>(_get_osfhandle(::_fileno(*fp)));
        // clear the inherit flag; treat failure to do so as a failed open
        if (!::SetHandleInformation(file_handle, HANDLE_FLAG_INHERIT, 0)) {
            ::fclose(*fp);
            *fp = nullptr;
        }
    }
    #endif
#else  // unix
    #if defined(SPDLOG_PREVENT_CHILD_FD)
    // "ab" appends; any other mode truncates
    const int mode_flag = mode == SPDLOG_FILENAME_T("ab") ? O_APPEND : O_TRUNC;
    const int fd =
        ::open((filename.c_str()), O_CREAT | O_WRONLY | O_CLOEXEC | mode_flag, mode_t(0644));
    if (fd == -1) {
        // keep the "failure => *fp is nullptr" contract of every other path
        *fp = nullptr;
        return true;
    }
    *fp = ::fdopen(fd, mode.c_str());
    if (*fp == nullptr) {
        // fdopen failed: the descriptor is still ours to close
        ::close(fd);
    }
    #else
    *fp = ::fopen((filename.c_str()), mode.c_str());
    #endif
#endif
    return *fp == nullptr;
}
// Delete the file named `filename` and return the result of the underlying call:
// _wremove for wide-char filenames on Windows, std::remove otherwise.
SPDLOG_INLINE int remove(const filename_t &filename) SPDLOG_NOEXCEPT {
#if defined(_WIN32) && defined(SPDLOG_WCHAR_FILENAMES)
return ::_wremove(filename.c_str());
#else
return std::remove(filename.c_str());
#endif
}
// Remove `filename` if it exists. Returns 0 when the path does not exist,
// otherwise the result of remove().
SPDLOG_INLINE int remove_if_exists(const filename_t &filename) SPDLOG_NOEXCEPT {
    if (!path_exists(filename)) {
        return 0;
    }
    return remove(filename);
}
// Rename filename1 to filename2 and return the result of the underlying call:
// _wrename for wide-char filenames on Windows, std::rename otherwise.
SPDLOG_INLINE int rename(const filename_t &filename1, const filename_t &filename2) SPDLOG_NOEXCEPT {
#if defined(_WIN32) && defined(SPDLOG_WCHAR_FILENAMES)
return ::_wrename(filename1.c_str(), filename2.c_str());
#else
return std::rename(filename1.c_str(), filename2.c_str());
#endif
}
// Return true if path exists (file or directory)
// Implemented as "the platform stat call on filename succeeded".
SPDLOG_INLINE bool path_exists(const filename_t &filename) SPDLOG_NOEXCEPT {
#ifdef _WIN32
struct _stat buffer;
#ifdef SPDLOG_WCHAR_FILENAMES
return (::_wstat(filename.c_str(), &buffer) == 0);
#else
return (::_stat(filename.c_str(), &buffer) == 0);
#endif
#else // common linux/unix all have the stat system call
struct stat buffer;
return (::stat(filename.c_str(), &buffer) == 0);
#endif
}
#ifdef _MSC_VER
// avoid warning about unreachable statement at the end of filesize()
#pragma warning(push)
#pragma warning(disable : 4702)
#endif
// Return file size according to open FILE* object
// Throws (via throw_spdlog_ex) when f is null or when the platform size query fails.
SPDLOG_INLINE size_t filesize(FILE *f) {
if (f == nullptr) {
throw_spdlog_ex("Failed getting file size. fd is null");
}
#if defined(_WIN32) && !defined(__CYGWIN__)
int fd = ::_fileno(f);
#if defined(_WIN64) // 64 bits
__int64 ret = ::_filelengthi64(fd);
if (ret >= 0) {
return static_cast<size_t>(ret);
}
#else // windows 32 bits
long ret = ::_filelength(fd);
if (ret >= 0) {
return static_cast<size_t>(ret);
}
#endif
#else // unix
// OpenBSD and AIX doesn't compile with :: before the fileno(..)
#if defined(__OpenBSD__) || defined(_AIX)
int fd = fileno(f);
#else
int fd = ::fileno(f);
#endif
// 64 bits(but not in osx, linux/musl or cygwin, where fstat64 is deprecated)
#if ((defined(__linux__) && defined(__GLIBC__)) || defined(__sun) || defined(_AIX)) && \
(defined(__LP64__) || defined(_LP64))
struct stat64 st;
if (::fstat64(fd, &st) == 0) {
return static_cast<size_t>(st.st_size);
}
#else // other unix or linux 32 bits or cygwin
struct stat st;
if (::fstat(fd, &st) == 0) {
return static_cast<size_t>(st.st_size);
}
#endif
#endif
// every successful branch above has already returned; reaching here means failure
throw_spdlog_ex("Failed getting file size from fd", errno);
return 0; // will not be reached.
}
#ifdef _MSC_VER
#pragma warning(pop)
#endif
// Return utc offset in minutes or throw spdlog_ex on failure
// Compiled only when SPDLOG_NO_TZ_OFFSET is not defined.
#if !defined(SPDLOG_NO_TZ_OFFSET)
SPDLOG_INLINE int utc_minutes_offset(const std::tm &tm) {
#ifdef _WIN32
#if _WIN32_WINNT < _WIN32_WINNT_WS08
TIME_ZONE_INFORMATION tzinfo;
auto rv = ::GetTimeZoneInformation(&tzinfo);
#else
DYNAMIC_TIME_ZONE_INFORMATION tzinfo;
auto rv = ::GetDynamicTimeZoneInformation(&tzinfo);
#endif
if (rv == TIME_ZONE_ID_INVALID) throw_spdlog_ex("Failed getting timezone info. ", errno);
// The bias fields are negated/subtracted to obtain the offset returned here.
int offset = -tzinfo.Bias;
// NOTE(review): any non-zero tm_isdst (including a negative value) selects the daylight bias.
if (tm.tm_isdst) {
offset -= tzinfo.DaylightBias;
} else {
offset -= tzinfo.StandardBias;
}
return offset;
#else
// tm_gmtoff is in seconds; convert to minutes
auto offset_seconds = tm.tm_gmtoff;
return static_cast<int>(offset_seconds / 60);
#endif
}
#endif // SPDLOG_NO_TZ_OFFSET
// Return current thread id as size_t
// It exists because the std::this_thread::get_id() is much slower(especially
// under VS 2013)
// Each supported platform uses its own native call; platforms not listed fall back
// to hashing std::this_thread::get_id().
SPDLOG_INLINE size_t _thread_id() SPDLOG_NOEXCEPT {
#ifdef _WIN32
return static_cast<size_t>(::GetCurrentThreadId());
#elif defined(__linux__)
#if defined(__ANDROID__) && defined(__ANDROID_API__) && (__ANDROID_API__ < 21)
#define SYS_gettid __NR_gettid
#endif
return static_cast<size_t>(::syscall(SYS_gettid));
#elif defined(_AIX)
struct __pthrdsinfo buf;
int reg_size = 0;
pthread_t pt = pthread_self();
int retval = pthread_getthrds_np(&pt, PTHRDSINFO_QUERY_TID, &buf, sizeof(buf), NULL, &reg_size);
// a non-zero retval yields a thread id of 0
int tid = (!retval) ? buf.__pi_tid : 0;
return static_cast<size_t>(tid);
#elif defined(__DragonFly__) || defined(__FreeBSD__)
return static_cast<size_t>(::pthread_getthreadid_np());
#elif defined(__NetBSD__)
return static_cast<size_t>(::_lwp_self());
#elif defined(__OpenBSD__)
return static_cast<size_t>(::getthrid());
#elif defined(__sun)
return static_cast<size_t>(::thr_self());
#elif __APPLE__
uint64_t tid;
// There is no pthread_threadid_np prior to Mac OS X 10.6, and it is not supported on any PPC,
// including 10.6.8 Rosetta. __POWERPC__ is Apple-specific define encompassing ppc and ppc64.
#ifdef MAC_OS_X_VERSION_MAX_ALLOWED
{
#if (MAC_OS_X_VERSION_MAX_ALLOWED < 1060) || defined(__POWERPC__)
tid = pthread_mach_thread_np(pthread_self());
#elif MAC_OS_X_VERSION_MIN_REQUIRED < 1060
// use pthread_threadid_np only when its address is non-null
if (&pthread_threadid_np) {
pthread_threadid_np(nullptr, &tid);
} else {
tid = pthread_mach_thread_np(pthread_self());
}
#else
pthread_threadid_np(nullptr, &tid);
#endif
}
#else
pthread_threadid_np(nullptr, &tid);
#endif
return static_cast<size_t>(tid);
#else // Default to standard C++11 (other Unix)
return static_cast<size_t>(std::hash<std::thread::id>()(std::this_thread::get_id()));
#endif
}
// Return the current thread id as size_t.
// Unless SPDLOG_NO_TLS is defined, the id is computed once per thread by
// _thread_id() and then served from a thread_local.
SPDLOG_INLINE size_t thread_id() SPDLOG_NOEXCEPT {
#if defined(SPDLOG_NO_TLS)
    return _thread_id();
#else
    // cache thread id in tls
    static thread_local const size_t cached_tid = _thread_id();
    return cached_tid;
#endif
}
// Sleep for the given number of milliseconds.
// On Windows this calls ::Sleep directly, to avoid an msvc issue in sleep_for that
// happens if the clock changes.
// See https://github.com/gabime/spdlog/issues/609
SPDLOG_INLINE void sleep_for_millis(unsigned int milliseconds) SPDLOG_NOEXCEPT {
#if defined(_WIN32)
    ::Sleep(milliseconds);
#else
    const std::chrono::milliseconds delay(milliseconds);
    std::this_thread::sleep_for(delay);
#endif
}
// wchar support for windows file names (SPDLOG_WCHAR_FILENAMES must be defined)
// Convert a filename_t to std::string.
#if defined(_WIN32) && defined(SPDLOG_WCHAR_FILENAMES)
// Wide-char filenames: convert to UTF-8 through wstr_to_utf8buf.
SPDLOG_INLINE std::string filename_to_str(const filename_t &filename) {
memory_buf_t buf;
wstr_to_utf8buf(filename, buf);
return SPDLOG_BUF_TO_STRING(buf);
}
#else
// Otherwise the filename is returned unchanged.
SPDLOG_INLINE std::string filename_to_str(const filename_t &filename) { return filename; }
#endif
// Return the current process id as int
// (GetCurrentProcessId on Windows, getpid elsewhere).
SPDLOG_INLINE int pid() SPDLOG_NOEXCEPT {
#ifdef _WIN32
return conditional_static_cast<int>(::GetCurrentProcessId());
#else
return conditional_static_cast<int>(::getpid());
#endif
}
// Determine if the terminal supports colors
// Based on: https://github.com/agauniyal/rang/
// Always true on Windows. Elsewhere the answer is computed once (function-local
// static): true if COLORTERM is set, or if TERM contains one of the known names.
SPDLOG_INLINE bool is_color_terminal() SPDLOG_NOEXCEPT {
#ifdef _WIN32
    return true;
#else
    static const bool result = []() {
        // any COLORTERM value counts as color support
        if (std::getenv("COLORTERM") != nullptr) {
            return true;
        }
        const char *env_term_p = std::getenv("TERM");
        if (env_term_p == nullptr) {
            return false;
        }
        static constexpr std::array<const char *, 16> terms = {
            {"ansi", "color", "console", "cygwin", "gnome", "konsole", "kterm", "linux", "msys",
             "putty", "rxvt", "screen", "vt100", "xterm", "alacritty", "vt102"}};
        // substring match: e.g. "xterm-256color" matches "xterm"
        for (const char *known : terms) {
            if (std::strstr(env_term_p, known) != nullptr) {
                return true;
            }
        }
        return false;
    }();
    return result;
#endif
}
// Determine if the given stream is attached to a terminal
// Source: https://github.com/agauniyal/rang/
// Returns the isatty result for the stream's file descriptor (_isatty on Windows).
SPDLOG_INLINE bool in_terminal(FILE *file) SPDLOG_NOEXCEPT {
#ifdef _WIN32
return ::_isatty(_fileno(file)) != 0;
#else
return ::isatty(fileno(file)) != 0;
#endif
}
#if (defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT) || defined(SPDLOG_WCHAR_FILENAMES)) && defined(_WIN32)
// Convert the wide string wstr to UTF-8 and store the result in target.
// An empty input resizes target to 0. Throws (via throw_spdlog_ex) when the input
// is too large or when WideCharToMultiByte fails.
SPDLOG_INLINE void wstr_to_utf8buf(wstring_view_t wstr, memory_buf_t &target) {
// keep (size + 1) * 4 below within int range
if (wstr.size() > static_cast<size_t>((std::numeric_limits<int>::max)()) / 4 - 1) {
throw_spdlog_ex("UTF-16 string is too big to be converted to UTF-8");
}
int wstr_size = static_cast<int>(wstr.size());
if (wstr_size == 0) {
target.resize(0);
return;
}
// Start from the current capacity; ask the API for the exact size only when
// (wstr_size + 1) * 4 bytes would not fit in it.
int result_size = static_cast<int>(target.capacity());
if ((wstr_size + 1) * 4 > result_size) {
result_size =
::WideCharToMultiByte(CP_UTF8, 0, wstr.data(), wstr_size, NULL, 0, NULL, NULL);
}
if (result_size > 0) {
target.resize(result_size);
result_size = ::WideCharToMultiByte(CP_UTF8, 0, wstr.data(), wstr_size, target.data(),
result_size, NULL, NULL);
if (result_size > 0) {
// shrink to the number of bytes actually written
target.resize(result_size);
return;
}
}
throw_spdlog_ex(
fmt_lib::format("WideCharToMultiByte failed. Last error: {}", ::GetLastError()));
}
// Convert the UTF-8 string str to wide characters and store the result in target.
// An empty input resizes target to 0. Throws (via throw_spdlog_ex) when the input
// is too large or when MultiByteToWideChar fails.
SPDLOG_INLINE void utf8_to_wstrbuf(string_view_t str, wmemory_buf_t &target) {
if (str.size() > static_cast<size_t>((std::numeric_limits<int>::max)()) - 1) {
throw_spdlog_ex("UTF-8 string is too big to be converted to UTF-16");
}
int str_size = static_cast<int>(str.size());
if (str_size == 0) {
target.resize(0);
return;
}
// find the size to allocate for the result buffer
int result_size = ::MultiByteToWideChar(CP_UTF8, 0, str.data(), str_size, NULL, 0);
if (result_size > 0) {
target.resize(result_size);
result_size =
::MultiByteToWideChar(CP_UTF8, 0, str.data(), str_size, target.data(), result_size);
if (result_size > 0) {
// the second call is expected to write exactly the size reported by the first
assert(result_size == static_cast<int>(target.size()));
return;
}
}
throw_spdlog_ex(
fmt_lib::format("MultiByteToWideChar failed. Last error: {}", ::GetLastError()));
}
#endif // (defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT) || defined(SPDLOG_WCHAR_FILENAMES)) &&
// defined(_WIN32)
// Create the single directory `path` with the platform mkdir call.
// return true on success
static SPDLOG_INLINE bool mkdir_(const filename_t &path) {
#ifdef _WIN32
#ifdef SPDLOG_WCHAR_FILENAMES
return ::_wmkdir(path.c_str()) == 0;
#else
return ::_mkdir(path.c_str()) == 0;
#endif
#else
// unix: directory mode 0755
return ::mkdir(path.c_str(), mode_t(0755)) == 0;
#endif
}
// create the given directory - and all directories leading to it
// return true on success or if the directory already exists
// return false for an empty path or when creating any component fails
SPDLOG_INLINE bool create_dir(const filename_t &path) {
if (path_exists(path)) {
return true;
}
if (path.empty()) {
return false;
}
// Walk the path one separator at a time, creating each missing prefix in turn.
size_t search_offset = 0;
do {
auto token_pos = path.find_first_of(folder_seps_filename, search_offset);
// treat the entire path as a folder if no folder separator is found
if (token_pos == filename_t::npos) {
token_pos = path.size();
}
auto subdir = path.substr(0, token_pos);
#ifdef _WIN32
// if subdir is just a drive letter, add a slash e.g. "c:"=>"c:\",
// otherwise path_exists(subdir) returns false (issue #3079)
const bool is_drive = subdir.length() == 2 && subdir[1] == ':';
if (is_drive) {
subdir += '\\';
token_pos++;
}
#endif
// empty prefixes (e.g. before a leading separator) are skipped
if (!subdir.empty() && !path_exists(subdir) && !mkdir_(subdir)) {
return false; // return error if failed creating dir
}
search_offset = token_pos + 1;
} while (search_offset < path.size());
return true;
}
// Return the part of `path` before its last folder separator, or an empty
// string when `path` contains no separator:
// "abc/file" => "abc"
// "abc/" => "abc"
// "abc" => ""
// "abc///" => "abc//"
SPDLOG_INLINE filename_t dir_name(const filename_t &path) {
    const auto last_sep = path.find_last_of(folder_seps_filename);
    if (last_sep == filename_t::npos) {
        return filename_t{};
    }
    return path.substr(0, last_sep);
}
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4996)
#endif // _MSC_VER
// Return the value of the environment variable `field`, or an empty string when
// std::getenv reports it as missing. Always returns an empty string under uwp.
std::string SPDLOG_INLINE getenv(const char *field) {
#if defined(_MSC_VER) && defined(WINAPI_FAMILY) && defined(WINAPI_FAMILY_DESKTOP_APP) && \
    (WINAPI_FAMILY != WINAPI_FAMILY_DESKTOP_APP)
    return std::string{};  // not supported under uwp
#else
    const char *value = std::getenv(field);
    if (value == nullptr) {
        return std::string{};
    }
    return std::string(value);
#endif
}
#ifdef _MSC_VER
#pragma warning(pop)
#endif // _MSC_VER
// Do fsync on the given FILE pointer
// (FlushFileBuffers on the underlying handle on Windows, ::fsync on the file
// descriptor elsewhere).
// Return true on success
SPDLOG_INLINE bool fsync(FILE *fp) {
#ifdef _WIN32
return FlushFileBuffers(reinterpret_cast<HANDLE>(_get_osfhandle(_fileno(fp)))) != 0;
#else
return ::fsync(fileno(fp)) == 0;
#endif
}
// Do non-locking fwrite if possible by the os or use the regular locking fwrite
// The non-locking variants are used only when SPDLOG_FWRITE_UNLOCKED is defined.
// Return true on success, i.e. when all n_bytes bytes were written.
SPDLOG_INLINE bool fwrite_bytes(const void *ptr, const size_t n_bytes, FILE *fp) {
#if defined(_WIN32) && defined(SPDLOG_FWRITE_UNLOCKED)
return _fwrite_nolock(ptr, 1, n_bytes, fp) == n_bytes;
#elif defined(SPDLOG_FWRITE_UNLOCKED)
return ::fwrite_unlocked(ptr, 1, n_bytes, fp) == n_bytes;
#else
return std::fwrite(ptr, 1, n_bytes, fp) == n_bytes;
#endif
}
} // namespace os
} // namespace details
} // namespace spdlog

View File

@@ -0,0 +1,127 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include <ctime> // std::time_t
#include <spdlog/common.h>
namespace spdlog {
namespace details {
namespace os {
SPDLOG_API spdlog::log_clock::time_point now() SPDLOG_NOEXCEPT;
SPDLOG_API std::tm localtime(const std::time_t &time_tt) SPDLOG_NOEXCEPT;
SPDLOG_API std::tm localtime() SPDLOG_NOEXCEPT;
SPDLOG_API std::tm gmtime(const std::time_t &time_tt) SPDLOG_NOEXCEPT;
SPDLOG_API std::tm gmtime() SPDLOG_NOEXCEPT;
// eol definition
#if !defined(SPDLOG_EOL)
#ifdef _WIN32
#define SPDLOG_EOL "\r\n"
#else
#define SPDLOG_EOL "\n"
#endif
#endif
SPDLOG_CONSTEXPR static const char *default_eol = SPDLOG_EOL;
// folder separator
#if !defined(SPDLOG_FOLDER_SEPS)
#ifdef _WIN32
#define SPDLOG_FOLDER_SEPS "\\/"
#else
#define SPDLOG_FOLDER_SEPS "/"
#endif
#endif
SPDLOG_CONSTEXPR static const char folder_seps[] = SPDLOG_FOLDER_SEPS;
SPDLOG_CONSTEXPR static const filename_t::value_type folder_seps_filename[] =
SPDLOG_FILENAME_T(SPDLOG_FOLDER_SEPS);
// fopen_s-style open that also works on non-windows systems. Returns true on failure.
SPDLOG_API bool fopen_s(FILE **fp, const filename_t &filename, const filename_t &mode);
// Remove filename. return 0 on success
SPDLOG_API int remove(const filename_t &filename) SPDLOG_NOEXCEPT;
// Remove file if exists. return 0 on success
// Note: Non atomic (might return failure to delete if concurrently deleted by other process/thread)
SPDLOG_API int remove_if_exists(const filename_t &filename) SPDLOG_NOEXCEPT;
SPDLOG_API int rename(const filename_t &filename1, const filename_t &filename2) SPDLOG_NOEXCEPT;
// Return true if path exists (file or directory).
SPDLOG_API bool path_exists(const filename_t &filename) SPDLOG_NOEXCEPT;
// Return file size according to open FILE* object
SPDLOG_API size_t filesize(FILE *f);
// Return utc offset in minutes or throw spdlog_ex on failure
SPDLOG_API int utc_minutes_offset(const std::tm &tm = details::os::localtime());
// Return current thread id as size_t
// It exists because the std::this_thread::get_id() is much slower(especially
// under VS 2013)
SPDLOG_API size_t _thread_id() SPDLOG_NOEXCEPT;
// Return current thread id as size_t (from thread local storage)
SPDLOG_API size_t thread_id() SPDLOG_NOEXCEPT;
// This is to avoid an msvc issue in sleep_for that happens if the clock changes.
// See https://github.com/gabime/spdlog/issues/609
SPDLOG_API void sleep_for_millis(unsigned int milliseconds) SPDLOG_NOEXCEPT;
SPDLOG_API std::string filename_to_str(const filename_t &filename);
SPDLOG_API int pid() SPDLOG_NOEXCEPT;
// Determine if the terminal supports colors
// Source: https://github.com/agauniyal/rang/
SPDLOG_API bool is_color_terminal() SPDLOG_NOEXCEPT;
// Determine if the terminal attached
// Source: https://github.com/agauniyal/rang/
SPDLOG_API bool in_terminal(FILE *file) SPDLOG_NOEXCEPT;
#if (defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT) || defined(SPDLOG_WCHAR_FILENAMES)) && defined(_WIN32)
SPDLOG_API void wstr_to_utf8buf(wstring_view_t wstr, memory_buf_t &target);
SPDLOG_API void utf8_to_wstrbuf(string_view_t str, wmemory_buf_t &target);
#endif
// Return directory name from given path or empty string
// "abc/file" => "abc"
// "abc/" => "abc"
// "abc" => ""
// "abc///" => "abc//"
SPDLOG_API filename_t dir_name(const filename_t &path);
// Create a dir from the given path.
// Return true if succeeded or if this dir already exists.
SPDLOG_API bool create_dir(const filename_t &path);
// non thread safe, cross platform getenv/getenv_s
// return empty string if field not found
SPDLOG_API std::string getenv(const char *field);
// Do fsync by FILE objectpointer.
// Return true on success.
SPDLOG_API bool fsync(FILE *fp);
// Do non-locking fwrite if possible by the os or use the regular locking fwrite
// Return true on success.
SPDLOG_API bool fwrite_bytes(const void *ptr, const size_t n_bytes, FILE *fp);
} // namespace os
} // namespace details
} // namespace spdlog
#ifdef SPDLOG_HEADER_ONLY
#include "os-inl.h"
#endif

View File

@@ -0,0 +1,26 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#ifndef SPDLOG_HEADER_ONLY
#include <spdlog/details/periodic_worker.h>
#endif
namespace spdlog {
namespace details {
// Signal the worker thread to stop and wait for it to exit.
// Nothing to do when no thread was started (or it is otherwise not joinable).
SPDLOG_INLINE periodic_worker::~periodic_worker() {
    if (!worker_thread_.joinable()) {
        return;
    }
    {
        // flip the flag under the mutex so the waiting thread observes it reliably
        std::lock_guard<std::mutex> guard(mutex_);
        active_ = false;
    }
    cv_.notify_one();
    worker_thread_.join();
}
} // namespace details
} // namespace spdlog

View File

@@ -0,0 +1,58 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
// periodic worker thread - periodically executes the given callback function.
//
// RAII over the owned thread:
// creates the thread on construction.
// stops and joins the thread on destruction (if the thread is executing a callback, wait for it
// to finish first).
#include <chrono>
#include <condition_variable>
#include <functional>
#include <mutex>
#include <thread>
namespace spdlog {
namespace details {
class SPDLOG_API periodic_worker {
public:
    // Start a thread that runs callback_fun every `interval`.
    // If the interval is not positive the worker stays inactive and no thread is created.
    template <typename Rep, typename Period>
    periodic_worker(const std::function<void()> &callback_fun,
                    std::chrono::duration<Rep, Period> interval) {
        using interval_t = std::chrono::duration<Rep, Period>;
        active_ = interval > interval_t::zero();
        if (active_) {
            worker_thread_ = std::thread([this, callback_fun, interval]() {
                while (true) {
                    std::unique_lock<std::mutex> guard(mutex_);
                    // wait_for returns the predicate result: true means a stop was requested,
                    // false means the interval elapsed while still active.
                    const bool stop_requested =
                        cv_.wait_for(guard, interval, [this] { return !active_; });
                    if (stop_requested) {
                        return;  // active_ == false, so exit this thread
                    }
                    callback_fun();
                }
            });
        }
    }

    // not copyable
    periodic_worker(const periodic_worker &) = delete;
    periodic_worker &operator=(const periodic_worker &) = delete;

    // stop the worker thread and join it
    ~periodic_worker();

    // access to the underlying thread object
    std::thread &get_thread() { return worker_thread_; }

private:
    bool active_;
    std::thread worker_thread_;
    std::mutex mutex_;
    std::condition_variable cv_;
};
} // namespace details
} // namespace spdlog
#ifdef SPDLOG_HEADER_ONLY
#include "periodic_worker-inl.h"
#endif

View File

@@ -0,0 +1,270 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#ifndef SPDLOG_HEADER_ONLY
#include <spdlog/details/registry.h>
#endif
#include <spdlog/common.h>
#include <spdlog/details/periodic_worker.h>
#include <spdlog/logger.h>
#include <spdlog/pattern_formatter.h>
#ifndef SPDLOG_DISABLE_DEFAULT_LOGGER
// support for the default stdout color logger
#ifdef _WIN32
#include <spdlog/sinks/wincolor_sink.h>
#else
#include <spdlog/sinks/ansicolor_sink.h>
#endif
#endif // SPDLOG_DISABLE_DEFAULT_LOGGER
#include <chrono>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
namespace spdlog {
namespace details {
// Construct the registry with a default pattern_formatter. Unless
// SPDLOG_DISABLE_DEFAULT_LOGGER is defined, also create and register the default
// logger (named "") writing to a color stdout sink.
SPDLOG_INLINE registry::registry()
    : formatter_(new pattern_formatter()) {
#ifndef SPDLOG_DISABLE_DEFAULT_LOGGER
// default sink type: wincolor_stdout_sink_mt on windows, ansicolor_stdout_sink_mt elsewhere
#ifdef _WIN32
    using default_sink_t = sinks::wincolor_stdout_sink_mt;
#else
    using default_sink_t = sinks::ansicolor_stdout_sink_mt;
#endif
    const char *unnamed = "";
    auto stdout_sink = std::make_shared<default_sink_t>();
    default_logger_ = std::make_shared<spdlog::logger>(unnamed, std::move(stdout_sink));
    loggers_[unnamed] = default_logger_;
#endif  // SPDLOG_DISABLE_DEFAULT_LOGGER
}
// Defaulted destructor, defined out of line here rather than in the class declaration.
SPDLOG_INLINE registry::~registry() = default;
// Add new_logger to the registry under its own name (locks the logger map).
// Throws via throw_spdlog_ex if a logger with that name is already registered.
SPDLOG_INLINE void registry::register_logger(std::shared_ptr<logger> new_logger) {
    std::lock_guard<std::mutex> guard(logger_map_mutex_);
    register_logger_(std::move(new_logger));
}
// Add new_logger to the registry, overwriting any logger already stored under the
// same name (locks the logger map).
SPDLOG_INLINE void registry::register_or_replace(std::shared_ptr<logger> new_logger) {
    std::lock_guard<std::mutex> guard(logger_map_mutex_);
    register_or_replace_(std::move(new_logger));
}
// Apply the registry-wide settings (formatter, error handler, level, flush level,
// backtrace) to new_logger and, when automatic registration is on, register it.
SPDLOG_INLINE void registry::initialize_logger(std::shared_ptr<logger> new_logger) {
    std::lock_guard<std::mutex> guard(logger_map_mutex_);

    new_logger->set_formatter(formatter_->clone());

    if (err_handler_) {
        new_logger->set_error_handler(err_handler_);
    }

    // use the level previously configured for this logger name, or else the global level
    const auto configured = log_levels_.find(new_logger->name());
    if (configured != log_levels_.end()) {
        new_logger->set_level(configured->second);
    } else {
        new_logger->set_level(global_log_level_);
    }

    new_logger->flush_on(flush_level_);

    if (backtrace_n_messages_ > 0) {
        new_logger->enable_backtrace(backtrace_n_messages_);
    }

    if (automatic_registration_) {
        register_logger_(std::move(new_logger));
    }
}
// Look up a registered logger by name. Returns nullptr when no such logger exists.
SPDLOG_INLINE std::shared_ptr<logger> registry::get(const std::string &logger_name) {
    std::lock_guard<std::mutex> guard(logger_map_mutex_);
    const auto entry = loggers_.find(logger_name);
    if (entry == loggers_.end()) {
        return nullptr;
    }
    return entry->second;
}
// Return a shared pointer to the current default logger (taken under the logger map lock).
SPDLOG_INLINE std::shared_ptr<logger> registry::default_logger() {
    std::lock_guard<std::mutex> guard(logger_map_mutex_);
    std::shared_ptr<logger> current = default_logger_;
    return current;
}
// Return a raw (non-owning) pointer to the default logger without taking any lock.
// Intended for the spdlog default api (e.g. spdlog::info), where it makes calls faster.
// Because it is unsynchronized, it must not be used concurrently with set_default_logger():
// e.g. do not call set_default_logger() from one thread while calling spdlog::info() from
// another.
SPDLOG_INLINE logger *registry::get_default_raw() {
    logger *raw = default_logger_.get();
    return raw;
}
// Replace the default logger.
// A non-null logger is also stored in the loggers_ map under its name (replacing any
// existing entry); default_logger_ keeps a copy for faster retrieval.
// Passing nullptr only clears default_logger_.
SPDLOG_INLINE void registry::set_default_logger(std::shared_ptr<logger> new_default_logger) {
    std::lock_guard<std::mutex> guard(logger_map_mutex_);
    if (new_default_logger) {
        loggers_[new_default_logger->name()] = new_default_logger;
    }
    default_logger_ = std::move(new_default_logger);
}
// Store the thread pool held by the registry (guarded by tp_mutex_).
SPDLOG_INLINE void registry::set_tp(std::shared_ptr<thread_pool> tp) {
    std::lock_guard<std::recursive_mutex> guard(tp_mutex_);
    tp_ = std::move(tp);
}
// Return the thread pool held by the registry (guarded by tp_mutex_); may be null.
SPDLOG_INLINE std::shared_ptr<thread_pool> registry::get_tp() {
    std::lock_guard<std::recursive_mutex> guard(tp_mutex_);
    std::shared_ptr<thread_pool> pool = tp_;
    return pool;
}
// Install a new global formatter and hand a clone of it to every registered logger.
SPDLOG_INLINE void registry::set_formatter(std::unique_ptr<formatter> formatter) {
    std::lock_guard<std::mutex> guard(logger_map_mutex_);
    formatter_ = std::move(formatter);
    for (auto it = loggers_.begin(); it != loggers_.end(); ++it) {
        it->second->set_formatter(formatter_->clone());
    }
}
// Remember the backtrace size for loggers initialized later and enable backtrace with
// that size on every registered logger.
SPDLOG_INLINE void registry::enable_backtrace(size_t n_messages) {
    std::lock_guard<std::mutex> guard(logger_map_mutex_);
    backtrace_n_messages_ = n_messages;
    for (auto it = loggers_.begin(); it != loggers_.end(); ++it) {
        it->second->enable_backtrace(n_messages);
    }
}
// Reset the stored backtrace size to 0 and disable backtrace on every registered logger.
SPDLOG_INLINE void registry::disable_backtrace() {
    std::lock_guard<std::mutex> guard(logger_map_mutex_);
    backtrace_n_messages_ = 0;
    for (auto it = loggers_.begin(); it != loggers_.end(); ++it) {
        it->second->disable_backtrace();
    }
}
// Set log_level on every registered logger and record it as the global level
// used for loggers initialized later.
SPDLOG_INLINE void registry::set_level(level::level_enum log_level) {
    std::lock_guard<std::mutex> guard(logger_map_mutex_);
    for (auto it = loggers_.begin(); it != loggers_.end(); ++it) {
        it->second->set_level(log_level);
    }
    global_log_level_ = log_level;
}
// Set the flush level on every registered logger and record it for loggers
// initialized later.
SPDLOG_INLINE void registry::flush_on(level::level_enum log_level) {
    std::lock_guard<std::mutex> guard(logger_map_mutex_);
    for (auto it = loggers_.begin(); it != loggers_.end(); ++it) {
        it->second->flush_on(log_level);
    }
    flush_level_ = log_level;
}
// Install handler on every registered logger, then keep it as the handler applied
// to loggers initialized later.
SPDLOG_INLINE void registry::set_error_handler(err_handler handler) {
    std::lock_guard<std::mutex> guard(logger_map_mutex_);
    for (auto it = loggers_.begin(); it != loggers_.end(); ++it) {
        it->second->set_error_handler(handler);
    }
    err_handler_ = std::move(handler);
}
// Invoke fun once for each registered logger while holding the logger map lock.
SPDLOG_INLINE void registry::apply_all(
    const std::function<void(const std::shared_ptr<logger>)> &fun) {
    std::lock_guard<std::mutex> guard(logger_map_mutex_);
    for (auto it = loggers_.begin(); it != loggers_.end(); ++it) {
        fun(it->second);
    }
}
// Call flush() on every registered logger while holding the logger map lock.
SPDLOG_INLINE void registry::flush_all() {
    std::lock_guard<std::mutex> guard(logger_map_mutex_);
    for (auto it = loggers_.begin(); it != loggers_.end(); ++it) {
        it->second->flush();
    }
}
// Remove the logger registered under logger_name. If the default logger carries that
// name, the default logger is cleared as well.
SPDLOG_INLINE void registry::drop(const std::string &logger_name) {
    std::lock_guard<std::mutex> guard(logger_map_mutex_);
    // decide before erasing whether the default logger is the one being dropped
    const bool dropping_default = default_logger_ && default_logger_->name() == logger_name;
    loggers_.erase(logger_name);
    if (!dropping_default) {
        return;
    }
    default_logger_.reset();
}
// Remove every registered logger and clear the default logger.
SPDLOG_INLINE void registry::drop_all() {
    std::lock_guard<std::mutex> guard(logger_map_mutex_);
    loggers_.clear();
    default_logger_.reset();
}
// Release everything the registry owns, in this order: the periodic flusher,
// all loggers (including the default one), and finally the thread pool.
// Each step takes only the mutex guarding that resource.
SPDLOG_INLINE void registry::shutdown() {
    {
        std::lock_guard<std::mutex> flusher_guard(flusher_mutex_);
        periodic_flusher_.reset();
    }

    drop_all();

    {
        std::lock_guard<std::recursive_mutex> tp_guard(tp_mutex_);
        tp_.reset();
    }
}
// Expose the recursive mutex that guards the thread pool pointer.
SPDLOG_INLINE std::recursive_mutex &registry::tp_mutex() {
    return tp_mutex_;
}
// Choose whether initialize_logger() also registers the logger it initializes.
SPDLOG_INLINE void registry::set_automatic_registration(bool automatic_registration) {
    std::lock_guard<std::mutex> guard(logger_map_mutex_);
    automatic_registration_ = automatic_registration;
}
// Replace the per-logger level table and optionally the global level, then apply them
// to the registered loggers: a logger listed in `levels` gets its listed level; any other
// logger gets *global_level, but only when global_level is non-null.
SPDLOG_INLINE void registry::set_levels(log_levels levels, level::level_enum *global_level) {
    std::lock_guard<std::mutex> guard(logger_map_mutex_);
    log_levels_ = std::move(levels);

    const bool has_global = (global_level != nullptr);
    if (has_global) {
        global_log_level_ = *global_level;
    }

    for (auto it = loggers_.begin(); it != loggers_.end(); ++it) {
        const auto configured = log_levels_.find(it->first);
        if (configured != log_levels_.end()) {
            it->second->set_level(configured->second);
            continue;
        }
        if (has_global) {
            it->second->set_level(*global_level);
        }
    }
}
// Return the single registry object, a function-local static created on first call.
SPDLOG_INLINE registry &registry::instance() {
    static registry the_registry;
    return the_registry;
}
// Set new_logger's level from the per-name level table, falling back to the
// global level when the logger's name has no entry.
SPDLOG_INLINE void registry::apply_logger_env_levels(std::shared_ptr<logger> new_logger) {
    std::lock_guard<std::mutex> guard(logger_map_mutex_);
    const auto configured = log_levels_.find(new_logger->name());
    if (configured != log_levels_.end()) {
        new_logger->set_level(configured->second);
    } else {
        new_logger->set_level(global_log_level_);
    }
}
// Raise (via throw_spdlog_ex) when a logger named logger_name is already registered.
// Takes no lock itself; the visible callers hold logger_map_mutex_.
SPDLOG_INLINE void registry::throw_if_exists_(const std::string &logger_name) {
    const bool already_registered = loggers_.count(logger_name) != 0;
    if (already_registered) {
        throw_spdlog_ex("logger with name '" + logger_name + "' already exists");
    }
}
// Insert new_logger into the map under its name; fails (throws) if the name is taken.
// Takes no lock itself; the visible callers hold logger_map_mutex_.
SPDLOG_INLINE void registry::register_logger_(std::shared_ptr<logger> new_logger) {
    const std::string &name = new_logger->name();
    throw_if_exists_(name);
    loggers_[name] = std::move(new_logger);
}
// Insert new_logger into the map under its name, overwriting any existing entry.
// Takes no lock itself; the visible caller holds logger_map_mutex_.
SPDLOG_INLINE void registry::register_or_replace_(std::shared_ptr<logger> new_logger) {
    const std::string &name = new_logger->name();
    loggers_[name] = std::move(new_logger);
}
} // namespace details
} // namespace spdlog

View File

@@ -0,0 +1,131 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
// Loggers registry of unique name->logger pointer
// An attempt to create a logger with an already existing name will result with spdlog_ex exception.
// If user requests a non existing logger, nullptr will be returned
// This class is thread safe
#include <spdlog/common.h>
#include <spdlog/details/periodic_worker.h>
#include <chrono>
#include <functional>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
namespace spdlog {
class logger;
namespace details {
class thread_pool;
// Process-wide store of loggers keyed by name, plus the global settings (formatter, levels,
// flush level, error handler, backtrace size) applied to loggers it initializes.
// Obtained through instance(); construction and destruction are private.
class SPDLOG_API registry {
public:
// per-logger-name level table
using log_levels = std::unordered_map<std::string, level::level_enum>;
registry(const registry &) = delete;
registry &operator=(const registry &) = delete;
// Register new_logger under its name; throws if the name is already registered.
void register_logger(std::shared_ptr<logger> new_logger);
// Register new_logger under its name, replacing any existing logger with that name.
void register_or_replace(std::shared_ptr<logger> new_logger);
// Apply the registry-wide settings to new_logger and register it if automatic
// registration is enabled.
void initialize_logger(std::shared_ptr<logger> new_logger);
// Return the logger registered under logger_name, or nullptr if there is none.
std::shared_ptr<logger> get(const std::string &logger_name);
// Return the default logger (may be null if it was dropped or never set).
std::shared_ptr<logger> default_logger();
// Return raw ptr to the default logger.
// To be used directly by the spdlog default api (e.g. spdlog::info)
// This make the default API faster, but cannot be used concurrently with set_default_logger().
// e.g do not call set_default_logger() from one thread while calling spdlog::info() from
// another.
logger *get_default_raw();
// set default logger and add it to the registry if not registered already.
// default logger is stored in default_logger_ (for faster retrieval) and in the loggers_ map.
// Note: Make sure to unregister it when no longer needed or before calling again with a new
// logger.
void set_default_logger(std::shared_ptr<logger> new_default_logger);
// Store / fetch the thread pool pointer (guarded by tp_mutex_).
void set_tp(std::shared_ptr<thread_pool> tp);
std::shared_ptr<thread_pool> get_tp();
// Set global formatter. Each sink in each logger will get a clone of this object
void set_formatter(std::unique_ptr<formatter> formatter);
// Enable/disable backtrace on all registered loggers; the size is remembered for
// loggers initialized later.
void enable_backtrace(size_t n_messages);
void disable_backtrace();
// Set the level / flush level on all registered loggers and remember it for
// loggers initialized later.
void set_level(level::level_enum log_level);
void flush_on(level::level_enum log_level);
// Replace the periodic flusher with a new periodic_worker that calls flush_all()
// every `interval`.
template <typename Rep, typename Period>
void flush_every(std::chrono::duration<Rep, Period> interval) {
std::lock_guard<std::mutex> lock(flusher_mutex_);
auto clbk = [this]() { this->flush_all(); };
periodic_flusher_ = details::make_unique<periodic_worker>(clbk, interval);
}
// Return a reference to the periodic flusher pointer.
// NOTE(review): flusher_mutex_ is released when this function returns, so the returned
// reference itself is not protected by the lock.
std::unique_ptr<periodic_worker> &get_flusher() {
std::lock_guard<std::mutex> lock(flusher_mutex_);
return periodic_flusher_;
}
// Set the error handler on all registered loggers and remember it for loggers
// initialized later.
void set_error_handler(err_handler handler);
// Call fun for every registered logger (under the logger map lock).
void apply_all(const std::function<void(const std::shared_ptr<logger>)> &fun);
// Flush every registered logger.
void flush_all();
// Remove the named logger (also clears the default logger if it has that name).
void drop(const std::string &logger_name);
// Remove all loggers and clear the default logger.
void drop_all();
// clean all resources and threads started by the registry
void shutdown();
// Access to the mutex guarding the thread pool pointer.
std::recursive_mutex &tp_mutex();
// Control whether initialize_logger() also registers the logger.
void set_automatic_registration(bool automatic_registration);
// set levels for all existing/future loggers. global_level can be null if should not set.
void set_levels(log_levels levels, level::level_enum *global_level);
// Return the single registry instance.
static registry &instance();
// Set new_logger's level from the per-name table, or the global level if not listed.
void apply_logger_env_levels(std::shared_ptr<logger> new_logger);
private:
registry();
~registry();
// The helpers below take no lock themselves.
void throw_if_exists_(const std::string &logger_name);
void register_logger_(std::shared_ptr<logger> new_logger);
void register_or_replace_(std::shared_ptr<logger> new_logger);
// NOTE(review): no definition of this function is visible in registry-inl.h - confirm it
// is still needed.
bool set_level_from_cfg_(logger *logger);
// logger_map_mutex_ is the mutex taken by the logger-map and settings functions;
// flusher_mutex_ guards periodic_flusher_.
std::mutex logger_map_mutex_, flusher_mutex_;
// guards tp_
std::recursive_mutex tp_mutex_;
// registered loggers by name
std::unordered_map<std::string, std::shared_ptr<logger>> loggers_;
// per-name levels applied when a logger is initialized
log_levels log_levels_;
// global formatter; loggers receive clones of it
std::unique_ptr<formatter> formatter_;
// defaults applied to loggers without a per-name level / on initialization
spdlog::level::level_enum global_log_level_ = level::info;
level::level_enum flush_level_ = level::off;
err_handler err_handler_;
std::shared_ptr<thread_pool> tp_;
std::unique_ptr<periodic_worker> periodic_flusher_;
std::shared_ptr<logger> default_logger_;
bool automatic_registration_ = true;
// 0 means backtrace is not enabled for newly initialized loggers
size_t backtrace_n_messages_ = 0;
};
} // namespace details
} // namespace spdlog
#ifdef SPDLOG_HEADER_ONLY
#include "registry-inl.h"
#endif

View File

@@ -0,0 +1,22 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#include "registry.h"
namespace spdlog {
// Default logger factory (synchronous_factory, below) - creates synchronous loggers
class logger;
struct synchronous_factory {
    // Build a Sink from args, wrap it in a logger named logger_name, let the registry
    // initialize that logger, and return it.
    template <typename Sink, typename... SinkArgs>
    static std::shared_ptr<spdlog::logger> create(std::string logger_name, SinkArgs &&...args) {
        std::shared_ptr<Sink> sink_ptr = std::make_shared<Sink>(std::forward<SinkArgs>(args)...);
        std::shared_ptr<spdlog::logger> created =
            std::make_shared<spdlog::logger>(std::move(logger_name), std::move(sink_ptr));
        details::registry::instance().initialize_logger(created);
        return created;
    }
};
} // namespace spdlog

View File

@@ -0,0 +1,217 @@
// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
// Distributed under the MIT License (http://opensource.org/licenses/MIT)
#pragma once
#define WIN32_LEAN_AND_MEAN
// tcp client helper
#include <spdlog/common.h>
#include <spdlog/details/os.h>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <windows.h>
#include <winsock2.h>
#include <ws2tcpip.h>
#pragma comment(lib, "Ws2_32.lib")
#pragma comment(lib, "Mswsock.lib")
#pragma comment(lib, "AdvApi32.lib")
namespace spdlog {
namespace details {
class tcp_client {
SOCKET socket_ = INVALID_SOCKET;
static void init_winsock_() {
WSADATA wsaData;
auto rv = WSAStartup(MAKEWORD(2, 2), &wsaData);
if (rv != 0) {
throw_winsock_error_("WSAStartup failed", ::WSAGetLastError());
}
}
static void throw_winsock_error_(const std::string &msg, int last_error) {
char buf[512];
::FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL,
last_error, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf,
(sizeof(buf) / sizeof(char)), NULL);
throw_spdlog_ex(fmt_lib::format("tcp_sink - {}: {}", msg, buf));
}
public:
tcp_client() { init_winsock_(); }
~tcp_client() {
close();
::WSACleanup();
}
bool is_connected() const { return socket_ != INVALID_SOCKET; }
void close() {
::closesocket(socket_);
socket_ = INVALID_SOCKET;
}
SOCKET fd() const { return socket_; }
int connect_socket_with_timeout(SOCKET sockfd,
const struct sockaddr *addr,
int addrlen,
const timeval &tv) {
// If no timeout requested, do a normal blocking connect.
if (tv.tv_sec == 0 && tv.tv_usec == 0) {
int rv = ::connect(sockfd, addr, addrlen);
if (rv == SOCKET_ERROR && WSAGetLastError() == WSAEISCONN) {
return 0;
}
return rv;
}
// Switch to nonblocking mode
u_long mode = 1UL;
if (::ioctlsocket(sockfd, FIONBIO, &mode) == SOCKET_ERROR) {
return SOCKET_ERROR;
}
int rv = ::connect(sockfd, addr, addrlen);
int last_error = WSAGetLastError();
if (rv == 0 || last_error == WSAEISCONN) {
mode = 0UL;
if (::ioctlsocket(sockfd, FIONBIO, &mode) == SOCKET_ERROR) {
return SOCKET_ERROR;
}
return 0;
}
if (last_error != WSAEWOULDBLOCK) {
// Real error
mode = 0UL;
if (::ioctlsocket(sockfd, FIONBIO, &mode)) {
return SOCKET_ERROR;
}
return SOCKET_ERROR;
}
// Wait until socket is writable or timeout expires
fd_set wfds;
FD_ZERO(&wfds);
FD_SET(sockfd, &wfds);
rv = ::select(0, nullptr, &wfds, nullptr, const_cast<timeval *>(&tv));
// Restore blocking mode regardless of select result
mode = 0UL;
if (::ioctlsocket(sockfd, FIONBIO, &mode) == SOCKET_ERROR) {
return SOCKET_ERROR;
}
if (rv == 0) {
WSASetLastError(WSAETIMEDOUT);
return SOCKET_ERROR;
}
if (rv == SOCKET_ERROR) {
return SOCKET_ERROR;
}
int so_error = 0;
int len = sizeof(so_error);
if (::getsockopt(sockfd, SOL_SOCKET, SO_ERROR, reinterpret_cast<char *>(&so_error), &len) ==
SOCKET_ERROR) {
return SOCKET_ERROR;
}
if (so_error != 0 && so_error != WSAEISCONN) {
// connection failed
WSASetLastError(so_error);
return SOCKET_ERROR;
}
return 0; // success
}
// try to connect or throw on failure
void connect(const std::string &host, int port, int timeout_ms = 0) {
if (is_connected()) {
close();
}
struct addrinfo hints {};
ZeroMemory(&hints, sizeof(hints));
hints.ai_family = AF_UNSPEC; // To work with IPv4, IPv6, and so on
hints.ai_socktype = SOCK_STREAM; // TCP
hints.ai_flags = AI_NUMERICSERV; // port passed as as numeric value
hints.ai_protocol = 0;
timeval tv;
tv.tv_sec = timeout_ms / 1000;
tv.tv_usec = (timeout_ms % 1000) * 1000;
auto port_str = std::to_string(port);
struct addrinfo *addrinfo_result;
auto rv = ::getaddrinfo(host.c_str(), port_str.c_str(), &hints, &addrinfo_result);
int last_error = 0;
if (rv != 0) {
last_error = ::WSAGetLastError();
WSACleanup();
throw_winsock_error_("getaddrinfo failed", last_error);
}
// Try each address until we successfully connect(2).
for (auto *rp = addrinfo_result; rp != nullptr; rp = rp->ai_next) {
socket_ = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol);
if (socket_ == INVALID_SOCKET) {
last_error = ::WSAGetLastError();
WSACleanup();
continue;
}
if (connect_socket_with_timeout(socket_, rp->ai_addr, (int)rp->ai_addrlen, tv) == 0) {
last_error = 0;
break;
}
last_error = WSAGetLastError();
::closesocket(socket_);
socket_ = INVALID_SOCKET;
}
::freeaddrinfo(addrinfo_result);
if (socket_ == INVALID_SOCKET) {
WSACleanup();
throw_winsock_error_("connect failed", last_error);
}
if (timeout_ms > 0) {
DWORD tv = static_cast<DWORD>(timeout_ms);
::setsockopt(socket_, SOL_SOCKET, SO_RCVTIMEO, (const char *)&tv, sizeof(tv));
::setsockopt(socket_, SOL_SOCKET, SO_SNDTIMEO, (const char *)&tv, sizeof(tv));
}
// set TCP_NODELAY
int enable_flag = 1;
::setsockopt(socket_, IPPROTO_TCP, TCP_NODELAY, reinterpret_cast<char *>(&enable_flag),
sizeof(enable_flag));
}
// Send exactly n_bytes of the given data.
// On error close the connection and throw.
void send(const char *data, size_t n_bytes) {
size_t bytes_sent = 0;
while (bytes_sent < n_bytes) {
const int send_flags = 0;
auto write_result =
::send(socket_, data + bytes_sent, (int)(n_bytes - bytes_sent), send_flags);
if (write_result == SOCKET_ERROR) {
int last_error = ::WSAGetLastError();
close();
throw_winsock_error_("send failed", last_error);
}
if (write_result == 0) // (probably should not happen but in any case..)
{
break;
}
bytes_sent += static_cast<size_t>(write_result);
}
}
};
} // namespace details
} // namespace spdlog

Some files were not shown because too many files have changed in this diff Show More