[CI][lint] Add rule codespell back (#6236)
### What this PR does / why we need it?
After removing codespell for a while, we discovered that typos had trouble correctly recognizing certain misspelled words, so I suggest adding it back.
- vLLM version: v0.14.1
- vLLM main: d68209402d
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
.github/CODEOWNERS
@@ -29,7 +29,7 @@
 /.readthedocs.yaml @wangxiyuan @Yikun
 /README* @wangxiyuan @Yikun

-# exmaple
+# example
 /examples @wangxiyuan

 # tests
.github/ISSUE_TEMPLATE/110-user-story.yml
@@ -18,7 +18,7 @@ body:
       A brief introduction about the background of your use case, like your scenario, hardware size etc.
   - type: textarea
     attributes:
-      label: Bussiness Challenges
+      label: Business Challenges
       description: >
         Tell us how what kind of challenge you faced in this user story.
   - type: textarea
@@ -30,7 +30,7 @@ body:
     attributes:
       label: Extra Info
       description: >
-        Any extra infomation you want to include in this story
+        Any extra information you want to include in this story
   - type: markdown
     attributes:
       value: >
.github/workflows/_schedule_image_build.yaml
@@ -139,7 +139,7 @@ jobs:
 quay.io/ascend/vllm-ascend
 # Note for test case
 # https://github.com/marketplace/actions/docker-metadata-action#typeref
-# 1. branch job pulish per main/*-dev branch commits
+# 1. branch job publish per main/*-dev branch commits
 # 2. main and dev pull_request is build only, so the tag pr-N-openeuler is fine
 # 3. only pep440 matched tag will be published:
 # - v0.7.1 --> v0.7.1-openeuler
.pre-commit-config.yaml
@@ -11,6 +11,17 @@ repos:
 - id: ruff-check
   args: [--output-format, github, --fix]
 - id: ruff-format
+- repo: https://github.com/codespell-project/codespell
+  rev: v2.4.1
+  hooks:
+    - id: codespell
+      args: [
+        --toml, pyproject.toml,
+        '--skip', 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**,typos.toml',
+        '-L', 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,ArchType,AND,ND,tbe,copyin,alog'
+      ]
+      additional_dependencies:
+        - tomli
 - repo: https://github.com/crate-ci/typos
   rev: v1.32.0
   hooks:
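(Not part of the diff: once this config lands, the restored hook can presumably be exercised locally with `pre-commit run codespell --all-files`, the standard pre-commit invocation for running a single hook across the repo.)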
@@ -144,10 +144,10 @@ static bool CheckInputOutputDim(const gert::TilingContext* context)
 OP_CHECK_IF(
 x1DimNum != x2DimNum,
-OP_LOGE(context, "Input x2/x1 shape invaild, dim num is not equal x1 dim."), return false);
+OP_LOGE(context, "Input x2/x1 shape invalid, dim num is not equal x1 dim."), return false);
 OP_CHECK_IF(
 (yDimNum != xDimNum) || (xDimNum != x1DimNum) || (rstdDimNum != x1DimNum),
-OP_LOGE(context, "Output y/x/rstd shape invaild, dim num is not equal x1 dim."), return false);
+OP_LOGE(context, "Output y/x/rstd shape invalid, dim num is not equal x1 dim."), return false);
 OP_CHECK_IF(
 x1DimNum < gammaDimNum, OP_LOGE(context, "X1 dim num should not be smaller than gamma dim num."),
 return false);
@@ -180,26 +180,26 @@ static bool CheckInputOutputShape(const gert::TilingContext* context)
 return false);
 OP_CHECK_IF(
 x2_shape->GetStorageShape().GetDim(i) != x1_shape->GetStorageShape().GetDim(i),
-OP_LOGE(context, "Input x2/x1 shape invaild, shape is not equal x1 shape."), return false);
+OP_LOGE(context, "Input x2/x1 shape invalid, shape is not equal x1 shape."), return false);
 OP_CHECK_IF(
 (y_shape->GetStorageShape().GetDim(i) != x1_shape->GetStorageShape().GetDim(i)) ||
 (x_shape->GetStorageShape().GetDim(i) != x1_shape->GetStorageShape().GetDim(i)),
-OP_LOGE(context, "Input y/x shape invaild, shape is not equal x1 shape."), return false);
+OP_LOGE(context, "Input y/x shape invalid, shape is not equal x1 shape."), return false);
 }
 for (uint32_t i = 0; i < x1DimNum - gammaDimNum; i++) {
 OP_CHECK_IF(
 rstd_shape->GetStorageShape().GetDim(i) != x2_shape->GetStorageShape().GetDim(i),
-OP_LOGE(context, "Output rstd shape invaild, shape is not equal x1 first few dim."),
+OP_LOGE(context, "Output rstd shape invalid, shape is not equal x1 first few dim."),
 return false);
 }
 for (uint32_t i = 0; i < gammaDimNum; i++) {
 OP_CHECK_IF(
 gamma_shape->GetStorageShape().GetDim(i) != x1_shape->GetStorageShape().GetDim(x1DimNum - gammaDimNum + i),
-OP_LOGE(context, "Input gamma shape invaild, gamma shape is not equal x1 last few dim."),
+OP_LOGE(context, "Input gamma shape invalid, gamma shape is not equal x1 last few dim."),
 return false);
 OP_CHECK_IF(
 rstd_shape->GetStorageShape().GetDim(x1DimNum - 1 - i) != 1,
-OP_LOGE(context, "Output rstd shape invaild, last few dim is not equal to 1."),
+OP_LOGE(context, "Output rstd shape invalid, last few dim is not equal to 1."),
 return false);
 }
 return true;
@@ -11,11 +11,11 @@ if [[ "$SOC_VERSION" =~ ^ascend310 ]]; then
 exit 0
 elif [[ "$SOC_VERSION" =~ ^ascend910b ]]; then
 # ASCEND910B (A2) series
-# depdendency: catlass
+# dependency: catlass
 git config --global --add safe.directory "$ROOT_DIR"
 CATLASS_PATH=${ROOT_DIR}/csrc/third_party/catlass/include
 if [[ ! -d "${CATLASS_PATH}" ]]; then
-echo "depdendency catlass is missing, try to fetch it..."
+echo "dependency catlass is missing, try to fetch it..."
 if ! git submodule update --init --recursive; then
 echo "fetch failed"
 exit 1
@@ -28,17 +28,17 @@ elif [[ "$SOC_VERSION" =~ ^ascend910b ]]; then
 SOC_ARG="ascend910b"
 elif [[ "$SOC_VERSION" =~ ^ascend910_93 ]]; then
 # ASCEND910C (A3) series
-# depdendency: catlass
+# dependency: catlass
 git config --global --add safe.directory "$ROOT_DIR"
 CATLASS_PATH=${ROOT_DIR}/csrc/third_party/catlass/include
 if [[ ! -d "${CATLASS_PATH}" ]]; then
-echo "depdendency catlass is missing, try to fetch it..."
+echo "dependency catlass is missing, try to fetch it..."
 if ! git submodule update --init --recursive; then
 echo "fetch failed"
 exit 1
 fi
 fi
-# depdendency: cann-toolkit file moe_distribute_base.h
+# dependency: cann-toolkit file moe_distribute_base.h
 HCCL_STRUCT_FILE_PATH=$(find -L "${ASCEND_TOOLKIT_HOME}" -name "moe_distribute_base.h" 2>/dev/null | head -n1)
 if [ -z "$HCCL_STRUCT_FILE_PATH" ]; then
 echo "cannot find moe_distribute_base.h file in CANN env"
@@ -162,7 +162,7 @@ __aicore__ inline void MoeV2ExpertTokenOut::CopyOutExpertTokensCumsum(bool isTai
 this->expertTokenValue += this->expertTokenIdxOutLocal.GetValue(i);
 this->expertTokenIdxOutLocal.SetValue(i, this->expertTokenValue);
 }
-// if the remianing UB is sufficient, use the UB space to copy
+// if the remaining UB is sufficient, use the UB space to copy
 // otherwise, copy the calculated data first, and then copy the last tokenValue to remaining expert position
 if (isTail && end <= this->expertNumUbAlign) {
 int64_t startAlign = Min(Align(copyLength, sizeof(int32_t)), end);
@@ -13,7 +13,7 @@ namespace Catlass::Gemm::Tile {
 static constexpr uint32_t ELE_NUM_PER_C0 = BYTE_PER_C0 / sizeof(Element); // int64, 32/8=4

-// Mehtods
+// Methods

 CATLASS_DEVICE
 CopyGmToL1() {};
@@ -177,7 +177,7 @@ __aicore__ inline void MoeV2ExpertTokenOut::CopyOutExpertTokensCumsum(bool isTai
 this->expertTokenValue += this->expertTokenIdxOutLocal.GetValue(i);
 this->expertTokenIdxOutLocal.SetValue(i, this->expertTokenValue);
 }
-// if the remianing UB is sufficient, use the UB space to copy
+// if the remaining UB is sufficient, use the UB space to copy
 // otherwise, copy the calculated data first, and then copy the last tokenValue to remaining expert position
 if (isTail && end <= this->expertNumUbAlign) {
 int64_t startAlign = Min(Align(copyLength, sizeof(int32_t)), end);
@@ -13,7 +13,7 @@ namespace Catlass::Gemm::Tile {
 static constexpr uint32_t ELE_NUM_PER_C0 = BYTE_PER_C0 / sizeof(Element); // int64, 32/8=4

-// Mehtods
+// Methods

 CATLASS_DEVICE
 CopyGmToL1() {};
@@ -7,7 +7,7 @@
 extern "C" {
 #endif

-/* funtion: aclnnDispatchLayoutGetWorkspaceSize
+/* function: aclnnDispatchLayoutGetWorkspaceSize
 * topkIdx : required
 * numTokens : required
 * numRanks : required
@@ -31,7 +31,7 @@ __attribute__((visibility("default"))) aclnnStatus aclnnDispatchLayoutGetWorkspa
 uint64_t *workspaceSize,
 aclOpExecutor **executor);

-/* funtion: aclnnDispatchLayout
+/* function: aclnnDispatchLayout
 * workspace : workspace memory addr(input).
 * workspaceSize : size of workspace(input).
 * executor : executor context(input).
@@ -17,7 +17,7 @@
 #include "grouped_matmul_swiglu_quant_weight_nz_tensor_list_utils.h"
 namespace GROUPED_MATMUL_SWIGLU_QUANT_WEIGHT_NZ_TENSOR_LIST {
-/** @brief intenal computation class
+/** @brief internal computation class
 */
 template <class mmType, bool sync = false, typename CHANNELDTYPE = float>
 class GMMSwigluCompute{
@@ -374,7 +374,7 @@ ge::graphStatus LIInfoParser::GetHeadDim()
 dIndex = DIM_IDX_TWO;
 break;
 case DataLayout::BSND:
-// BSND: [Batch, SeqLen, N, D] -> D is the 3nd dimension
+// BSND: [Batch, SeqLen, N, D] -> D is the 3rd dimension
 dIndex = DIM_IDX_THREE;
 break;
 default:
@@ -7,7 +7,7 @@
 extern "C" {
 #endif

-/* funtion: aclnnMoeCombineGetWorkspaceSize
+/* function: aclnnMoeCombineGetWorkspaceSize
 * recvX : required
 * tokenSrcInfo : required
 * epRecvCounts : required
@@ -43,7 +43,7 @@ __attribute__((visibility("default"))) aclnnStatus aclnnMoeCombineNormalGetWorks
 uint64_t *workspaceSize,
 aclOpExecutor **executor);

-/* funtion: aclnnMoeCombine
+/* function: aclnnMoeCombine
 * workspace : workspace memory addr(input).
 * workspaceSize : size of workspace(input).
 * executor : executor context(input).
@@ -419,7 +419,7 @@ static ge::graphStatus SetWorkspace(gert::TilingContext *context, const char *no
 OPS_CHECK(workspace == nullptr, OPS_LOG_E(nodeName, "get workspace failed"),
 return ge::GRAPH_FAILED);
 workspace[0] = SYSTEM_NEED_WORKSPACE;
-OPS_LOG_D(nodeName, "workspce[0] size is %ld", workspace[0]);
+OPS_LOG_D(nodeName, "workspace[0] size is %ld", workspace[0]);
 return ge::GRAPH_SUCCESS;
 }
@@ -34,7 +34,7 @@ namespace ge {
 * @li out: A 2D tensor which is the renorm result of moe gating topk, format supports ND, and data type must be float. The shape must be the same as that of x.
 *
 * @par Attributes:
-* @li k: A required attribute of type int. The value must greater than 0 and less than or equal to expert_num / group_count * k_group, idicating the topk value.
+* @li k: A required attribute of type int. The value must greater than 0 and less than or equal to expert_num / group_count * k_group, indicating the topk value.
 * @li k_group: An optional attribute of type int. It can not be less than 1, and can not be greater than group_count, indicating the topk group value. The default value is 1.
 * @li group_count: An optional attribute of type int. It can not be less than 1, indicating the group count. The group_count * align_32(expert_num / group_count) can not be greater than 2048. The default value is 1.
 * @li group_select_mode: An optional attribute of type int. 0 indicating that sort group by max values, 1 indicating that sort group by sum of top-2 values. The default value is 0.
@@ -444,11 +444,11 @@ ge::graphStatus MoeGatingTopKTilingBase::CheckOutShape()
 }

 OP_CHECK_IF((yShape_->GetDim(0) != xShape_->GetDim(0)),
-OP_LOGE(context_, "y out dim[0] %ld not euqal x dim[0] %ld, please check.", yShape_->GetDim(0),
+OP_LOGE(context_, "y out dim[0] %ld not equal x dim[0] %ld, please check.", yShape_->GetDim(0),
 xShape_->GetDim(0)),
 return ge::GRAPH_FAILED);
 OP_CHECK_IF((expertIdxShape_->GetDim(0) != xShape_->GetDim(0)),
-OP_LOGE(context_, "expertId out dim[0] %ld not euqal x dim[0] %ld, please check.",
+OP_LOGE(context_, "expertId out dim[0] %ld not equal x dim[0] %ld, please check.",
 expertIdxShape_->GetDim(0), xShape_->GetDim(0)),
 return ge::GRAPH_FAILED);
 if (outFlag_ && outShape_ != nullptr) {
@@ -459,10 +459,10 @@ ge::graphStatus MoeGatingTopKTilingBase::CheckOutShape()
 }

 OP_CHECK_IF((yShape_->GetDim(1) != k_),
-OP_LOGE(context_, "y dim[1] %ld not euqal k %ld, please check.", yShape_->GetDim(1), k_),
+OP_LOGE(context_, "y dim[1] %ld not equal k %ld, please check.", yShape_->GetDim(1), k_),
 return ge::GRAPH_FAILED);
 OP_CHECK_IF((expertIdxShape_->GetDim(1) != k_),
-OP_LOGE(context_, "expertId dim[1] %ld not euqal k %ld, please check.", expertIdxShape_->GetDim(1), k_),
+OP_LOGE(context_, "expertId dim[1] %ld not equal k %ld, please check.", expertIdxShape_->GetDim(1), k_),
 return ge::GRAPH_FAILED);
 if (outFlag_ && outShape_ != nullptr) {
 OP_CHECK_IF((outShape_->GetDim(1) != xShape_->GetDim(1)),
@@ -400,11 +400,11 @@ ge::graphStatus MoeGatingTopKTilingRegbase::CheckOutShape()
 }

 OP_CHECK_IF((yShape_->GetDim(0) != xShape_->GetDim(0)),
-OP_LOGE(context_, "y out dim[0] %ld not euqal x dim[0] %ld, please check.", yShape_->GetDim(0),
+OP_LOGE(context_, "y out dim[0] %ld not equal x dim[0] %ld, please check.", yShape_->GetDim(0),
 xShape_->GetDim(0)),
 return ge::GRAPH_FAILED);
 OP_CHECK_IF((expertIdxShape_->GetDim(0) != xShape_->GetDim(0)),
-OP_LOGE(context_, "expertId out dim[0] %ld not euqal x dim[0] %ld, please check.",
+OP_LOGE(context_, "expertId out dim[0] %ld not equal x dim[0] %ld, please check.",
 expertIdxShape_->GetDim(0), xShape_->GetDim(0)),
 return ge::GRAPH_FAILED);
 if (outFlag_ && outShape_ != nullptr) {
@@ -415,10 +415,10 @@ ge::graphStatus MoeGatingTopKTilingRegbase::CheckOutShape()
 }

 OP_CHECK_IF((yShape_->GetDim(1) != k_),
-OP_LOGE(context_, "y dim[1] %ld not euqal k %ld, please check.", yShape_->GetDim(1), k_),
+OP_LOGE(context_, "y dim[1] %ld not equal k %ld, please check.", yShape_->GetDim(1), k_),
 return ge::GRAPH_FAILED);
 OP_CHECK_IF((expertIdxShape_->GetDim(1) != k_),
-OP_LOGE(context_, "expertId dim[1] %ld not euqal k %ld, please check.", expertIdxShape_->GetDim(1), k_),
+OP_LOGE(context_, "expertId dim[1] %ld not equal k %ld, please check.", expertIdxShape_->GetDim(1), k_),
 return ge::GRAPH_FAILED);
 if (outFlag_ && outShape_ != nullptr) {
 OP_CHECK_IF((outShape_->GetDim(1) != xShape_->GetDim(1)),
@@ -41,7 +41,7 @@ template <typename T, typename... Args> constexpr uint64_t RecursiveSum(T templa
 // Format: Represents the Format supported by the current tiling key, using InputLayout enum, occupies one decimal digit
 // Sparse: Represents whether the current tiling key supports Sparse, using SparseCapability enum, occupies one decimal digit
 // For other specialized scenarios, define your own bit fields and values
-// usage: get tilingKey from inputed types
+// usage: get tilingKey from inputted types
 // uint64_t tilingKey = GET_FLASHATTENTION_TILINGKEY(AxisEnum::AXIS_S1, AxisEnum::AXIS_S2, AxisEnum::AXIS_N2,
 // SupportedDtype::FLOAT32, InputLayout::BSH, SparseCapability::SUPPORT_ALL)
@@ -51,7 +51,7 @@ template <typename... Args> constexpr uint64_t GET_TILINGKEY(Args... templateIds
 return TILINGKEYOFFSET + RecursiveSum(templateIds...);
 }

-// usage: get tilingKey from inputed types
+// usage: get tilingKey from inputted types
 // uint64_t tilingKey = TILINGKEY(S2, S1, N2, FLOAT32, BSND, ALL)

 #define TILINGKEY(ub2, ub1, block, dtype, layout, sparse) \
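(Aside, not part of the diff: to make the decimal-digit packing these comments describe concrete, here is a rough Python sketch. It is an illustration only, not the project's implementation — the offset value, argument order, and example field values are assumptions; the real logic is the C++ `RecursiveSum`/`GET_TILINGKEY` shown above.)

```python
# Illustrative sketch (assumptions, not the real implementation): each template
# field occupies one decimal digit of the final tiling key.
TILINGKEY_OFFSET = 10_000_000_000  # assumed base offset

def recursive_sum(*field_ids: int) -> int:
    """Pack each field id into its own decimal digit (first arg = lowest digit)."""
    key = 0
    for position, field_id in enumerate(field_ids):
        assert 0 <= field_id <= 9, "each field must fit in one decimal digit"
        key += field_id * 10 ** position
    return key

def get_tiling_key(*field_ids: int) -> int:
    return TILINGKEY_OFFSET + recursive_sum(*field_ids)

# Example with hypothetical enum values for (ub2, ub1, block, dtype, layout, sparse):
print(get_tiling_key(2, 1, 3, 0, 4, 5))  # -> 10000540312
```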
@@ -119,7 +119,7 @@ template <typename T, typename... Args> constexpr uint64_t RecursiveSum(T templa
 // Format: Represents the Format supported by the current tiling key, using InputLayout enum, occupies one decimal digit
 // Sparse: Represents whether the current tiling key supports Sparse, using SparseCapability enum, occupies one decimal digit
 // For other specialized scenarios, define your own bit fields and values
-// usage: get tilingKey from inputed types
+// usage: get tilingKey from inputted types
 // uint64_t tilingKey = GET_FLASHATTENTION_TILINGKEY(AxisEnum::AXIS_S1, AxisEnum::AXIS_S2, AxisEnum::AXIS_N2,
 // SupportedDtype::FLOAT32, InputLayout::BSH, SparseCapability::SUPPORT_ALL)
@@ -129,7 +129,7 @@ template <typename... Args> constexpr uint64_t GET_TILINGKEY(Args... templateIds
 return TILINGKEYOFFSET + RecursiveSum(templateIds...);
 }

-// usage: get tilingKey from inputed types
+// usage: get tilingKey from inputted types
 // uint64_t tilingKey = TILINGKEY(S2, S1, N2, FLOAT32, BSND, ALL)

 #define TILINGKEY(ub2, ub1, block, dtype, layout, sparse) \
@@ -903,7 +903,7 @@ void MoeInitRountingCustomTilingBase::Tinlig4VBSMultiCoreCompute(MoeCustomVBSCom
 needCoreNum = std::min(needCoreNum, aivNum);

 if (needCoreNum == 0) {
-OPS_LOG_E(context_->GetNodeName(), "Variale needCoreNum cannot be 0.");
+OPS_LOG_E(context_->GetNodeName(), "Variate needCoreNum cannot be 0.");
 return;
 }
 int64_t perCoreElements = (needCoreNum == 0) ? 0 : (totalLength_ / needCoreNum);
@@ -82,7 +82,7 @@ __aicore__ inline void MoeCustomFullLoadDynamicQuant<T, COPYOUTTYPE, SMOOTHTYPE>
 this->CopyIn();
 this->Compute();

-// vaild expert equal zero
+// valid expert equal zero
 if (this->needCoreNum_ < 1) {
 if (this->blockIdx_ == 0) {
 if (this->rowIdxType_ == GATHER) {
@@ -85,7 +85,7 @@ __aicore__ inline void MoeCustomFullLoadStaticQuant<T>::Process()
 this->CopyIn();
 this->Compute();

-// vaild expert equal zero
+// valid expert equal zero
 if (this->needCoreNum_ < 1) {
 if (this->blockIdx_ == 0) {
 if (this->rowIdxType_ == GATHER) {
@@ -78,7 +78,7 @@ __aicore__ inline void MoeCustomFullLoadUnquantized<T>::Process()
 this->CopyIn();
 this->Compute();

-// vaild expert equal zero
+// valid expert equal zero
 if (this->needCoreNum_ < 1) {
 if (this->blockIdx_ == 0) {
 if (this->rowIdxType_ == GATHER) {
@@ -8,7 +8,7 @@
 extern "C" {
 #endif

-/* funtion: aclnnNotifyDispatchGetWorkspaceSize
+/* function: aclnnNotifyDispatchGetWorkspaceSize
 * parameters :
 * sendData : required
 * tokenPerExpertData : required
@@ -40,7 +40,7 @@ aclnnStatus aclnnNotifyDispatchGetWorkspaceSize(
 uint64_t *workspaceSize,
 aclOpExecutor **executor);

-/* funtion: aclnnNotifyDispatch
+/* function: aclnnNotifyDispatch
 * parameters :
 * workspace : workspace memory addr(input).
 * workspaceSize : size of workspace(input).
@@ -1095,7 +1095,7 @@ ge::graphStatus SFATilingCheck::CheckActualSeqLens()
 if (std::string(opParamInfo_.layoutKV) == "TND" && opParamInfo_.actualSeqLengths.tensor == nullptr) {
 OPS_LOG_E(opName_,
 "when the layout of key and value is TND, "
-"the actualSeqLengths of key and value shoule not be empty.");
+"the actualSeqLengths of key and value should not be empty.");
 return ge::GRAPH_PARAM_INVALID;
 }
 if (ge::GRAPH_SUCCESS != CheckActualSeqLensDType() ||
@@ -116,7 +116,7 @@ template <typename T, typename... Args> constexpr uint64_t RecursiveSum(T templa
 // Format: indicates the Format supported by the current tiling key, expressed with the InputLayout enum; occupies one decimal digit
 // Sparse: indicates whether the current tiling key supports Sparse, expressed with the SparseCapability enum; occupies one decimal digit
 // For other specialized scenarios, define your own bit fields and values
-// usage: get tilingKey from inputed types
+// usage: get tilingKey from inputted types
 // uint64_t tilingKey = GET_FLASHATTENTION_TILINGKEY(AxisEnum::AXIS_S1, AxisEnum::AXIS_S2, AxisEnum::AXIS_N2,
 // SupportedDtype::FLOAT32, InputLayout::BSH, SparseCapability::SUPPORT_ALL)
@@ -126,7 +126,7 @@ template <typename... Args> constexpr uint64_t GET_TILINGKEY(Args... templateIds
 return TILINGKEYOFFSET + RecursiveSum(templateIds...);
 }

-// usage: get tilingKey from inputed types
+// usage: get tilingKey from inputted types
 // uint64_t tilingKey = TILINGKEY(S2, S1, N2, FLOAT32, BSND, ALL)

 #define TILINGKEY(ub2, ub1, block, dtype, layout, sparse) \
@@ -38,7 +38,7 @@ Given that PCP and DCP behave similarly for KV cache sharding, we refer to them
 As illustrated, a virtual block is defined in the block table, where blocks within the same CP device group form a virtual block. The virtual block size is `virtual_block_size = block_size * cp_size`.

-For any token `x`, referencing the folloing figure, its (virtual) block index is `x // virtual_block_size`, and the offset within the virtual block is `offset_within_virtual_block = x % virtual_block_size`.
+For any token `x`, referencing the following figure, its (virtual) block index is `x // virtual_block_size`, and the offset within the virtual block is `offset_within_virtual_block = x % virtual_block_size`.
 The local block index is `local_block_index = offset_within_virtual_block // cp_kv_cache_interleave_size`, and the device number is `target_rank = local_block_index % cp_size`.
 The offset within the local block is `(local_block_index // cp_size) * cp_kv_cache_interleave_size + offset_within_virtual_block % cp_kv_cache_interleave_size`.
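(Aside, not part of the diff: the mapping above can be sanity-checked in a few lines. Below is a minimal Python sketch of exactly the formulas in this doc; the variable names mirror the doc, and the concrete sizes are made-up example values.)

```python
# Minimal sketch of the token -> (virtual block, rank, local offset) mapping
# described above. block_size, cp_size and cp_kv_cache_interleave_size are
# example values, not defaults from the code.
block_size = 16
cp_size = 4
cp_kv_cache_interleave_size = 8
virtual_block_size = block_size * cp_size  # 64

def locate(x: int):
    virtual_block_index = x // virtual_block_size
    offset_within_virtual_block = x % virtual_block_size
    local_block_index = offset_within_virtual_block // cp_kv_cache_interleave_size
    target_rank = local_block_index % cp_size
    offset_within_local_block = (
        (local_block_index // cp_size) * cp_kv_cache_interleave_size
        + offset_within_virtual_block % cp_kv_cache_interleave_size
    )
    return virtual_block_index, target_rank, offset_within_local_block

print(locate(70))  # -> (1, 0, 6): virtual block 1, rank 0, local offset 6
```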
@@ -699,7 +699,7 @@ The performance result is:
 **Input/Output**: 3.5k/1.5k

-**Performance**: TTFT = 6.16s, TPOT = 48.82ms, Average performance of each card is 478 TPS (Token Per Secon).
+**Performance**: TTFT = 6.16s, TPOT = 48.82ms, Average performance of each card is 478 TPS (Token Per Second).

 ### Using vLLM Benchmark
@@ -2,7 +2,7 @@
 ## v0.13.0rc2 - 2026.01.24

-This is the second release candidate of v0.13.0 for vLLM Ascend. In this rc relesae, we fixed lots of bugs and improved the performance of many models. Please follow the [official doc](https://docs.vllm.ai/projects/ascend/en/v0.13.0/) to get started. Any feedback is welcome to help us to improve the final version of v0.13.0.
+This is the second release candidate of v0.13.0 for vLLM Ascend. In this rc release, we fixed lots of bugs and improved the performance of many models. Please follow the [official doc](https://docs.vllm.ai/projects/ascend/en/v0.13.0/) to get started. Any feedback is welcome to help us to improve the final version of v0.13.0.

 ### Highlights
@@ -19,7 +19,7 @@ We mainly focus on quality and performance improvement in this release. The spec
 ### Model Support

-- LongCat-Flash is supproted now.[#3833](https://github.com/vllm-project/vllm-ascend/pull/3833)
+- LongCat-Flash is supported now.[#3833](https://github.com/vllm-project/vllm-ascend/pull/3833)
 - minimax_m2 is supported now. [#5624](https://github.com/vllm-project/vllm-ascend/pull/5624)
 - Support for cross-attention and whisper models [#5592](https://github.com/vllm-project/vllm-ascend/pull/5592)
@@ -5,7 +5,7 @@ Here is an example guiding how to use `launch_online_dp.py` to launch external d
 `run_dp_template.sh` is an template script used to launch each dp vllm instance separately. It will be called by `launch_online_dp.py` in multi threads and most of its configurations are set by `launch_online_dp.py`. Parameters you need to set manually include:

 1. The IP and socket_ifname of your machine. If running on multi-nodes, please make sure the scripts on each node has been set with correct IP and socket_ifname of that node.
-2. vLLM serving related parameters including model_path and other configurations. Note that port, dp-related parammeters and tp_size is set by `launch_online_dp.py`, all the other vLLM parameters in this file only serve as an example and you are free to modify them according to your purpose.
+2. vLLM serving related parameters including model_path and other configurations. Note that port, dp-related parameters and tp_size is set by `launch_online_dp.py`, all the other vLLM parameters in this file only serve as an example and you are free to modify them according to your purpose.

 ### Run `launch_online_dp.py` with CL arguments
@@ -19,7 +19,7 @@ locale = "en"
 extend-ignore-identifiers-re = [".*Unc.*", ".*_thw",
 ".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*", ".*fo.*", ".*ba.*",
 ".*ot.*", ".*[Tt]h[rR].*"]
-extend-ignore-words-re = ["CANN", "cann","ND"]
+extend-ignore-words-re = ["CANN", "cann","ND","alog"]
 extend-ignore-re = []

 [default.extend-identifiers]
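(Aside, not part of the diff: as a rough illustration of what the `extend-ignore-words-re` change does — this is an approximation of typos' behavior, not its actual code — a word typos would otherwise flag is skipped when any of these regexes matches it, which is why adding `alog` stops it from being reported.)

```python
import re

# Approximation (assumption) of typos' extend-ignore-words-re handling:
# a flagged word is ignored when any configured pattern matches it in full.
ignore_patterns = ["CANN", "cann", "ND", "alog"]

def is_ignored(word: str) -> bool:
    return any(re.fullmatch(p, word) for p in ignore_patterns)

print(is_ignored("alog"))  # True  -> suppressed after this change
print(is_ignored("teh"))   # False -> still reported
```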
@@ -33,5 +33,5 @@ class NPUWorker310(NPUWorker):
         self.model_runner = NPUModelRunner310(self.vllm_config, self.device)

     def _warm_up_atb(self):
-        # 310p device donot support torch_npu._npu_matmul_add_fp32 atb ops
+        # 310p device do not support torch_npu._npu_matmul_add_fp32 atb ops
         logger.info("Skip warm-up atb ops for 310P device")