add qwen3

This commit is contained in:
Chranos
2026-02-04 17:22:39 +08:00
parent d1c0f68ab4
commit 8511fe8530
1932 changed files with 300426 additions and 0 deletions

View File

@@ -0,0 +1,26 @@
### 简介
该 example 是在 vLLM 中进行 Expert Parallel 的实验。mlu_hijack 是对仓库代码的劫持,用于避免直接修改主仓库代码。
### 支持模型
- qwen2_moe
- mixtral
- custom model
- deepseek_v2
### 支持板卡
300系列设备只能用于功能测试;性能测试需要其他系列设备。
### 运行demo
```bash
python examples/cambricon_custom_func/expert_parallel/offline_inference.py
```
### 使用Expert Parallel特性
- 设置环境变量export EXPERT_PARALLEL_EN=1|True|true|TRUE LLM主接口传入tensor_parallel_size的同时传入moe_tp_size或moe_ep_size或两者都传
- 若只传入 moe_tp_size 和 moe_ep_size 中的一个,另一个取值为 tensor_parallel_size 除以已传入的那个值;因此必须保证传入值能被 tensor_parallel_size 整除
- 若moe_tp_size和moe_ep_size都传入则必须保证moe_tp_size * moe_ep_size == tensor_parallel_size
- 若moe_tp_size和moe_ep_size都不传则它们默认值等于-1即不开启专家并行

View File

@@ -0,0 +1,133 @@
#!/bin/bash
# Benchmark sweep for DeepSeek-V2 with Expert Parallel on Cambricon MLU.
# Runs benchmark_latency.py (or benchmark_throughput.py when use_pp=1) over
# every combination of tp/moe_ep/pp size and input/output/batch size, logging
# each run under ./output. A batch-size sweep stops early once an OOM-style
# error appears in the log, since larger batches would only fail again.
rm -rf output
mkdir -p output

DATA_DIR=/data
MODELS_DEEPSEEK_V2=(
  "${DATA_DIR}/vllm/models/LLM-Research/deepseek-v2"
)
# Quoted expansion keeps model paths with spaces intact.
MODELS=("${MODELS_DEEPSEEK_V2[@]}")

# Feature toggles.
use_ray=0
use_eager=0
use_pp=0

# context parameter
input_sizes=(1024)
output_sizes=(1)
# batch_sizes=(1 2 4 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40)
batch_sizes=(1 4 8 16 32)
# decoder parameter
# input_sizes=(1)
# output_sizes=(128)
# batch_sizes=(1 2 4 8 16 32 64 128 256 512 1024 1280 1536 1600 1616 1632 1648 1652 1656 1660 1661 1662 1663 1664 1728 1792 2048)
# batch_sizes=(1 4 8 16 32 64 128 256 512 1024 2048)
tp_sizes=(8)
moe_ep_sizes=(8 -1)
pp_sizes=(1)

if [ "${use_pp}" -gt 0 ]; then
  # Pipeline-parallel mode: throughput benchmark, no TP/EP.
  tp_sizes=(1)
  moe_ep_sizes=(-1)
  pp_sizes=(8)
  BENCHMARK_CMD=benchmarks/benchmark_throughput.py
  benchmark_option="--backend vllm --num-prompts 1000 --output-json output_throughput.csv --async-engine"
else
  BENCHMARK_CMD=benchmarks/benchmark_latency.py
  benchmark_option="--num-iters-warmup 1 --num-iters 3 --only_average"
fi

# Combinations longer than the model's positional-embedding limit are skipped.
max_position_embeddings=163840

#export MLU_VISIBLE_DEVICES=4,5,6,7
export EXPERT_PARALLEL_EN=true
export VLLM_LATENCY_DEBUG=true
export VLLM_GRAPH_DEBUG=false
# export VLLM_DUMP_MLU_INFO=true
export OUTPUT_CSV_PATH=/data/solution-sdk/kangpengtao/tmp/deepseek/output.csv

ray_option=""
if [ "${use_ray}" -gt 0 ]; then
  ray_option="--distributed-executor-backend ray --ray-workers-use-nsight"
fi
eager_option=""
if [ "${use_eager}" -gt 0 ]; then
  eager_option="--enforce-eager"
fi

# Sweep every combination.
for HF_MODEL in "${MODELS[@]}"; do
  quantization_option=""
  if [[ "${HF_MODEL}" == *"sq_per_token_per_channel"* ]]; then
    quantization_option="--quantization=smoothquant"
  fi
  for tp_size in "${tp_sizes[@]}"; do
    for moe_ep_size in "${moe_ep_sizes[@]}"; do
      for pp_size in "${pp_sizes[@]}"; do
        for input_size in "${input_sizes[@]}"; do
          for output_size in "${output_sizes[@]}"; do
            for batch_size in "${batch_sizes[@]}"; do
              # Shell arithmetic instead of the external `expr`.
              max_seq_len_to_capture=$((input_size + output_size))
              max_num_batched_tokens=$((batch_size * input_size))
              max_model_len=${max_seq_len_to_capture}
              if [ "${max_model_len}" -gt "${max_position_embeddings}" ]; then
                continue
              fi
              max_num_seqs=${batch_size}
              # max-num-batched-tokens must cover both one full sequence and
              # one token per concurrent sequence.
              if [ "${max_model_len}" -gt "${max_num_batched_tokens}" ]; then
                max_num_batched_tokens=${max_model_len}
              fi
              if [ "${max_num_seqs}" -gt "${max_num_batched_tokens}" ]; then
                max_num_batched_tokens=${max_num_seqs}
              fi
              pp_option="--pipeline-parallel-size ${pp_size}"
              tp_option="-tp ${tp_size}"
              ep_option="--moe-ep-size ${moe_ep_size}"
              batch_size_option=""
              if [ "${use_pp}" -le 0 ]; then
                # benchmark_latency.py takes an explicit batch size.
                batch_size_option="--batch-size ${batch_size}"
              fi
              hf_model_name=$(basename "${HF_MODEL}")
              LOG_FILE=output/${hf_model_name}_${input_size}_${output_size}_tp_${tp_size}_moe_ep_${moe_ep_size}_pp_${pp_size}_bs_${batch_size}.log
              echo "Executing ${hf_model_name} with tp_size=${tp_size}, moe_ep_size=${moe_ep_size}, pp_size=${pp_size}, input_size=${input_size}, output_size=${output_size}, batch_size=${batch_size}, max_model_len=${max_model_len}, max_num_batched_tokens=${max_num_batched_tokens}"
              # benchmark_option / *_option variables are intentionally
              # unquoted: each holds multiple space-separated CLI flags.
              python3 ${BENCHMARK_CMD} \
                ${benchmark_option} \
                --trust-remote-code \
                --max-num-batched-tokens "${max_num_batched_tokens}" \
                --max-model-len "${max_model_len}" \
                --block-size 16 \
                --model "${HF_MODEL}" \
                --tokenizer "${HF_MODEL}" \
                --dtype bfloat16 \
                --input-len "${input_size}" \
                --output-len "${output_size}" \
                ${pp_option} ${tp_option} ${ep_option} \
                --max-seq-len-to-capture "${max_seq_len_to_capture}" \
                --max-num-seqs "${max_num_seqs}" \
                ${batch_size_option} \
                ${eager_option} ${ray_option} ${quantization_option} \
                2>&1 | tee "${LOG_FILE}"
              # Abort this batch-size sweep on OOM-style failures — larger
              # batches would fail as well.
              if grep -E -q "torch\.OutOfMemoryError|Ceil of batch|is larger than mlu blocks" "${LOG_FILE}"; then
                echo "Found one or more specified errors in the log file."
                break
              else
                echo "No specified errors found."
              fi
            done
          done
        done
      done
    done
  done
done

View File

@@ -0,0 +1,147 @@
#!/bin/bash
# cnperf-instrumented benchmark sweep for DeepSeek-V2 with Expert Parallel on
# Cambricon MLU. Same sweep as the plain benchmark script, but each run is
# wrapped in `cnperf-cli record`; the resulting dltrace/cnperf data is moved
# into a per-configuration directory after each successful run.
rm -rf output
mkdir -p output

DATA_DIR=/data
MODELS_DEEPSEEK_V2=(
  "${DATA_DIR}/vllm/models/LLM-Research/deepseek-v2"
)
# Quoted expansion keeps model paths with spaces intact.
MODELS=("${MODELS_DEEPSEEK_V2[@]}")

# Feature toggles.
use_ray=0
use_eager=0
use_pp=0
use_kernel_analysis=0

# context parameter
input_sizes=(1024)
output_sizes=(1)
# batch_sizes=(1 2 4 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40)
batch_sizes=(1 4 8 16 32)
# decoder parameter
# input_sizes=(1)
# output_sizes=(128)
# batch_sizes=(1 2 4 8 16 32 64 128 256 512 1024 1280 1536 1600 1616 1632 1648 1652 1656 1660 1661 1662 1663 1664 1728 1792 2048)
# batch_sizes=(1 4 8 16 32 64 128 256 512 1024 2048)
tp_sizes=(8)
moe_ep_sizes=(8 -1)
pp_sizes=(1)

if [ "${use_pp}" -gt 0 ]; then
  # Pipeline-parallel mode: throughput benchmark, no TP/EP.
  tp_sizes=(1)
  moe_ep_sizes=(-1)
  pp_sizes=(8)
  BENCHMARK_CMD=benchmarks/benchmark_throughput.py
  benchmark_option="--backend vllm --num-prompts 1000 --output-json output_throughput.csv --async-engine"
else
  BENCHMARK_CMD=benchmarks/benchmark_latency.py
  benchmark_option="--num-iters-warmup 1 --num-iters 3 --only_average"
fi
# NOTE(review): benchmark_option is computed but never passed to the cnperf
# command below — confirm whether dropping it during recording is intentional.

max_position_embeddings=163840
#export MLU_VISIBLE_DEVICES=4,5,6,7
export EXPERT_PARALLEL_EN=true
export VLLM_LATENCY_DEBUG=true
export VLLM_GRAPH_DEBUG=false
# export VLLM_DUMP_MLU_INFO=true
export OUTPUT_CSV_PATH=/data/solution-sdk/kangpengtao/tmp/deepseek/output.csv

ray_option=""
if [ "${use_ray}" -gt 0 ]; then
  ray_option="--distributed-executor-backend ray --ray-workers-use-nsight"
fi
record_option=""
if [ "${use_kernel_analysis}" -gt 0 ]; then
  # ref: https://wiki.cambricon.com/pages/viewpage.action?pageId=434445235
  export CNPERF_KERNEL_ANALYSIS=1
  record_option="--pmu --capture_range=cnpx --cnpx_include kangpengtao --cnpx_exclude kangpengtao_exec --events tp_core__write_bytes,tp_core__read_bytes,tp_memcore__write_bytes,tp_memcore__read_bytes,tp_core__lt_cycles,tp_core__csimd_pre_cycles,tp_core__csimd_post_cycles"
  # Kernel analysis implies eager mode.
  use_eager=1
fi
eager_option=""
if [ "${use_eager}" -gt 0 ]; then
  eager_option="--enforce-eager"
fi

# Sweep every combination.
for HF_MODEL in "${MODELS[@]}"; do
  quantization_option=""
  if [[ "${HF_MODEL}" == *"sq_per_token_per_channel"* ]]; then
    quantization_option="--quantization=smoothquant"
  fi
  for tp_size in "${tp_sizes[@]}"; do
    for moe_ep_size in "${moe_ep_sizes[@]}"; do
      for pp_size in "${pp_sizes[@]}"; do
        for input_size in "${input_sizes[@]}"; do
          for output_size in "${output_sizes[@]}"; do
            for batch_size in "${batch_sizes[@]}"; do
              # Shell arithmetic instead of the external `expr`.
              max_seq_len_to_capture=$((input_size + output_size))
              max_num_batched_tokens=$((batch_size * input_size))
              max_model_len=${max_seq_len_to_capture}
              if [ "${max_model_len}" -gt "${max_position_embeddings}" ]; then
                continue
              fi
              max_num_seqs=${batch_size}
              # max-num-batched-tokens must cover both one full sequence and
              # one token per concurrent sequence.
              if [ "${max_model_len}" -gt "${max_num_batched_tokens}" ]; then
                max_num_batched_tokens=${max_model_len}
              fi
              if [ "${max_num_seqs}" -gt "${max_num_batched_tokens}" ]; then
                max_num_batched_tokens=${max_num_seqs}
              fi
              pp_option="--pipeline-parallel-size ${pp_size}"
              tp_option="-tp ${tp_size}"
              ep_option="--moe-ep-size ${moe_ep_size}"
              batch_size_option=""
              if [ "${use_pp}" -le 0 ]; then
                # benchmark_latency.py takes an explicit batch size.
                batch_size_option="--batch-size ${batch_size}"
              fi
              hf_model_name=$(basename "${HF_MODEL}")
              LOG_FILE=output/${hf_model_name}_${input_size}_${output_size}_tp_${tp_size}_moe_ep_${moe_ep_size}_pp_${pp_size}_bs_${batch_size}.log
              echo "Executing ${hf_model_name} with tp_size=${tp_size}, moe_ep_size=${moe_ep_size}, pp_size=${pp_size}, input_size=${input_size}, output_size=${output_size}, batch_size=${batch_size}, max_model_len=${max_model_len}, max_num_batched_tokens=${max_num_batched_tokens}"
              dltrace_data_name="dltrace_data_${hf_model_name}_${tp_size}_${moe_ep_size}_${pp_size}_${input_size}_${output_size}_${batch_size}_${max_model_len}_${max_num_batched_tokens}"
              # Clear stale trace data left over from a previous run.
              rm -rf dltrace_data
              rm -rf cnperf_data_*
              # record_option / *_option variables are intentionally unquoted:
              # each holds multiple space-separated CLI flags.
              CNPERF_VLOG_LEVEL=0-40 cnperf-cli record ${record_option} python3 ${BENCHMARK_CMD} \
                --trust-remote-code \
                --max-num-batched-tokens "${max_num_batched_tokens}" \
                --max-model-len "${max_model_len}" \
                --block-size 16 \
                --model "${HF_MODEL}" \
                --tokenizer "${HF_MODEL}" \
                --dtype bfloat16 \
                --input-len "${input_size}" \
                --output-len "${output_size}" \
                ${pp_option} ${tp_option} ${ep_option} \
                --max-seq-len-to-capture "${max_seq_len_to_capture}" \
                --max-num-seqs "${max_num_seqs}" \
                ${batch_size_option} \
                ${eager_option} ${ray_option} ${quantization_option} \
                2>&1 | tee "${LOG_FILE}"
              # Abort this batch-size sweep on OOM-style failures — larger
              # batches would fail as well. Trace data from a failed run is
              # discarded by the rm at the top of the next iteration.
              if grep -E -q "torch\.OutOfMemoryError|Ceil of batch|is larger than mlu blocks" "${LOG_FILE}"; then
                echo "Found one or more specified errors in the log file."
                break
              else
                echo "No specified errors found."
              fi
              # Archive trace data under a per-configuration directory.
              mv dltrace_data "${dltrace_data_name}"
              mv cnperf_data_* "${dltrace_data_name}"/
            done
          done
        done
      done
    done
  done
done

View File

@@ -0,0 +1,34 @@
#!/bin/bash
# Fix: original shebang was "#/bin/bash" (missing '!'), so the file was not
# guaranteed to run under bash.
# Client-side serving benchmark: sweeps input/output lengths and concurrency
# levels against an already-running vLLM server listening on ${PORT}.
# export EXPERT_PARALLEL_EN=True
# export VLLM_LATENCY_DEBUG=True
rm -rf output/client
mkdir -p output/client

PORT=32345
MODEL_PATH="/data/vllm/sq_per_token_per_channel/deepseek_v2_temp"
input_sizes=(1024)
output_sizes=(1)
# batch_sizes=(1 2 4 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40)
batch_sizes=(32)

for input_size in "${input_sizes[@]}"; do
  for output_size in "${output_sizes[@]}"; do
    for batch_size in "${batch_sizes[@]}"; do
      # Fix: original derived this from the undefined HF_MODEL, which made
      # every log file name start with "_". Use MODEL_PATH instead.
      hf_model_name=$(basename "${MODEL_PATH}")
      LOG_FILE=output/client/${hf_model_name}_${input_size}_${output_size}_bs_${batch_size}.log
      # Fix: --random_input_len (underscores) normalized to the dashed flag
      # spelling used by benchmark_serving.py's argparse definitions.
      python benchmarks/benchmark_serving.py \
        --backend vllm \
        --model "${MODEL_PATH}" \
        --trust-remote-code \
        --dataset-name random \
        --num-prompts 1000 \
        --port "${PORT}" \
        --request-rate inf \
        --random-input-len "${input_size}" \
        --random-output-len "${output_size}" \
        --max-concurrency "${batch_size}" \
        2>&1 | tee "${LOG_FILE}"
    done
  done
done

View File

@@ -0,0 +1,2 @@
# Package init for the Expert Parallel hijack demo.
# Importing the model_executor subpackage applies the MLU hijacks as an
# import-time side effect; the print confirms the demo package was loaded.
print("Apply Expert Parallel Demo!")
from . import model_executor

View File

@@ -0,0 +1,5 @@
# model_executor subpackage init.
# Importing each module below applies the corresponding MLU hijack as an
# import-time side effect (each module calls MluHijackObject.apply_hijack
# at module level — confirmed for sparse_moe_mlp, custom and deepseek_v2;
# presumably the same holds for mixtral and qwen2_moe).
from .layers import sparse_moe_mlp
from .models import custom
from .models import mixtral
from .models import qwen2_moe
from .models import deepseek_v2

View File

@@ -0,0 +1,142 @@
"""
Inference-only MOE model.
Tensor Parallel evenly splits each expert's weight and distributes them to different ranks,
which means each rank holds partial weight of all experts.
While Expert Parallel evenly distributes some of the experts' full weight to different ranks,
which means each rank holds part of the experts' full weight.
As a result, each rank in the Tensor Parallel group receives all tokens' hidden states for all experts,
then computes using the partial weights, while for Expert Parallel, each rank only receives
part of tokens' hidden states for experts on this rank, then computes using the full weights.
When both Tensor Parallel and Expert Parallel are enabled, each rank handles
a portion of the expert weights matrices (as in EP mode) and these weights are further sliced
across ranks (as in TP mode). This hybrid approach aims to balance the workload more evenly across ranks,
enhancing efficiency and reducing the likelihood of bottlenecks associated with EP mode alone.
"""
from typing import Optional
import torch
from torch import nn
from vllm.distributed import (get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
get_tensor_model_parallel_group)
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
get_moe_tensor_parallel_rank, get_moe_tensor_parallel_world_size, get_moe_tensor_parallel_group,
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size, get_moe_expert_parallel_group)
from vllm.model_executor.layers.linear import ReplicatedLinear
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm_mlu._mlu_utils import get_device_major_capability
def vllm__mlu_hijack__model_executor__layers__feed_forward__SparseMoeMlp____init__(
    self,
    num_experts: int,
    top_k: int,
    hidden_size: int,
    intermediate_size: int,
    up_proj_name: str,
    is_gated: bool,
    down_proj_name: str,
    has_bias: bool,
    skip_bias_add: bool = False,
    renormalize: bool = False,
    hidden_act: str = "silu",
    params_dtype: Optional[torch.dtype] = None,
    quant_config: Optional[QuantizationConfig] = None,
    is_use_fused_moe: bool = False,
    expert_group: int = 1,
    topk_group: int = 1,
):
    """Replacement for SparseMoeMlp.__init__ that adds MoE TP/EP distribution.

    In addition to the stock initialization, this version reads the MoE
    tensor-parallel and expert-parallel process groups and partitions the
    experts across EP ranks: each rank instantiates only
    ``num_experts_per_rank`` FeedForward experts covering the id range
    [start_expert_id, end_expert_id), each sharded over the MoE TP group.
    """
    super(SparseMoeMlp, self).__init__()
    # Global tensor-parallel coordinates (used by callers such as forward()).
    self.tp_rank = get_tensor_model_parallel_rank()
    self.tp_size = get_tensor_model_parallel_world_size()
    self.tp_group = get_tensor_model_parallel_group()
    self.num_total_experts = num_experts
    self.top_k = top_k
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.up_proj_name = up_proj_name
    self.is_gated = is_gated
    self.down_proj_name = down_proj_name
    self.has_bias = has_bias
    self.renormalize = renormalize
    self.hidden_act = hidden_act
    self.quant_config = quant_config
    self.is_use_fused_moe = is_use_fused_moe
    self.expert_group = expert_group
    self.topk_group = topk_group
    # Fused MoE is force-disabled on devices with major capability 3
    # (per the README these are functional-test-only boards).
    if get_device_major_capability() == 3:
        self.is_use_fused_moe = False
    if params_dtype is None:
        params_dtype = torch.get_default_dtype()
    self.params_dtype = params_dtype
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: add moe relative distribution
    '''
    # MoE-specific TP/EP coordinates, distinct from the global TP group.
    self.moe_tp_size = get_moe_tensor_parallel_world_size()
    self.moe_tp_rank = get_moe_tensor_parallel_rank()
    self.moe_tp_group = get_moe_tensor_parallel_group()
    self.moe_ep_size = get_moe_expert_parallel_world_size()
    self.moe_ep_rank = get_moe_expert_parallel_rank()
    self.moe_ep_group = get_moe_expert_parallel_group()
    # NOTE: The bias for fc2 is only applied on tp_rank 0. If we added it on
    # all nodes the allreduce() would contain multiple copies of the bias.
    # The bias on other nodes will be ignored, and may be set to nullptr.
    # NOTE(review): the `skip_bias_add` parameter is effectively ignored —
    # it is unconditionally overwritten here. Confirm this is intended.
    self.skip_bias_add = True if self.moe_tp_rank > 0 else False
    assert self.num_total_experts >= self.moe_ep_size, (
        f"need num_total_experts:{self.num_total_experts} >= moe_ep_size:{self.moe_ep_size}")
    assert self.intermediate_size % self.moe_tp_size == 0, (
        f"need intermediate_size:{self.intermediate_size} % moe_tp_size:{self.moe_tp_size} == 0")
    # Ceil-divide experts over EP ranks; the last rank takes the (smaller)
    # remainder when the expert count does not divide evenly.
    self.num_experts_per_rank = (self.num_total_experts + self.moe_ep_size - 1) // self.moe_ep_size
    if self.moe_ep_rank + 1 == self.moe_ep_size and self.num_total_experts % self.moe_ep_size:
        self.num_experts_per_rank = self.num_total_experts % self.moe_ep_size
    self.start_expert_id = self.moe_ep_rank * ((self.num_total_experts + self.moe_ep_size - 1) // self.moe_ep_size)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    self.end_expert_id = self.start_expert_id + self.num_experts_per_rank
    # Gate always runs at half / full precision for now.
    self.gate = ReplicatedLinear(self.hidden_size,
                                 self.num_total_experts,
                                 bias=False,
                                 params_dtype=self.params_dtype,
                                 quant_config=None)
    # Only the experts owned by this EP rank are instantiated; each expert's
    # weights are sharded over the MoE TP group (reduce deferred to caller).
    self.experts = nn.ModuleList([
        FeedForward(hidden_size=self.hidden_size,
                    intermediate_size=self.intermediate_size,
                    hidden_act=self.hidden_act,
                    up_proj_name=self.up_proj_name,
                    is_gated=self.is_gated,
                    down_proj_name=self.down_proj_name,
                    bias=self.has_bias,
                    quant_config=self.quant_config,
                    skip_bias_add=self.skip_bias_add,
                    reduce_results=False,
                    tp_group=self.moe_tp_group) for idx in range(self.num_experts_per_rank)
    ])
    self.init_pack_param()


# Register the replacement __init__ on SparseMoeMlp at import time.
MluHijackObject.apply_hijack(SparseMoeMlp,
                             SparseMoeMlp.__init__,
                             vllm__mlu_hijack__model_executor__layers__feed_forward__SparseMoeMlp____init__)

View File

@@ -0,0 +1,183 @@
import torch
import torch.nn.functional as F
from typing import Optional
from vllm.config import CacheConfig
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
from vllm_mlu._mlu_utils import *
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm.distributed import tensor_model_parallel_all_reduce
from vllm_mlu.transformers_utils.configs import CustomConfig
from vllm_mlu.model_executor.custom_model.custom import CustomDecoderLayer, CustomAttention, _NORM_DICT
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
from vllm.model_executor.layers.linear import ReplicatedLinear
from vllm_mlu.model_executor.models.layer_utils import (
decoder_layer_forward_base, is_per_tensor_smoothquant,
is_per_token_smoothquant, quant_fusion_with_rmsnorm,
quant_fusion_with_layernorm)
class CustomMoeBlock(SparseMoeMlp):
    """MoE block for the custom model, built on the hijacked SparseMoeMlp.

    Adds optional shared experts (scaled by a sigmoid gate) on top of the
    routed experts, and performs the cross-rank all-reduce itself when the
    model does not use a parallel residual.
    """

    def __init__(
        self,
        config: CustomConfig,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        super().__init__(num_experts=config.num_experts,
                         top_k=config.num_experts_per_tok,
                         hidden_size=config.hidden_size,
                         intermediate_size=config.moe_intermediate_size,
                         up_proj_name="gate_up_proj",
                         is_gated=config.is_gated,
                         down_proj_name="down_proj",
                         has_bias=config.mlp_bias,
                         skip_bias_add=False,
                         renormalize=config.norm_topk_prob,
                         hidden_act=config.hidden_act,
                         params_dtype=None,
                         quant_config=quant_config,
                         is_use_fused_moe=True)
        self.config = config
        self.rank = self.tp_rank
        self.shared_expert = None
        self.shared_expert_gate = None
        if config.shared_expert_intermediate_size > 0:
            # Shared expert runs on every token; the all-reduce is deferred
            # to forward() (reduce_results=False).
            self.shared_expert = FeedForward(hidden_size=config.hidden_size,
                                             intermediate_size=config.shared_expert_intermediate_size,
                                             hidden_act=config.hidden_act,
                                             up_proj_name='gate_up_proj',
                                             is_gated=config.is_gated,
                                             down_proj_name='down_proj',
                                             bias=config.mlp_bias,
                                             quant_config=quant_config,
                                             reduce_results=False)
            # Scalar gate that scales the shared-expert output per token.
            self.shared_expert_gate = ReplicatedLinear(config.hidden_size,
                                                       1,
                                                       bias=False,
                                                       params_dtype=self.params_dtype,
                                                       quant_config=None)

    def forward(self, hidden_states: torch.Tensor, residual: Optional[torch.Tensor] = None) -> torch.Tensor:
        num_tokens, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)
        shared_output = None
        if self.shared_expert is not None:
            shared_output = self.shared_expert(hidden_states)
            if self.shared_expert_gate is not None:
                gate_output = self.shared_expert_gate(hidden_states)
                shared_output = F.sigmoid(gate_output[0]) * shared_output
        # router_logits: (num_tokens, n_experts)
        router_logits, _ = self.gate(hidden_states)
        residual_ = None if self.rank > 0 else residual
        # NOTE(review): residual_ is computed but never used — forward_experts
        # below receives the original `residual`. Confirm which was intended.
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: modify bt_ops.fused_moe to forward_experts
        '''
        final_hidden_states = self.forward_experts(hidden_states, router_logits, residual)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        if shared_output is not None:
            final_hidden_states = final_hidden_states + shared_output
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: add comment to explain use_parallel_residual usage
        '''
        # use_parallel_residual = True: x = x + attn(ln1(x)) + mlp(ln2(x))
        # use_parallel_residual = False:
        #   if apply_residual_connection_post_layernorm:
        #     x_attn = ln1(x) + attn(ln1(x))
        #     x_mlp = ln2(x_attn) + mlp(ln2(x_attn))
        #   else:
        #     x_attn = x + attn(ln1(x))
        #     x_mlp = x_attn + mlp(ln2(x_attn))
        # When use_parallel_residual = True, x is shared between attn and mlp,
        # so we only need to reduce after x + attn(ln1(x)) + mlp(ln2(x)) and
        # don't need to reduce here. But when use_parallel_residual = False,
        # the mlp layer consumes the attn layer's output, so we must reduce
        # once the mlp is finished.
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        reduce_results = (self.config.use_parallel_residual == False)
        if reduce_results and self.tp_size > 1:
            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
        return final_hidden_states.view(num_tokens, hidden_dim)
def vllm__mlu_hijack__model_executor__custom_model__custom__CustomDecoderLayer____init__(
    self,
    config: CustomConfig,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
) -> None:
    """Replacement for CustomDecoderLayer.__init__.

    Identical to the stock initializer except that the MoE branch builds the
    CustomMoeBlock defined in this file (which carries the Expert Parallel
    support) instead of the original MoE block.
    """
    super(CustomDecoderLayer, self).__init__()
    self.config = config
    self.self_attn = CustomAttention(
        config=config,
        cache_config=cache_config,
        quant_config=quant_config,
    )
    mlp_bias = getattr(config, "mlp_bias", False) or getattr(config, "bias", False)
    is_gated = getattr(config, "is_gated", False)
    if config.num_experts is not None:
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: nothing changed, only use the CustomMoeBlock class in this file
        '''
        self.mlp = CustomMoeBlock(config=config,
                                  quant_config=quant_config)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
    else:
        # Dense (non-MoE) layer: a single FeedForward MLP.
        self.mlp = FeedForward(hidden_size=config.hidden_size,
                               intermediate_size=config.intermediate_size,
                               hidden_act=self.config.hidden_act,
                               up_proj_name='up_proj',
                               is_gated=is_gated,
                               down_proj_name='down_proj',
                               bias=mlp_bias,
                               quant_config=quant_config,
                               skip_bias_add=(self.config.use_parallel_residual and mlp_bias),
                               reduce_results=(self.config.use_parallel_residual == False))
    self.input_layernorm = _NORM_DICT[self.config.norm_type](config.hidden_size, eps=config.norm_eps)
    self.post_attention_layernorm = _NORM_DICT[self.config.norm_type](config.hidden_size, eps=config.norm_eps)
    # perf per-tensor sq cases by fusing quantization in layernorm
    # NOTE(review): "tesnor" typo is kept — other project files may reference
    # this attribute name; renaming it here would break them.
    self.is_per_tesnor_sq_perf_cases = (is_per_tensor_smoothquant(quant_config) and
                                        not self.config.apply_residual_connection_post_layernorm)
    self.is_per_token_sq_perf_cases = (is_per_token_smoothquant(quant_config) and
                                       not self.config.apply_residual_connection_post_layernorm)
    # NOTE(review): indentation below is reconstructed from a whitespace-
    # stripped view — confirm the nesting against the vllm_mlu source.
    if self.is_per_tesnor_sq_perf_cases or self.is_per_token_sq_perf_cases:
        # Quantization of the attention input is fused into the layernorm.
        self.self_attn.qkv_proj.quant_method.skip_quant_input = True
        self.quant_fusion_attn_layernorm = None
        self.is_moe = config.num_experts is not None
        self.use_rmsnorm = self.config.norm_type == "rmsnorm"
        if not self.is_moe:
            self.mlp.up_proj.quant_method.skip_quant_input = True
            self.quant_fusion_mlp_layernorm = None


# Register the replacement __init__ on CustomDecoderLayer at import time.
MluHijackObject.apply_hijack(CustomDecoderLayer,
                             CustomDecoderLayer.__init__,
                             vllm__mlu_hijack__model_executor__custom_model__custom__CustomDecoderLayer____init__)

View File

@@ -0,0 +1,222 @@
import re
import torch
from torch import nn
from typing import Any, Dict, Iterable, List, Optional, Tuple
from transformers import PretrainedConfig
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
MergedColumnParallelLinear,
ReplicatedLinear,
RowParallelLinear)
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
from vllm.model_executor.models.utils import is_pp_missing_parameter
from vllm.model_executor.models.deepseek_v2 import DeepseekV2ForCausalLM
from vllm_mlu.model_executor.models.deepseek_v2 import DeepseekV2MoE
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
def vllm_mlu__model_executor__models__deepseek_v2__DeepseekV2MoE____init__(
    self,
    config: PretrainedConfig,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
):
    """Replacement for DeepseekV2MoE.__init__ built on SparseMoeMlp.

    Initializes the MoE block through SparseMoeMlp (which sets up the MoE
    TP/EP process-group state and the per-rank expert partition), then adds
    the DeepSeek-V2 specific router gate and optional shared experts.

    Fix vs. original: `self.routed_scaling_factor` was assigned twice with
    the identical value; the duplicate assignment is removed.
    """
    super(DeepseekV2MoE, self).__init__(num_experts=config.n_routed_experts,
                                        top_k=config.num_experts_per_tok,
                                        hidden_size=config.hidden_size,
                                        intermediate_size=config.moe_intermediate_size,
                                        up_proj_name="gate_up_proj",
                                        is_gated=True,
                                        down_proj_name="down_proj",
                                        has_bias=False,
                                        skip_bias_add=False,
                                        renormalize=config.norm_topk_prob,
                                        hidden_act=config.hidden_act,
                                        params_dtype=None,
                                        quant_config=quant_config,
                                        is_use_fused_moe=True,
                                        expert_group=config.n_group,
                                        topk_group=config.topk_group)
    self.config = config
    self.routed_scaling_factor = config.routed_scaling_factor
    self.n_shared_experts = config.n_shared_experts
    # Validate the MoE tensor-parallel degree against the expert count.
    if self.moe_tp_size > config.n_routed_experts:
        raise ValueError(
            f"Moe Tensor parallel size {self.moe_tp_size} is greater than "
            f"the number of experts {config.n_routed_experts}.")
    if config.hidden_act != "silu":
        raise ValueError(f"Unsupported activation: {config.hidden_act}. "
                         "Only silu is supported for now.")
    # Router gate is replicated (no TP sharding) and unquantized.
    self.gate = ReplicatedLinear(config.hidden_size,
                                 config.n_routed_experts,
                                 bias=False,
                                 quant_config=None,
                                 prefix=f"{prefix}.gate")
    if config.n_shared_experts is not None:
        intermediate_size = (config.moe_intermediate_size *
                             config.n_shared_experts)
        # Modify by vllm_mlu: replace the stock MLP with FeedForward; the
        # all-reduce is deferred to the caller (reduce_results=False).
        self.shared_experts = FeedForward(hidden_size=config.hidden_size,
                                          intermediate_size=intermediate_size,
                                          hidden_act=config.hidden_act,
                                          up_proj_name='gate_up_proj',
                                          is_gated=True,
                                          down_proj_name='down_proj',
                                          bias=False,
                                          quant_config=quant_config,
                                          reduce_results=False)
def vllm__module_executor__models__deepseek_v2__DeepseekV2ForCausalLM__load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
    """Replacement for DeepseekV2ForCausalLM.load_weights with EP support.

    Differences from stock vLLM: packs every SparseMoeMlp's parameters first,
    remaps global expert ids in checkpoint names to this rank's local ids,
    and silently skips expert / shared-expert weights that are not owned by
    this expert-parallel rank.
    """
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: pack params and cal start expert id
    '''
    for name, m in self.model.named_modules():
        if isinstance(m, SparseMoeMlp):
            m.pack_params()
    # expert parallel modification start
    moe_ep_rank = get_moe_expert_parallel_rank()
    moe_ep_size = get_moe_expert_parallel_world_size()
    num_total_experts = self.config.n_routed_experts
    # Ceil-divide partition — mirrors the expert split in the hijacked
    # SparseMoeMlp.__init__.
    start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size)
    # expert parallel modification end
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]
    # Modify by vllm_mlu: the upstream expert_params_mapping was removed —
    # expert weights are addressed directly via the local-id renaming below.
    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        if "rotary_emb.inv_freq" in name:
            continue
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace expert_id in weight to named_expert_id in params_dict
        '''
        # Rename experts.<global_id> -> experts.<global_id - start_expert_id>.
        # Experts not owned by this rank produce names (possibly with negative
        # local ids) that are absent from params_dict and get skipped below.
        if start_expert_id > 0 and "mlp.experts." in name:
            expert_str = re.search(r'experts\.\d+', name).group(0)
            expert_id = int(expert_str.split(".")[1])
            named_expert_id = expert_id - start_expert_id
            old_expert_name = f"experts.{expert_id}"
            new_expert_name = f"experts.{named_expert_id}"
            name = name.replace(old_expert_name, new_expert_name)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            # Skip non-stacked layers; names not containing the shard name
            # fall through to the else branch below.
            if weight_name not in name:
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skipped condition and delete the useless
            "if name not in params_dict: continue" condition
            '''
            name = name.replace(weight_name, param_name)
            # Skip expert / shared-expert weights not present on this rank.
            if (("mlp.experts." in name or "mlp.shared_experts." in name or "mlp.shared_expert_gate." in name)
                    and name not in params_dict):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            if is_pp_missing_parameter(name, self):
                continue
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skipped condition
            '''
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            # Skip expert / shared-expert weights not present on this rank.
            if (("mlp.experts." in name or "mlp.shared_experts." in name or "mlp.shared_expert_gate." in name)
                    and name not in params_dict):
                continue
            if is_pp_missing_parameter(name, self):
                continue
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
            '''
            ==================
            End of MLU Hijack
            ==================
            '''


# Register both hijacks on the upstream classes at import time.
MluHijackObject.apply_hijack(DeepseekV2MoE,
                             DeepseekV2MoE.__init__,
                             vllm_mlu__model_executor__models__deepseek_v2__DeepseekV2MoE____init__)
MluHijackObject.apply_hijack(DeepseekV2ForCausalLM,
                             DeepseekV2ForCausalLM.load_weights,
                             vllm__module_executor__models__deepseek_v2__DeepseekV2ForCausalLM__load_weights)

View File

@@ -0,0 +1,143 @@
import torch
import re
import vllm
from torch import nn
from typing import List, Optional, Tuple, Iterable
from vllm_mlu._mlu_utils import *
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm.model_executor.models.mixtral import MixtralForCausalLM
from vllm.model_executor.model_loader.weight_utils import default_weight_loader, maybe_remap_kv_scale_name
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
from vllm.model_executor.models.utils import is_pp_missing_parameter
def vllm__module_executor__models__mixtral__MixtralForCausalLM__load_weights(
        self,
        weights: Iterable[Tuple[str, torch.Tensor]]):
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: pack params and cal start expert id
    '''
    # Replacement for MixtralForCausalLM.load_weights that is aware of MoE
    # expert parallelism: each EP rank loads only the experts it owns.
    # Pack per-expert MLP weights into the fused layout used by the MLU
    # SparseMoeMlp module before any checkpoint tensors are loaded.
    for name, m in self.model.named_modules():
        if isinstance(m, SparseMoeMlp):
            m.pack_params()
    # expert parallel modification start
    # Experts are partitioned contiguously across EP ranks; start_expert_id
    # is the global id of this rank's first expert (ceil division, so the
    # last rank may own fewer experts).
    moe_ep_rank = get_moe_expert_parallel_rank()
    moe_ep_size = get_moe_expert_parallel_world_size()
    num_total_experts = self.config.num_local_experts
    start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size)
    # expert parallel modification end
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    # Maps checkpoint shard names to the fused parameter that holds them.
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("w13", "w1", 0),
        ("w13", "w3", 1),
    ]
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: delete expert_params_mapping for no useless
    '''
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        # Rotary inv_freq buffers are recomputed at runtime, never loaded.
        if "rotary_emb.inv_freq" in name:
            continue
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace expert_id in weight to named_expert_id in params_dict
        '''
        # Translate the checkpoint's global expert id into this rank's local
        # id space (experts.<global> -> experts.<global - start_expert_id>).
        # Experts owned by earlier ranks become a negative id, which cannot
        # exist in params_dict and is filtered out by the skip checks below.
        if start_expert_id > 0 and "block_sparse_moe.experts." in name:
            expert_str = re.search(r'experts\.\d+', name).group(0)
            expert_id=int(expert_str.split(".")[1])
            named_expert_id = expert_id - start_expert_id
            old_expert_name = f"experts.{expert_id}"
            new_expert_name = f"experts.{named_expert_id}"
            name = name.replace(old_expert_name, new_expert_name)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                    and name not in params_dict):
                continue
            # Skip parameters owned by other pipeline-parallel stages.
            if is_pp_missing_parameter(name, self):
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("block_sparse_moe.experts." in name) and (name not in params_dict)):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            # Non-stacked parameter: load it verbatim.
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                    and name not in params_dict):
                continue
            if is_pp_missing_parameter(name, self):
                continue
            # Remapping the name of FP8 kv-scale.
            name = maybe_remap_kv_scale_name(name, params_dict)
            if name is None:
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("block_sparse_moe.experts." in name) and (name not in params_dict)):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
# Register the hijack at import time: every MixtralForCausalLM instance
# created by vLLM will use the EP-aware loader defined above instead of the
# stock load_weights implementation.
MluHijackObject.apply_hijack(MixtralForCausalLM,
                             MixtralForCausalLM.load_weights,
                             vllm__module_executor__models__mixtral__MixtralForCausalLM__load_weights)

View File

@@ -0,0 +1,179 @@
import torch
import re
from typing import Optional, Iterable, Tuple
from vllm_mlu._mlu_utils import *
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm.model_executor.models.qwen2_moe import Qwen2MoeForCausalLM
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
from vllm.utils import print_warning_once
from vllm.model_executor.models.utils import is_pp_missing_parameter
def vllm__module_executor__models__qwen2moe__Qwen2MoeForCausalLM__load_weights(
        self,
        weights: Iterable[Tuple[str, torch.Tensor]]):
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: pack params and cal start expert id
    '''
    # Replacement for Qwen2MoeForCausalLM.load_weights that is aware of MoE
    # expert parallelism: each EP rank loads only the experts it owns.
    # Pack per-expert MLP weights into the fused SparseMoeMlp layout before
    # any checkpoint tensors are loaded.
    for name, m in self.model.named_modules():
        if isinstance(m, SparseMoeMlp):
            m.pack_params()
    # expert parallel modification start
    # Experts are partitioned contiguously across EP ranks; start_expert_id
    # is the global id of this rank's first expert (ceil division, so the
    # last rank may own fewer experts).
    moe_ep_rank = get_moe_expert_parallel_rank()
    moe_ep_size = get_moe_expert_parallel_world_size()
    num_total_experts = self.config.num_experts
    start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size)
    # expert parallel modification end
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    # Maps checkpoint shard names to the fused parameter that holds them.
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: delete expert_params_mapping for no useless
    '''
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        # Rotary inv_freq buffers are recomputed at runtime, never loaded.
        if "rotary_emb.inv_freq" in name:
            continue
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace expert_id in weight to named_expert_id in params_dict
        '''
        # Translate the checkpoint's global expert id into this rank's local
        # id space (experts.<global> -> experts.<global - start_expert_id>).
        # Experts owned by earlier ranks become a negative id, which cannot
        # exist in params_dict and is filtered out by the skip checks below.
        if start_expert_id > 0 and "mlp.experts." in name:
            expert_str = re.search(r'experts\.\d+', name).group(0)
            expert_id=int(expert_str.split(".")[1])
            named_expert_id = expert_id - start_expert_id
            old_expert_name = f"experts.{expert_id}"
            new_expert_name = f"experts.{named_expert_id}"
            name = name.replace(old_expert_name, new_expert_name)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: delete if "mlp.experts" in name: continue condition
            '''
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                    and name not in params_dict):
                continue
            # Skip layers on other devices.
            if is_pp_missing_parameter(name, self):
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition and delete useless if name not in params_dict: continue condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("mlp.experts." in name or "mlp.shared_expert." in name or "mlp.shared_expert_gate." in name)
                    and name not in params_dict):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: delete for mapping in expert_params_mapping condition
            '''
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            # Non-stacked parameter: load it verbatim.
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                    and name not in params_dict):
                continue
            # Skip layers on other devices.
            if is_pp_missing_parameter(name, self):
                continue
            # Remapping the name of FP8 kv-scale.
            if name.endswith("kv_scale"):
                remapped_kv_scale_name = name.replace(
                    ".kv_scale", ".attn.kv_scale")
                if remapped_kv_scale_name not in params_dict:
                    print_warning_once(
                        "Found kv scale in the checkpoint "
                        f"(e.g. {name}), but not found the expected "
                        f"name in the model "
                        f"(e.g. {remapped_kv_scale_name}). "
                        "kv-scale is not loaded.")
                    continue
                else:
                    name = remapped_kv_scale_name
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("mlp.experts." in name or "mlp.shared_expert." in name or "mlp.shared_expert_gate." in name)
                    and name not in params_dict):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
# Register the hijack at import time: every Qwen2MoeForCausalLM instance
# created by vLLM will use the EP-aware loader defined above instead of the
# stock load_weights implementation.
MluHijackObject.apply_hijack(Qwen2MoeForCausalLM,
                             Qwen2MoeForCausalLM.load_weights,
                             vllm__module_executor__models__qwen2moe__Qwen2MoeForCausalLM__load_weights)

View File

@@ -0,0 +1,61 @@
# Offline Expert-Parallel inference demo (Qwen1.5-MoE on MLU).
# EXPERT_PARALLEL_EN must be set BEFORE vLLM is imported so the MLU hijack
# modules see it enabled at import time.
import os
os.environ['EXPERT_PARALLEL_EN'] = "True"
from vllm import LLM, SamplingParams
# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
model_dir="/data/AE/llm/models/Qwen1.5-MoE-A2.7B"
# Parallelism: moe_ep_size must divide tensor_parallel_size (see README).
tp_size = 2
moe_ep_size=2
# When True, run the smooth-quant activation-range hooks and print how many
# range entries were collected before generation.
is_check_act_range = True
# Sequence/batch geometry for this demo run.
input_seq_len=64
output_seq_len=1
batch=1
# max_position_embedding=1024
max_model_len=input_seq_len + output_seq_len
# if max_model_len < max_position_embedding:
#     max_model_len = max_position_embedding
# The scheduler must be able to batch at least one full-length sequence.
max_num_batched_tokens=input_seq_len * batch
if max_model_len > max_num_batched_tokens:
    max_num_batched_tokens=max_model_len
max_num_seqs = batch
if __name__ == '__main__':
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8)
    # Create an LLM.
    llm = LLM(model=model_dir,
              trust_remote_code=True,
              enforce_eager=True,
              dtype='bfloat16',
              max_model_len=max_model_len,
              max_num_batched_tokens=max_num_batched_tokens,
              max_num_seqs=max_num_seqs,
              tensor_parallel_size=tp_size,
              moe_ep_size=moe_ep_size,
              )
    if is_check_act_range:
        # NOTE(review): hooks are installed and removed before generate() is
        # called; presumably get_act_range reports ranges gathered during
        # engine warm-up -- confirm the intended hook lifetime.
        llm.llm_engine.model_executor._run_workers("setup_smooth_hook", is_save_moe_info=True)
        llm.llm_engine.model_executor._run_workers("remove_hooks")
        act_range = llm.llm_engine.model_executor._run_workers("get_act_range")
        print(f"len(act_range)={len(act_range)}")
    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

View File

@@ -0,0 +1,48 @@
#!/bin/bash
# Launch a vLLM OpenAI-compatible API server for a smooth-quantized
# DeepSeek-V2 checkpoint, with either pipeline parallelism (PP8) or tensor
# parallelism (TP8). Server output is mirrored to output/server/server.log.
#
# Fixes vs. original: the shebang was '#/bin/bash' (missing '!', i.e. just a
# comment), options were placed after the rm operand, and expansions were
# unquoted.
set -euo pipefail

rm -rf -- output/server
mkdir -p output/server

PORT=32345

# Toggles: set to 1 to enable.
use_ray=0   # distribute workers with Ray
use_pp=1    # pipeline parallel (PP8); otherwise tensor parallel (TP8)
use_eager=0 # disable graph capture (--enforce-eager)

# Collect optional flags in an array so empty options never become empty
# positional arguments and values are never word-split.
extra_args=()
if (( use_eager > 0 )); then
  extra_args+=(--enforce-eager)
fi
if (( use_ray > 0 )); then
  extra_args+=(--worker-use-ray)
  # Best effort: a stale Ray cluster would break worker startup.
  ray stop --force || true
fi

export VLLM_ENGINE_ITERATION_TIMEOUT_S=180
MODEL_PATH="/data/vllm/sq_per_token_per_channel/deepseek_v2_temp"

if (( use_pp > 0 )); then
  extra_args+=(--pipeline-parallel-size=8)
else
  extra_args+=(--tensor-parallel-size=8)
fi

# 8-way parallel server (PP8 or TP8 per use_pp above).
python -m vllm.entrypoints.openai.api_server \
  --disable-log-requests \
  --port "${PORT}" \
  --model "${MODEL_PATH}" \
  --trust-remote-code \
  --swap-space 16 \
  --max-num-batched-tokens=40960 \
  --max-model-len=1034 \
  --block-size=16 \
  --dtype=bfloat16 \
  --max-seq-len-to-capture=1034 \
  --max-num-seqs=40 \
  --quantization=smoothquant \
  "${extra_args[@]}" \
  2>&1 | tee output/server/server.log