forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
This commit is contained in:
@@ -0,0 +1,26 @@
|
||||
### 简介
|
||||
|
||||
该example是vLLM中进行Expert Parallel的实验,mlu_hijack是对仓库代码的劫持,避免修改主仓库代码
|
||||
|
||||
### 支持模型
|
||||
|
||||
- qwen2_moe
|
||||
- mixtral
|
||||
- custom model
|
||||
- deepseek_v2
|
||||
|
||||
### 支持板卡
|
||||
|
||||
300系列设备只能用于功能测试,性能测试需要其他系列设备。
|
||||
|
||||
### 运行demo
|
||||
```python
|
||||
python examples/cambricon_custom_func/expert_parallel/offline_inference.py
|
||||
```
|
||||
|
||||
### 使用Expert Parallel特性
|
||||
|
||||
- 设置环境变量export EXPERT_PARALLEL_EN=1|True|true|TRUE, LLM主接口传入tensor_parallel_size的同时,传入moe_tp_size或moe_ep_size,或两者都传;
|
||||
- 若只传moe_tp_size和moe_ep_size中的一个,另一个等于tensor_parallel_size除以已传入的那个值,所以必须保证tensor_parallel_size可以被传入的值整除;
|
||||
- 若moe_tp_size和moe_ep_size都传入,则必须保证moe_tp_size * moe_ep_size == tensor_parallel_size;
|
||||
- 若moe_tp_size和moe_ep_size都不传,则它们默认值等于-1,即不开启专家并行;
|
||||
@@ -0,0 +1,133 @@
|
||||
#!/bin/bash
# Sweep benchmark for Expert Parallel on MLU: iterates over model x tp x
# moe_ep x pp x input/output/batch sizes and runs vLLM's latency (default)
# or throughput (use_pp=1) benchmark, logging each run under ./output.
# Note: no `set -e` on purpose — a failing benchmark run must not abort the
# sweep; failures are detected by grepping the per-run log below.

rm -rf output          # fixed: options before operands (was `rm output -rf`)
mkdir -p output

DATA_DIR=/data
MODELS_DEEPSEEK_V2=(
    "${DATA_DIR}/vllm/models/LLM-Research/deepseek-v2"
)

# fixed: quote the array expansion so paths with spaces survive the copy
MODELS=("${MODELS_DEEPSEEK_V2[@]}")

# Switches
use_ray=0
use_eager=0
use_pp=0

# context parameter
input_sizes=(1024)
output_sizes=(1)
# batch_sizes=(1 2 4 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40)
batch_sizes=(1 4 8 16 32)

# decoder parameter
# input_sizes=(1)
# output_sizes=(128)
# batch_sizes=(1 2 4 8 16 32 64 128 256 512 1024 1280 1536 1600 1616 1632 1648 1652 1656 1660 1661 1662 1663 1664 1728 1792 2048)
# batch_sizes=(1 4 8 16 32 64 128 256 512 1024 2048)

tp_sizes=(8)
moe_ep_sizes=(8 -1)
pp_sizes=(1)

if [ "$use_pp" -gt 0 ]; then
    # Pipeline-parallel mode: fixed 8-stage PP, no TP/EP, throughput benchmark.
    tp_sizes=(1)
    moe_ep_sizes=(-1)
    pp_sizes=(8)
    BENCHMARK_CMD=benchmarks/benchmark_throughput.py
    benchmark_option="--backend vllm --num-prompts 1000 --output-json output_throughput.csv --async-engine"
else
    BENCHMARK_CMD=benchmarks/benchmark_latency.py
    benchmark_option="--num-iters-warmup 1 --num-iters 3 --only_average"
fi

max_position_embeddings=163840

#export MLU_VISIBLE_DEVICES=4,5,6,7
export EXPERT_PARALLEL_EN=true
export VLLM_LATENCY_DEBUG=true
export VLLM_GRAPH_DEBUG=false
# export VLLM_DUMP_MLU_INFO=true
export OUTPUT_CSV_PATH=/data/solution-sdk/kangpengtao/tmp/deepseek/output.csv

ray_option=""
if [ "$use_ray" -gt 0 ]; then
    ray_option="--distributed-executor-backend ray --ray-workers-use-nsight"
fi
eager_option=""
if [ "$use_eager" -gt 0 ]; then
    eager_option="--enforce-eager"
fi

# Iterate over all combinations.
for HF_MODEL in "${MODELS[@]}"; do
    quantization_option=""
    if [[ "${HF_MODEL}" == *"sq_per_token_per_channel"* ]]; then
        quantization_option="--quantization=smoothquant"
    fi
    for tp_size in "${tp_sizes[@]}"; do
    for moe_ep_size in "${moe_ep_sizes[@]}"; do
    for pp_size in "${pp_sizes[@]}"; do
    for input_size in "${input_sizes[@]}"; do
    for output_size in "${output_sizes[@]}"; do
    for batch_size in "${batch_sizes[@]}"; do
        # fixed: shell arithmetic instead of spawning `expr` per iteration
        max_seq_len_to_capture=$(( input_size + output_size ))
        max_num_batched_tokens=$(( batch_size * input_size ))
        max_model_len=$max_seq_len_to_capture
        if [ "$max_model_len" -gt "$max_position_embeddings" ]; then
            continue
        fi
        # max_num_seqs=256
        # if [ $max_num_seqs -lt $batch_size ]; then
        #     max_num_seqs=$batch_size
        # fi
        max_num_seqs=$batch_size
        # Engine requires max_num_batched_tokens >= max_model_len and >= max_num_seqs.
        if [ "$max_model_len" -gt "$max_num_batched_tokens" ]; then
            max_num_batched_tokens=$max_model_len
        fi
        if [ "$max_num_seqs" -gt "$max_num_batched_tokens" ]; then
            max_num_batched_tokens=$max_num_seqs
        fi

        pp_option="--pipeline-parallel-size ${pp_size}"
        tp_option="-tp ${tp_size}"
        ep_option="--moe-ep-size ${moe_ep_size}"
        batch_size_option=""
        if [ "$use_pp" -le 0 ]; then
            batch_size_option="--batch-size ${batch_size}"
        fi

        hf_model_name=$(basename "${HF_MODEL}")
        LOG_FILE=output/${hf_model_name}_${input_size}_${output_size}_tp_${tp_size}_moe_ep_${moe_ep_size}_pp_${pp_size}_bs_${batch_size}.log
        echo "Executing ${hf_model_name} with tp_size=${tp_size}, moe_ep_size=${moe_ep_size}, pp_size=${pp_size}, input_size=${input_size}, output_size=${output_size}, batch_size=${batch_size}, max_model_len=${max_model_len}, max_num_batched_tokens=${max_num_batched_tokens}"
        # shellcheck disable=SC2086 -- the *_option variables intentionally
        # hold multiple whitespace-separated arguments.
        python3 ${BENCHMARK_CMD} \
            ${benchmark_option} \
            --trust-remote-code \
            --max-num-batched-tokens "${max_num_batched_tokens}" \
            --max-model-len "${max_model_len}" \
            --block-size 16 \
            --model "${HF_MODEL}" \
            --tokenizer "${HF_MODEL}" \
            --dtype bfloat16 \
            --input-len "${input_size}" \
            --output-len "${output_size}" \
            ${pp_option} ${tp_option} ${ep_option} \
            --max-seq-len-to-capture "${max_seq_len_to_capture}" \
            --max-num-seqs "${max_num_seqs}" \
            ${batch_size_option} \
            ${eager_option} ${ray_option} ${quantization_option} \
            2>&1 | tee "${LOG_FILE}"
        # Stop growing the batch size once the run hits OOM / capacity errors.
        if grep -E -q "torch\.OutOfMemoryError|Ceil of batch|is larger than mlu blocks" "$LOG_FILE"; then
            echo "Found one or more specified errors in the log file."
            break
        else
            echo "No specified errors found."
        fi
    done
    done
    done
    done
    done
    done
done
|
||||
@@ -0,0 +1,147 @@
|
||||
#!/bin/bash
# Same sweep as the plain benchmark script, but every run is wrapped in
# `cnperf-cli record` and the resulting dltrace/cnperf data is archived per
# configuration. With use_kernel_analysis=1, PMU kernel-analysis events are
# captured as well (forces eager mode).
# Note: no `set -e` on purpose — a failing benchmark run must not abort the
# sweep; failures are detected by grepping the per-run log below.

rm -rf output          # fixed: options before operands (was `rm output -rf`)
mkdir -p output

DATA_DIR=/data
MODELS_DEEPSEEK_V2=(
    "${DATA_DIR}/vllm/models/LLM-Research/deepseek-v2"
)

# fixed: quote the array expansion so paths with spaces survive the copy
MODELS=("${MODELS_DEEPSEEK_V2[@]}")

# Switches
use_ray=0
use_eager=0
use_pp=0
use_kernel_analysis=0

# context parameter
input_sizes=(1024)
output_sizes=(1)
# batch_sizes=(1 2 4 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40)
batch_sizes=(1 4 8 16 32)

# decoder parameter
# input_sizes=(1)
# output_sizes=(128)
# batch_sizes=(1 2 4 8 16 32 64 128 256 512 1024 1280 1536 1600 1616 1632 1648 1652 1656 1660 1661 1662 1663 1664 1728 1792 2048)
# batch_sizes=(1 4 8 16 32 64 128 256 512 1024 2048)

tp_sizes=(8)
moe_ep_sizes=(8 -1)
pp_sizes=(1)

if [ "$use_pp" -gt 0 ]; then
    # Pipeline-parallel mode: fixed 8-stage PP, no TP/EP, throughput benchmark.
    tp_sizes=(1)
    moe_ep_sizes=(-1)
    pp_sizes=(8)
    BENCHMARK_CMD=benchmarks/benchmark_throughput.py
    benchmark_option="--backend vllm --num-prompts 1000 --output-json output_throughput.csv --async-engine"
else
    BENCHMARK_CMD=benchmarks/benchmark_latency.py
    benchmark_option="--num-iters-warmup 1 --num-iters 3 --only_average"
fi

max_position_embeddings=163840

#export MLU_VISIBLE_DEVICES=4,5,6,7
export EXPERT_PARALLEL_EN=true
export VLLM_LATENCY_DEBUG=true
export VLLM_GRAPH_DEBUG=false
# export VLLM_DUMP_MLU_INFO=true
export OUTPUT_CSV_PATH=/data/solution-sdk/kangpengtao/tmp/deepseek/output.csv

ray_option=""
if [ "$use_ray" -gt 0 ]; then
    ray_option="--distributed-executor-backend ray --ray-workers-use-nsight"
fi

record_option=""
if [ "$use_kernel_analysis" -gt 0 ]; then
    # ref: https://wiki.cambricon.com/pages/viewpage.action?pageId=434445235
    export CNPERF_KERNEL_ANALYSIS=1
    record_option="--pmu --capture_range=cnpx --cnpx_include kangpengtao --cnpx_exclude kangpengtao_exec --events tp_core__write_bytes,tp_core__read_bytes,tp_memcore__write_bytes,tp_memcore__read_bytes,tp_core__lt_cycles,tp_core__csimd_pre_cycles,tp_core__csimd_post_cycles"
    # Kernel analysis requires eager execution.
    use_eager=1
fi

eager_option=""
if [ "$use_eager" -gt 0 ]; then
    eager_option="--enforce-eager"
fi

# Iterate over all combinations.
for HF_MODEL in "${MODELS[@]}"; do
    quantization_option=""
    if [[ "${HF_MODEL}" == *"sq_per_token_per_channel"* ]]; then
        quantization_option="--quantization=smoothquant"
    fi
    for tp_size in "${tp_sizes[@]}"; do
    for moe_ep_size in "${moe_ep_sizes[@]}"; do
    for pp_size in "${pp_sizes[@]}"; do
    for input_size in "${input_sizes[@]}"; do
    for output_size in "${output_sizes[@]}"; do
    for batch_size in "${batch_sizes[@]}"; do
        # fixed: shell arithmetic instead of spawning `expr` per iteration
        max_seq_len_to_capture=$(( input_size + output_size ))
        max_num_batched_tokens=$(( batch_size * input_size ))
        max_model_len=$max_seq_len_to_capture
        if [ "$max_model_len" -gt "$max_position_embeddings" ]; then
            continue
        fi
        # max_num_seqs=256
        # if [ $max_num_seqs -lt $batch_size ]; then
        #     max_num_seqs=$batch_size
        # fi
        max_num_seqs=$batch_size
        # Engine requires max_num_batched_tokens >= max_model_len and >= max_num_seqs.
        if [ "$max_model_len" -gt "$max_num_batched_tokens" ]; then
            max_num_batched_tokens=$max_model_len
        fi
        if [ "$max_num_seqs" -gt "$max_num_batched_tokens" ]; then
            max_num_batched_tokens=$max_num_seqs
        fi

        pp_option="--pipeline-parallel-size ${pp_size}"
        tp_option="-tp ${tp_size}"
        ep_option="--moe-ep-size ${moe_ep_size}"
        batch_size_option=""
        if [ "$use_pp" -le 0 ]; then
            batch_size_option="--batch-size ${batch_size}"
        fi

        hf_model_name=$(basename "${HF_MODEL}")
        LOG_FILE=output/${hf_model_name}_${input_size}_${output_size}_tp_${tp_size}_moe_ep_${moe_ep_size}_pp_${pp_size}_bs_${batch_size}.log
        echo "Executing ${hf_model_name} with tp_size=${tp_size}, moe_ep_size=${moe_ep_size}, pp_size=${pp_size}, input_size=${input_size}, output_size=${output_size}, batch_size=${batch_size}, max_model_len=${max_model_len}, max_num_batched_tokens=${max_num_batched_tokens}"
        dltrace_data_name="dltrace_data_${hf_model_name}_${tp_size}_${moe_ep_size}_${pp_size}_${input_size}_${output_size}_${batch_size}_${max_model_len}_${max_num_batched_tokens}"
        # Clear any leftover trace data from a previous (aborted) run.
        rm -rf dltrace_data
        rm -rf cnperf_data_*
        # shellcheck disable=SC2086 -- the *_option variables intentionally
        # hold multiple whitespace-separated arguments.
        CNPERF_VLOG_LEVEL=0-40 cnperf-cli record ${record_option} python3 ${BENCHMARK_CMD} \
            --trust-remote-code \
            --max-num-batched-tokens "${max_num_batched_tokens}" \
            --max-model-len "${max_model_len}" \
            --block-size 16 \
            --model "${HF_MODEL}" \
            --tokenizer "${HF_MODEL}" \
            --dtype bfloat16 \
            --input-len "${input_size}" \
            --output-len "${output_size}" \
            ${pp_option} ${tp_option} ${ep_option} \
            --max-seq-len-to-capture "${max_seq_len_to_capture}" \
            --max-num-seqs "${max_num_seqs}" \
            ${batch_size_option} \
            ${eager_option} ${ray_option} ${quantization_option} \
            2>&1 | tee "${LOG_FILE}"
        # Fixed: archive the trace data BEFORE the error check. Previously a
        # `break` on error skipped the `mv`, and the next iteration's `rm`
        # deleted the failing run's profile data.
        mv dltrace_data "${dltrace_data_name}"
        mv cnperf_data_* "${dltrace_data_name}"/
        # Stop growing the batch size once the run hits OOM / capacity errors.
        if grep -E -q "torch\.OutOfMemoryError|Ceil of batch|is larger than mlu blocks" "$LOG_FILE"; then
            echo "Found one or more specified errors in the log file."
            break
        else
            echo "No specified errors found."
        fi
    done
    done
    done
    done
    done
    done
done
|
||||
@@ -0,0 +1,34 @@
|
||||
#!/bin/bash
# Client-side serving benchmark: drives an already-running vLLM server on
# ${PORT} with random prompts at several concurrency levels.
# fixed: shebang was `#/bin/bash` (missing `!`), so the kernel never
# interpreted it and the script ran under whatever shell invoked it.

# export EXPERT_PARALLEL_EN=True
# export VLLM_LATENCY_DEBUG=True

rm -rf output/client
mkdir -p output/client

PORT=32345
MODEL_PATH="/data/vllm/sq_per_token_per_channel/deepseek_v2_temp"
input_sizes=(1024)
output_sizes=(1)
# batch_sizes=(1 2 4 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40)
batch_sizes=(32)
for input_size in "${input_sizes[@]}"; do
    for output_size in "${output_sizes[@]}"; do
        for batch_size in "${batch_sizes[@]}"; do
            # fixed: was `basename "${HF_MODEL}"`, but HF_MODEL is never set
            # in this script, which produced log names with an empty model
            # part. Use the model actually benchmarked.
            hf_model_name=$(basename "${MODEL_PATH}")
            LOG_FILE=output/client/${hf_model_name}_${input_size}_${output_size}_bs_${batch_size}.log
            python benchmarks/benchmark_serving.py \
                --backend vllm \
                --model "${MODEL_PATH}" \
                --trust-remote-code \
                --dataset-name random \
                --num-prompts 1000 \
                --port "${PORT}" \
                --request-rate inf \
                --random-input-len "${input_size}" \
                --random-output-len "${output_size}" \
                --max-concurrency "${batch_size}" \
                2>&1 | tee "${LOG_FILE}"
        done
    done
done
|
||||
@@ -0,0 +1,2 @@
|
||||
# Package entry point for the Expert Parallel demo.
print("Apply Expert Parallel Demo!")
# Importing model_executor is done purely for its side effects: it pulls in
# the hijack modules that patch the vllm/vllm_mlu classes.
from . import model_executor
|
||||
@@ -0,0 +1,5 @@
|
||||
# These imports are for side effects only: each module registers a hijack
# (via MluHijackObject) that replaces the corresponding vllm/vllm_mlu
# implementation with its Expert-Parallel-aware version on import.
from .layers import sparse_moe_mlp
from .models import custom
from .models import mixtral
from .models import qwen2_moe
from .models import deepseek_v2
|
||||
@@ -0,0 +1 @@
|
||||
|
||||
@@ -0,0 +1,142 @@
|
||||
"""
|
||||
Inference-only MOE model.
|
||||
|
||||
Tensor Parallel evenly splits each expert's weight and distributes them to different ranks,
|
||||
which means each rank holds partial weight of all experts.
|
||||
While Expert Parallel evenly distributes some of the experts' full weight to different ranks,
|
||||
which means each rank holds part of the experts' full weight.
|
||||
|
||||
As a result, each rank in the Tensor Parallel group receives all tokens' hidden states for all experts,
|
||||
then computes using the partial weights, while for Expert Parallel, each rank only receives
|
||||
part of tokens' hidden states for experts on this rank, then computes using the full weights.
|
||||
|
||||
When both Tensor Parallel and Expert Parallel are enabled, each rank handles
|
||||
a portion of the expert weights matrices (as in EP mode) and these weights are further sliced
|
||||
across ranks (as in TP mode). This hybrid approach aims to balance the workload more evenly across ranks,
|
||||
enhancing efficiency and reducing the likelihood of bottlenecks associated with EP mode alone.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from vllm.distributed import (get_tensor_model_parallel_rank,
|
||||
get_tensor_model_parallel_world_size,
|
||||
get_tensor_model_parallel_group)
|
||||
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
|
||||
get_moe_tensor_parallel_rank, get_moe_tensor_parallel_world_size, get_moe_tensor_parallel_group,
|
||||
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size, get_moe_expert_parallel_group)
|
||||
from vllm.model_executor.layers.linear import ReplicatedLinear
|
||||
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
||||
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
|
||||
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
from vllm_mlu._mlu_utils import get_device_major_capability
|
||||
|
||||
|
||||
def vllm__mlu_hijack__model_executor__layers__feed_forward__SparseMoeMlp____init__(
        self,
        num_experts: int,
        top_k: int,
        hidden_size: int,
        intermediate_size: int,
        up_proj_name: str,
        is_gated: bool,
        down_proj_name: str,
        has_bias: bool,
        skip_bias_add: bool = False,
        renormalize: bool = False,
        hidden_act: str = "silu",
        params_dtype: Optional[torch.dtype] = None,
        quant_config: Optional[QuantizationConfig] = None,
        is_use_fused_moe: bool = False,
        expert_group: int = 1,
        topk_group: int = 1,
):
    """Hijacked SparseMoeMlp.__init__ adding MoE tensor/expert parallelism.

    On top of the stock initialization, this version partitions the experts
    across the MoE expert-parallel (EP) group and slices each local expert's
    weights across the MoE tensor-parallel (TP) group.

    Fix vs. original: the expert count on the last EP rank was computed as
    ``num_total_experts % moe_ep_size``, which disagrees with the ceil-based
    ``start_expert_id`` partitioning (e.g. 10 experts over 4 ranks gave the
    last rank, starting at expert 9, a count of 2). The last rank now takes
    exactly the experts remaining after the earlier ranks. Behavior is
    unchanged when num_experts is divisible by moe_ep_size.
    """
    super(SparseMoeMlp, self).__init__()
    self.tp_rank = get_tensor_model_parallel_rank()
    self.tp_size = get_tensor_model_parallel_world_size()
    self.tp_group = get_tensor_model_parallel_group()
    self.num_total_experts = num_experts
    self.top_k = top_k
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.up_proj_name = up_proj_name
    self.is_gated = is_gated
    self.down_proj_name = down_proj_name
    self.has_bias = has_bias
    self.renormalize = renormalize
    self.hidden_act = hidden_act
    self.quant_config = quant_config
    self.is_use_fused_moe = is_use_fused_moe
    self.expert_group = expert_group
    self.topk_group = topk_group
    # Fused MoE kernel is not available on major capability 3 devices.
    if get_device_major_capability() == 3:
        self.is_use_fused_moe = False

    if params_dtype is None:
        params_dtype = torch.get_default_dtype()
    self.params_dtype = params_dtype

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: add moe relative distribution
    '''
    self.moe_tp_size = get_moe_tensor_parallel_world_size()
    self.moe_tp_rank = get_moe_tensor_parallel_rank()
    self.moe_tp_group = get_moe_tensor_parallel_group()
    self.moe_ep_size = get_moe_expert_parallel_world_size()
    self.moe_ep_rank = get_moe_expert_parallel_rank()
    self.moe_ep_group = get_moe_expert_parallel_group()

    # NOTE: The bias for fc2 is only applied on tp_rank 0. If we added it on all nodes the allreduce() would
    # contain multiple copies of the bias. The bias on other node will be ignored, and may be set to nullptr
    # (the `skip_bias_add` parameter is intentionally overridden here by the MoE TP rank).
    self.skip_bias_add = True if self.moe_tp_rank > 0 else False

    assert self.num_total_experts >= self.moe_ep_size, (
        f"need num_total_experts:{self.num_total_experts} >= moe_ep_size:{self.moe_ep_size}")

    assert self.intermediate_size % self.moe_tp_size == 0, (
        f"need intermediate_size:{self.intermediate_size} % moe_tp_size:{self.moe_tp_size} == 0")

    # Ceil-based partition: every rank except possibly the last holds
    # ceil(num_total / ep_size) experts; the last rank takes the remainder.
    experts_per_rank_ceil = (self.num_total_experts + self.moe_ep_size - 1) // self.moe_ep_size
    self.start_expert_id = self.moe_ep_rank * experts_per_rank_ceil
    if self.moe_ep_rank + 1 == self.moe_ep_size:
        # Fixed: was `num_total_experts % moe_ep_size`, which is wrong for
        # ceil-based starts when the division is uneven.
        self.num_experts_per_rank = self.num_total_experts - self.start_expert_id
    else:
        self.num_experts_per_rank = experts_per_rank_ceil
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    self.end_expert_id = self.start_expert_id + self.num_experts_per_rank

    # Gate always runs at half / full precision for now.
    self.gate = ReplicatedLinear(self.hidden_size,
                                 self.num_total_experts,
                                 bias=False,
                                 params_dtype=self.params_dtype,
                                 quant_config=None)
    # One full-width FeedForward per *local* expert; TP slicing happens
    # inside FeedForward via the MoE TP group, so cross-rank reduction is
    # deferred (reduce_results=False).
    self.experts = nn.ModuleList([
        FeedForward(hidden_size=self.hidden_size,
                    intermediate_size=self.intermediate_size,
                    hidden_act=self.hidden_act,
                    up_proj_name=self.up_proj_name,
                    is_gated=self.is_gated,
                    down_proj_name=self.down_proj_name,
                    bias=self.has_bias,
                    quant_config=self.quant_config,
                    skip_bias_add=self.skip_bias_add,
                    reduce_results=False,
                    tp_group=self.moe_tp_group) for _ in range(self.num_experts_per_rank)
    ])

    self.init_pack_param()
|
||||
|
||||
|
||||
# Install the hijack: replace SparseMoeMlp.__init__ with the EP-aware
# version defined above, keeping a handle to the original for restoration.
MluHijackObject.apply_hijack(SparseMoeMlp,
                             SparseMoeMlp.__init__,
                             vllm__mlu_hijack__model_executor__layers__feed_forward__SparseMoeMlp____init__)
|
||||
@@ -0,0 +1,183 @@
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from typing import Optional
|
||||
from vllm.config import CacheConfig
|
||||
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
||||
from vllm_mlu._mlu_utils import *
|
||||
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
from vllm.distributed import tensor_model_parallel_all_reduce
|
||||
from vllm_mlu.transformers_utils.configs import CustomConfig
|
||||
from vllm_mlu.model_executor.custom_model.custom import CustomDecoderLayer, CustomAttention, _NORM_DICT
|
||||
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
|
||||
from vllm.model_executor.layers.linear import ReplicatedLinear
|
||||
from vllm_mlu.model_executor.models.layer_utils import (
|
||||
decoder_layer_forward_base, is_per_tensor_smoothquant,
|
||||
is_per_token_smoothquant, quant_fusion_with_rmsnorm,
|
||||
quant_fusion_with_layernorm)
|
||||
|
||||
|
||||
class CustomMoeBlock(SparseMoeMlp):
    # Sparse-MoE block for the "custom" model family, built on the hijacked
    # (MoE TP/EP aware) SparseMoeMlp. Optionally adds a shared expert whose
    # output is scaled by a sigmoid gate before being added to the routed
    # experts' output.

    def __init__(
        self,
        config: CustomConfig,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        # Delegate expert construction/partitioning to SparseMoeMlp; the
        # fused MoE kernel is requested here but SparseMoeMlp may disable it
        # depending on device capability.
        super().__init__(num_experts=config.num_experts,
                         top_k=config.num_experts_per_tok,
                         hidden_size=config.hidden_size,
                         intermediate_size=config.moe_intermediate_size,
                         up_proj_name="gate_up_proj",
                         is_gated=config.is_gated,
                         down_proj_name="down_proj",
                         has_bias=config.mlp_bias,
                         skip_bias_add=False,
                         renormalize=config.norm_topk_prob,
                         hidden_act=config.hidden_act,
                         params_dtype=None,
                         quant_config=quant_config,
                         is_use_fused_moe=True)

        self.config = config
        self.rank = self.tp_rank
        self.shared_expert = None
        self.shared_expert_gate = None
        # Shared expert runs on every rank; reduce_results=False defers the
        # cross-rank reduction to forward().
        if config.shared_expert_intermediate_size > 0:
            self.shared_expert = FeedForward(hidden_size=config.hidden_size,
                                             intermediate_size=config.shared_expert_intermediate_size,
                                             hidden_act=config.hidden_act,
                                             up_proj_name='gate_up_proj',
                                             is_gated=config.is_gated,
                                             down_proj_name='down_proj',
                                             bias=config.mlp_bias,
                                             quant_config=quant_config,
                                             reduce_results=False)
            self.shared_expert_gate = ReplicatedLinear(config.hidden_size,
                                                       1,
                                                       bias=False,
                                                       params_dtype=self.params_dtype,
                                                       quant_config=None)


    def forward(self, hidden_states: torch.Tensor, residual: Optional[torch.Tensor] = None) -> torch.Tensor:
        # hidden_states: (num_tokens, hidden_dim); returned with same shape.
        num_tokens, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)
        shared_output = None
        if self.shared_expert is not None:
            shared_output = self.shared_expert(hidden_states)
            if self.shared_expert_gate is not None:
                # ReplicatedLinear returns (output, bias); gate the shared
                # expert's contribution per token via sigmoid.
                gate_output = self.shared_expert_gate(hidden_states)
                shared_output = F.sigmoid(gate_output[0]) * shared_output

        # router_logits: (num_tokens, n_experts)
        router_logits, _ = self.gate(hidden_states)
        # NOTE(review): residual_ is computed (residual only on rank 0 to
        # avoid duplicating it in the later all-reduce) but forward_experts
        # below is passed `residual`, not `residual_` — confirm which is
        # intended.
        residual_ = None if self.rank > 0 else residual
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: modify bt_ops.fused_moe to forward_experts
        '''
        final_hidden_states = self.forward_experts(hidden_states, router_logits, residual)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        if shared_output is not None:
            final_hidden_states = final_hidden_states + shared_output
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: add comment to explain use_parallel_residual usage
        '''
        # use_parallel_residual = True: x = x + attn(ln1(x)) + mlp(ln2(x))
        # use_parallel_residual = False:
        #     if apply_residual_connection_post_layernorm:
        #         x_attn = ln1(x) + attn(ln1(x))
        #         x_mlp = ln2(x_attn) + mlp(ln2(x_attn))
        #     else:
        #         x_attn = x + attn(ln1(x))
        #         x_mlp = x_attn + mlp(ln2(x_attn))
        # When use_parallel_residual = True, x is shared between attn and mlp, so we only need to
        # reduce after x + attn(ln1(x)) + mlp(ln2(x)) and don't need reduce here
        # But when use_parallel_residual = False, mlp layer uses attn layer's output, so need reduce
        # when mlp is finished.
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        reduce_results = (self.config.use_parallel_residual == False)
        if reduce_results and self.tp_size > 1:
            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)

        return final_hidden_states.view(num_tokens, hidden_dim)
|
||||
|
||||
|
||||
def vllm__mlu_hijack__model_executor__custom_model__custom__CustomDecoderLayer____init__(
    self,
    config: CustomConfig,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
) -> None:
    # Hijacked CustomDecoderLayer.__init__: identical to upstream except the
    # MoE branch instantiates this file's CustomMoeBlock (EP-aware).
    # NOTE(review): indentation of the smoothquant perf section below was
    # reconstructed from a whitespace-mangled source — verify nesting against
    # the upstream vllm_mlu implementation.
    super(CustomDecoderLayer, self).__init__()
    self.config = config
    self.self_attn = CustomAttention(
        config=config,
        cache_config=cache_config,
        quant_config=quant_config,
    )

    mlp_bias = getattr(config, "mlp_bias", False) or getattr(config, "bias", False)
    is_gated = getattr(config, "is_gated", False)

    if config.num_experts is not None:
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: nothing changed, only use the CustomMoeBlock class in this file
        '''
        self.mlp = CustomMoeBlock(config=config,
                                  quant_config=quant_config)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
    else:
        # Dense MLP path. With parallel residual the bias is deferred
        # (skip_bias_add) and results are not reduced here; see
        # CustomMoeBlock.forward for the reduction rationale.
        self.mlp = FeedForward(hidden_size=config.hidden_size,
                               intermediate_size=config.intermediate_size,
                               hidden_act=self.config.hidden_act,
                               up_proj_name='up_proj',
                               is_gated=is_gated,
                               down_proj_name='down_proj',
                               bias=mlp_bias,
                               quant_config=quant_config,
                               skip_bias_add=(self.config.use_parallel_residual and mlp_bias),
                               reduce_results = (self.config.use_parallel_residual == False))

    self.input_layernorm = _NORM_DICT[self.config.norm_type](config.hidden_size, eps=config.norm_eps)
    self.post_attention_layernorm = _NORM_DICT[self.config.norm_type](config.hidden_size, eps=config.norm_eps)

    # perf per-tensor sq cases by fusing quantization in layernorm
    # NOTE(review): "tesnor" is a pre-existing typo in the attribute name;
    # left unchanged because other code may reference it.
    self.is_per_tesnor_sq_perf_cases = (is_per_tensor_smoothquant(quant_config) and
                                        not self.config.apply_residual_connection_post_layernorm)
    self.is_per_token_sq_perf_cases = (is_per_token_smoothquant(quant_config) and
                                       not self.config.apply_residual_connection_post_layernorm)
    if self.is_per_tesnor_sq_perf_cases or self.is_per_token_sq_perf_cases:
        # Layernorm will perform the input quantization, so the projections
        # skip their own quantization step.
        self.self_attn.qkv_proj.quant_method.skip_quant_input = True
        self.quant_fusion_attn_layernorm = None
        self.is_moe = config.num_experts is not None
        self.use_rmsnorm = self.config.norm_type == "rmsnorm"
        if not self.is_moe:
            self.mlp.up_proj.quant_method.skip_quant_input = True
            self.quant_fusion_mlp_layernorm = None
|
||||
|
||||
|
||||
# Install the hijack: replace CustomDecoderLayer.__init__ so the MoE branch
# uses this file's EP-aware CustomMoeBlock.
MluHijackObject.apply_hijack(CustomDecoderLayer,
                             CustomDecoderLayer.__init__,
                             vllm__mlu_hijack__model_executor__custom_model__custom__CustomDecoderLayer____init__)
|
||||
@@ -0,0 +1,222 @@
|
||||
|
||||
import re
|
||||
import torch
|
||||
from torch import nn
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
||||
from transformers import PretrainedConfig
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
MergedColumnParallelLinear,
|
||||
ReplicatedLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.quantization.base_config import (
|
||||
QuantizationConfig)
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
|
||||
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
|
||||
from vllm.model_executor.models.utils import is_pp_missing_parameter
|
||||
|
||||
from vllm.model_executor.models.deepseek_v2 import DeepseekV2ForCausalLM
|
||||
from vllm_mlu.model_executor.models.deepseek_v2 import DeepseekV2MoE
|
||||
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
|
||||
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
|
||||
|
||||
|
||||
def vllm_mlu__model_executor__models__deepseek_v2__DeepseekV2MoE____init__(
        self,
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
):
    """Hijacked DeepseekV2MoE.__init__ routed through the EP-aware
    SparseMoeMlp base class.

    Routed experts are partitioned/sliced by SparseMoeMlp (MoE TP/EP);
    shared experts are replicated via a FeedForward with deferred reduction.

    Fix vs. original: ``self.routed_scaling_factor`` was assigned twice with
    the same value; the redundant assignment is removed.
    """
    super(DeepseekV2MoE, self).__init__(num_experts=config.n_routed_experts,
                                        top_k=config.num_experts_per_tok,
                                        hidden_size=config.hidden_size,
                                        intermediate_size=config.moe_intermediate_size,
                                        up_proj_name="gate_up_proj",
                                        is_gated=True,
                                        down_proj_name="down_proj",
                                        has_bias=False,
                                        skip_bias_add=False,
                                        renormalize=config.norm_topk_prob,
                                        hidden_act=config.hidden_act,
                                        params_dtype=None,
                                        quant_config=quant_config,
                                        is_use_fused_moe=True,
                                        expert_group=config.n_group,
                                        topk_group=config.topk_group)
    self.config = config
    self.routed_scaling_factor = config.routed_scaling_factor
    self.n_shared_experts = config.n_shared_experts
    if self.moe_tp_size > config.n_routed_experts:
        raise ValueError(
            f"Moe Tensor parallel size {self.moe_tp_size} is greater than "
            f"the number of experts {config.n_routed_experts}.")

    if config.hidden_act != "silu":
        raise ValueError(f"Unsupported activation: {config.hidden_act}. "
                         "Only silu is supported for now.")

    # Router gate: replicated (not sharded), unquantized.
    self.gate = ReplicatedLinear(config.hidden_size,
                                 config.n_routed_experts,
                                 bias=False,
                                 quant_config=None,
                                 prefix=f"{prefix}.gate")
    if config.n_shared_experts is not None:
        intermediate_size = (config.moe_intermediate_size *
                             config.n_shared_experts)
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace MLP with FeedForward.
        '''
        self.shared_experts = FeedForward(hidden_size=config.hidden_size,
                                          intermediate_size=intermediate_size,
                                          hidden_act=config.hidden_act,
                                          up_proj_name='gate_up_proj',
                                          is_gated=True,
                                          down_proj_name='down_proj',
                                          bias=False,
                                          quant_config=quant_config,
                                          reduce_results=False)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
|
||||
|
||||
|
||||
def vllm__module_executor__models__deepseek_v2__DeepseekV2ForCausalLM__load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: pack params and cal start expert id
    '''
    # Hijack replacement for DeepseekV2ForCausalLM.load_weights that is aware
    # of MoE expert parallelism (EP): checkpoint expert indices are global,
    # while this worker's params_dict only holds a locally renumbered slice.
    #
    # First, let every SparseMoeMlp fuse/pack its per-expert weights so that
    # the loader below sees the packed parameter layout.
    for name, m in self.model.named_modules():
        if isinstance(m, SparseMoeMlp):
            m.pack_params()

    # expert parallel modification start
    # Experts are split contiguously across EP ranks; this rank owns experts
    # [start_expert_id, start_expert_id + ceil(n/ep_size)). ceil division keeps
    # the mapping valid when n_routed_experts is not divisible by moe_ep_size.
    moe_ep_rank = get_moe_expert_parallel_rank()
    moe_ep_size = get_moe_expert_parallel_world_size()
    num_total_experts = self.config.n_routed_experts
    start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size)
    # expert parallel modification end
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        # gate_proj/up_proj checkpoint shards are fused into gate_up_proj.
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: delete expert_params_mapping for no useless
    '''
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        # rotary_emb.inv_freq is recomputed at runtime; never loaded.
        if "rotary_emb.inv_freq" in name:
            continue
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace expert_id in weight to named_expert_id in params_dict
        '''
        # Translate the checkpoint's global expert id into this rank's local
        # id (global - start_expert_id). Rank 0 (start_expert_id == 0) needs
        # no translation. Experts shifted to a negative/foreign local id will
        # simply miss params_dict and be skipped below.
        if start_expert_id > 0 and "mlp.experts." in name:
            expert_str = re.search(r'experts\.\d+', name).group(0)
            expert_id=int(expert_str.split(".")[1])
            named_expert_id = expert_id - start_expert_id
            old_expert_name = f"experts.{expert_id}"
            new_expert_name = f"experts.{named_expert_id}"
            name = name.replace(old_expert_name, new_expert_name)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            # Skip non-stacked layers and experts (experts handled below).
            if weight_name not in name:
                continue
            # We have mlp.experts[0].gate_proj in the checkpoint.
            # Since we handle the experts below in expert_params_mapping,
            # we need to skip here BEFORE we update the name, otherwise
            # name will be updated to mlp.experts[0].gate_up_proj, which
            # will then be updated below in expert_params_mapping
            # for mlp.experts[0].gate_gate_up_proj, which breaks load.
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition and delete useless if name not in params_dict: continue condition
            '''
            name = name.replace(weight_name, param_name)
            # Skip expert/shared-expert weights not owned by this EP rank
            # (after local renumbering they are absent from params_dict).
            if (("mlp.experts." in name or "mlp.shared_experts." in name or "mlp.shared_expert_gate." in name)
                    and name not in params_dict):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue

            # Skip layers assigned to other pipeline-parallel stages.
            if is_pp_missing_parameter(name, self):
                continue

            param = params_dict[name]
            # Stacked params always define weight_loader; shard_id selects
            # which half (gate vs up) of the fused tensor to fill.
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            # for/else: reached only when no stacked mapping matched.
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue

            # Same EP ownership filter as in the stacked branch.
            if (("mlp.experts." in name or "mlp.shared_experts." in name or "mlp.shared_expert_gate." in name)
                    and name not in params_dict):
                continue
            if is_pp_missing_parameter(name, self):
                continue

            param = params_dict[name]
            # Fall back to the plain copy loader when the param defines none.
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
# Install the MLU hijacks: monkey-patch the stock vLLM DeepseekV2 classes so
# the EP-aware implementations above are used instead of the originals.
MluHijackObject.apply_hijack(DeepseekV2MoE,
                             DeepseekV2MoE.__init__,
                             vllm_mlu__model_executor__models__deepseek_v2__DeepseekV2MoE____init__)
MluHijackObject.apply_hijack(DeepseekV2ForCausalLM,
                             DeepseekV2ForCausalLM.load_weights,
                             vllm__module_executor__models__deepseek_v2__DeepseekV2ForCausalLM__load_weights)
@@ -0,0 +1,143 @@
|
||||
import torch
|
||||
import re
|
||||
import vllm
|
||||
from torch import nn
|
||||
from typing import List, Optional, Tuple, Iterable
|
||||
from vllm_mlu._mlu_utils import *
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
from vllm.model_executor.models.mixtral import MixtralForCausalLM
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader, maybe_remap_kv_scale_name
|
||||
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
|
||||
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
|
||||
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
|
||||
from vllm.model_executor.models.utils import is_pp_missing_parameter
|
||||
|
||||
|
||||
def vllm__module_executor__models__mixtral__MixtralForCausalLM__load_weights(
        self,
        weights: Iterable[Tuple[str, torch.Tensor]]):
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: pack params and cal start expert id
    '''
    # EP-aware hijack of MixtralForCausalLM.load_weights. Mirrors the
    # DeepseekV2 variant but uses Mixtral's naming: experts live under
    # "block_sparse_moe.experts." and the fused MLP weights are w13/w2.
    #
    # Pack per-expert weights first so the loader sees the fused layout.
    for name, m in self.model.named_modules():
        if isinstance(m, SparseMoeMlp):
            m.pack_params()
    # expert parallel modification start
    # Contiguous expert split across EP ranks; ceil division handles
    # num_local_experts not divisible by moe_ep_size.
    moe_ep_rank = get_moe_expert_parallel_rank()
    moe_ep_size = get_moe_expert_parallel_world_size()
    num_total_experts = self.config.num_local_experts
    start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size)
    # expert parallel modification end
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        # w1/w3 checkpoint shards are fused into w13.
        ("w13", "w1", 0),
        ("w13", "w3", 1),
    ]

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: delete expert_params_mapping for no useless
    '''
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        # rotary_emb.inv_freq is recomputed at runtime; never loaded.
        if "rotary_emb.inv_freq" in name:
            continue
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace expert_id in weight to named_expert_id in params_dict
        '''
        # Map the checkpoint's global expert index to this rank's local
        # index; experts outside this rank's slice will not be found in
        # params_dict and are skipped by the ownership filters below.
        if start_expert_id > 0 and "block_sparse_moe.experts." in name:
            expert_str = re.search(r'experts\.\d+', name).group(0)
            expert_id=int(expert_str.split(".")[1])
            named_expert_id = expert_id - start_expert_id
            old_expert_name = f"experts.{expert_id}"
            new_expert_name = f"experts.{named_expert_id}"
            name = name.replace(old_expert_name, new_expert_name)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                    and name not in params_dict):
                continue
            # Skip layers assigned to other pipeline-parallel stages.
            if is_pp_missing_parameter(name, self):
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("block_sparse_moe.experts." in name) and (name not in params_dict)):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            # Stacked params always define weight_loader; shard_id picks the
            # q/k/v or w1/w3 slice inside the fused tensor.
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            # for/else: no stacked mapping matched — plain parameter path.
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                    and name not in params_dict):
                continue
            if is_pp_missing_parameter(name, self):
                continue
            # Remapping the name of FP8 kv-scale.
            name = maybe_remap_kv_scale_name(name, params_dict)
            if name is None:
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("block_sparse_moe.experts." in name) and (name not in params_dict)):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            # Fall back to the plain copy loader when none is defined.
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
# Install the hijack: monkey-patch MixtralForCausalLM.load_weights with the
# EP-aware implementation above.
MluHijackObject.apply_hijack(MixtralForCausalLM,
                             MixtralForCausalLM.load_weights,
                             vllm__module_executor__models__mixtral__MixtralForCausalLM__load_weights)
||||
@@ -0,0 +1,179 @@
|
||||
import torch
|
||||
import re
|
||||
from typing import Optional, Iterable, Tuple
|
||||
from vllm_mlu._mlu_utils import *
|
||||
from vllm_mlu.mlu_hijack_utils import MluHijackObject
|
||||
from vllm.model_executor.models.qwen2_moe import Qwen2MoeForCausalLM
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
|
||||
get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size)
|
||||
from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp
|
||||
from vllm.utils import print_warning_once
|
||||
from vllm.model_executor.models.utils import is_pp_missing_parameter
|
||||
|
||||
|
||||
def vllm__module_executor__models__qwen2moe__Qwen2MoeForCausalLM__load_weights(
        self,
        weights: Iterable[Tuple[str, torch.Tensor]]):
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: pack params and cal start expert id
    '''
    # EP-aware hijack of Qwen2MoeForCausalLM.load_weights. Mirrors the
    # DeepseekV2/Mixtral variants; Qwen2-MoE keeps experts under
    # "mlp.experts." and has a single shared expert + shared_expert_gate.
    #
    # Pack per-expert weights first so the loader sees the fused layout.
    for name, m in self.model.named_modules():
        if isinstance(m, SparseMoeMlp):
            m.pack_params()

    # expert parallel modification start
    # Contiguous expert split across EP ranks; ceil division handles
    # num_experts not divisible by moe_ep_size.
    moe_ep_rank = get_moe_expert_parallel_rank()
    moe_ep_size = get_moe_expert_parallel_world_size()
    num_total_experts = self.config.num_experts
    start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size)
    # expert parallel modification end
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        # gate_proj/up_proj checkpoint shards are fused into gate_up_proj.
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: delete expert_params_mapping for no useless
    '''
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        # rotary_emb.inv_freq is recomputed at runtime; never loaded.
        if "rotary_emb.inv_freq" in name:
            continue
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: replace expert_id in weight to named_expert_id in params_dict
        '''
        # Map the checkpoint's global expert index to this rank's local
        # index; non-owned experts miss params_dict and are filtered below.
        if start_expert_id > 0 and "mlp.experts." in name:
            expert_str = re.search(r'experts\.\d+', name).group(0)
            expert_id=int(expert_str.split(".")[1])
            named_expert_id = expert_id - start_expert_id
            old_expert_name = f"experts.{expert_id}"
            new_expert_name = f"experts.{named_expert_id}"
            name = name.replace(old_expert_name, new_expert_name)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: delete if "mlp.experts" in name: continue condition
            '''
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                    and name not in params_dict):
                continue
            # Skip layers on other devices.
            if is_pp_missing_parameter(name, self):
                continue
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition and delete useless if name not in params_dict: continue condition
            '''
            # Skip experts that are not assigned to this worker.
            # NOTE: "mlp.shared_expert." also matches "mlp.shared_expert_gate."
            # names, so the extra clause is redundant but harmless.
            if (("mlp.experts." in name or "mlp.shared_expert." in name or "mlp.shared_expert_gate." in name)
                    and name not in params_dict):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            # Stacked params always define weight_loader; shard_id picks the
            # q/k/v or gate/up slice inside the fused tensor.
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            # for/else: no stacked mapping matched — plain parameter path.
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: delete for mapping in expert_params_mapping condition
            '''
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            # Skip loading extra bias for GPTQ models.
            if ((name.endswith(".bias") or name.endswith("_bias"))
                    and name not in params_dict):
                continue
            # Skip layers on other devices.
            if is_pp_missing_parameter(name, self):
                continue
            # Remapping the name of FP8 kv-scale.
            if name.endswith("kv_scale"):
                remapped_kv_scale_name = name.replace(
                    ".kv_scale", ".attn.kv_scale")
                if remapped_kv_scale_name not in params_dict:
                    print_warning_once(
                        "Found kv scale in the checkpoint "
                        f"(e.g. {name}), but not found the expected "
                        f"name in the model "
                        f"(e.g. {remapped_kv_scale_name}). "
                        "kv-scale is not loaded.")
                    continue
                else:
                    name = remapped_kv_scale_name
            '''
            =============================
            Modify by vllm_mlu
            =============================
            @brief: add expert skiped condition
            '''
            # Skip experts that are not assigned to this worker.
            if (("mlp.experts." in name or "mlp.shared_expert." in name or "mlp.shared_expert_gate." in name)
                    and name not in params_dict):
                continue
            '''
            ==================
            End of MLU Hijack
            ==================
            '''
            param = params_dict[name]
            # Fall back to the plain copy loader when none is defined.
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
# Install the hijack: monkey-patch Qwen2MoeForCausalLM.load_weights with the
# EP-aware implementation above.
MluHijackObject.apply_hijack(Qwen2MoeForCausalLM,
                             Qwen2MoeForCausalLM.load_weights,
                             vllm__module_executor__models__qwen2moe__Qwen2MoeForCausalLM__load_weights)
||||
@@ -0,0 +1,61 @@
|
||||
# Offline-inference demo for the MLU expert-parallel (EP) feature.
# EXPERT_PARALLEL_EN must be set before vllm is imported so the hijacked
# code paths see it during engine construction.
import os
os.environ['EXPERT_PARALLEL_EN'] = "True"

from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]


model_dir="/data/AE/llm/models/Qwen1.5-MoE-A2.7B"
# tensor parallel degree; moe_ep_size must divide it (see README rules).
tp_size = 2
moe_ep_size=2
# When True, install smooth-quant hooks to collect per-expert activation
# ranges before generation.
is_check_act_range = True
input_seq_len=64
output_seq_len=1
batch=1
# max_position_embedding=1024
max_model_len=input_seq_len + output_seq_len
# if max_model_len < max_position_embedding:
#     max_model_len = max_position_embedding
max_num_batched_tokens=input_seq_len * batch
# max_num_batched_tokens must be able to hold at least one full sequence.
if max_model_len > max_num_batched_tokens:
    max_num_batched_tokens=max_model_len
max_num_seqs = batch

if __name__ == '__main__':
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8)

    # Create an LLM. moe_ep_size is the hijack-added EP knob; the remaining
    # moe_tp factor is derived as tp_size / moe_ep_size (see README).
    llm = LLM(model=model_dir,
              trust_remote_code=True,
              enforce_eager=True,
              dtype='bfloat16',
              max_model_len=max_model_len,
              max_num_batched_tokens=max_num_batched_tokens,
              max_num_seqs=max_num_seqs,
              tensor_parallel_size=tp_size,
              moe_ep_size=moe_ep_size,
              )

    if is_check_act_range:
        llm.llm_engine.model_executor._run_workers("setup_smooth_hook", is_save_moe_info=True)

        # NOTE(review): hooks are removed and act_range queried before any
        # generate() call, so the collected range covers no prompts here —
        # confirm whether the removal was meant to happen after generation.
        llm.llm_engine.model_executor._run_workers("remove_hooks")
        act_range = llm.llm_engine.model_executor._run_workers("get_act_range")
        print(f"len(act_range)={len(act_range)}")

    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
||||
@@ -0,0 +1,48 @@
|
||||
#!/bin/bash
# Launch a vLLM OpenAI-compatible API server for the smooth-quant DeepseekV2
# model, with either pipeline parallelism (use_pp=1) or tensor parallelism.
# Output is mirrored to output/server/server.log.
#
# Fixes vs. original: the shebang was "#/bin/bash" (missing '!', so the
# script ran under whatever shell invoked it); expansions are now quoted and
# strict mode is enabled.
set -euo pipefail

rm -rf -- output/server
mkdir -p output/server

PORT=32345
use_ray=0
use_pp=1
use_eager=0

eager_option=""
if [ "$use_eager" -gt 0 ]; then
  eager_option="--enforce-eager"
fi

ray_option=""
if [ "$use_ray" -gt 0 ]; then
  ray_option="--worker-use-ray"
  # Clear any stale ray cluster; ignore failure if none is running.
  ray stop --force || true
fi

export VLLM_ENGINE_ITERATION_TIMEOUT_S=180
MODEL_PATH="/data/vllm/sq_per_token_per_channel/deepseek_v2_temp"

if [ "$use_pp" -gt 0 ]; then
  parallel_option="--pipeline-parallel-size=8"
else
  parallel_option="--tensor-parallel-size=8"
fi

# TP8
# NOTE: ${parallel_option}/${eager_option}/${ray_option} are intentionally
# unquoted so an empty option expands to nothing instead of an empty arg.
python -m vllm.entrypoints.openai.api_server \
  --disable-log-requests \
  --port "${PORT}" \
  --model "${MODEL_PATH}" \
  --trust-remote-code \
  --swap-space 16 \
  ${parallel_option} \
  --max-num-batched-tokens=40960 \
  --max-model-len=1034 \
  --block-size=16 \
  --dtype=bfloat16 \
  --max-seq-len-to-capture=1034 \
  --max-num-seqs=40 \
  --quantization=smoothquant \
  ${eager_option} \
  ${ray_option} \
  2>&1 | tee output/server/server.log
|
||||
Reference in New Issue
Block a user