support deepseek quant & mix-parallel with graphmode (#585)
### What this PR does / why we need it?

1. Support DeepSeek with W8A8 quantization.
2. Support DeepSeek with mixed parallelism (multi-DP, EP+TP).
3. Support DeepSeek with graph mode.

---------

Signed-off-by: wen-jie666 <wenjie39@huawei.com>
Signed-off-by: Yizhou Liu <liuyizhou5@h-partners.com>
Signed-off-by: libaokui <libaokui@huawei.com>
Signed-off-by: linfeng-yuan <1102311262@qq.com>
Co-authored-by: wen-jie666 <wenjie39@huawei.com>
This commit is contained in:
@@ -11,8 +11,6 @@
|
||||
import gc
|
||||
import os
|
||||
|
||||
VLLM_ENABLE_GRAPGH_MODE = os.environ.get("VLLM_ENABLE_GRAPH_MODE") == "1"
|
||||
|
||||
|
||||
def main():
|
||||
dp_rank = int(os.environ['RANK'])
|
||||
@@ -20,8 +18,8 @@ def main():
|
||||
dp_size = int(os.environ['WORLD_SIZE'])
|
||||
master_addr = os.environ['MASTER_ADDR']
|
||||
master_port = os.environ['MASTER_PORT']
|
||||
tp_size = 4
|
||||
etp_size = 2
|
||||
tp_size = 1
|
||||
etp_size = 1
|
||||
|
||||
os.environ["VLLM_DP_RANK"] = str(dp_rank)
|
||||
os.environ["VLLM_DP_SIZE"] = str(dp_size)
|
||||
@@ -58,15 +56,15 @@ def main():
|
||||
max_tokens=4,
|
||||
min_tokens=4)
|
||||
# Create an LLM.
|
||||
llm = LLM(
|
||||
model="deepseek-ai/DeepSeek-V2-Lite-Chat",
|
||||
tensor_parallel_size=tp_size,
|
||||
trust_remote_code=True,
|
||||
expert_tensor_parallel_size=etp_size,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=num_seqs,
|
||||
compilation_config=1 if VLLM_ENABLE_GRAPGH_MODE else 0,
|
||||
)
|
||||
llm = LLM(model="deepseek-ai/DeepSeek-V2-Lite-Chat",
|
||||
tensor_parallel_size=tp_size,
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=num_seqs,
|
||||
additional_config={
|
||||
'expert_tensor_parallel_size': etp_size,
|
||||
'enable_graph_mode': False,
|
||||
})
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
|
||||
@@ -6,15 +6,13 @@ export HCCL_SOCKET_IFNAME=${ifname}
|
||||
# dp_size = node_size * dp_per_node
|
||||
node_size=1
|
||||
node_rank=0
|
||||
dp_per_node=2
|
||||
dp_per_node=4
|
||||
master_addr=127.0.0.1
|
||||
master_port=12345
|
||||
|
||||
rm -rf ./.torchair_cache/
|
||||
rm -rf ./dynamo_*
|
||||
rm -rf /root/ascend/log/debug/plog/*
|
||||
export VLLM_ENABLE_GRAPH_MODE=0
|
||||
export VLLM_ENABLE_MC2=0
|
||||
|
||||
torchrun --nproc_per_node ${dp_per_node} --nnodes ${node_size} \
|
||||
--node_rank ${node_rank} --master_addr ${master_addr} --master_port ${master_port} \
|
||||
|
||||
Reference in New Issue
Block a user