# Source: enginex-mlu370-vllm/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/offline_inference.py
# (62 lines, 1.9 KiB, Python — web-viewer snapshot dated 2026-02-04 17:22:39 +08:00)
import os

# Enable expert parallelism before importing vllm — presumably the variable is
# read at vllm import/engine-init time (original code set it first; order kept).
os.environ['EXPERT_PARALLEL_EN'] = "True"
from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Model and parallelism configuration.
model_dir = "/data/AE/llm/models/Qwen1.5-MoE-A2.7B"
tp_size = 2                 # tensor-parallel world size
moe_ep_size = 2             # expert-parallel size for the MoE layers
is_check_act_range = True   # also collect activation ranges via worker hooks

# Sequence-length / batching configuration.
input_seq_len = 64
output_seq_len = 1
batch = 1
max_model_len = input_seq_len + output_seq_len
# The engine requires max_num_batched_tokens >= max_model_len, so clamp up.
max_num_batched_tokens = max(input_seq_len * batch, max_model_len)
max_num_seqs = batch
if __name__ == '__main__':
    # Create a sampling params object (defaults except temperature).
    sampling_params = SamplingParams(temperature=0.8)

    # Create the LLM with tensor parallelism plus MoE expert parallelism
    # (moe_ep_size is a Cambricon-specific engine argument).
    llm = LLM(model=model_dir,
              trust_remote_code=True,
              enforce_eager=True,
              dtype='bfloat16',
              max_model_len=max_model_len,
              max_num_batched_tokens=max_num_batched_tokens,
              max_num_seqs=max_num_seqs,
              tensor_parallel_size=tp_size,
              moe_ep_size=moe_ep_size,
              )

    if is_check_act_range:
        # Install smooth-quant hooks on every worker (saving MoE info), then
        # remove them and read back the recorded activation ranges.
        # NOTE(review): the hooks are removed *before* llm.generate() runs, so
        # act_range presumably reflects only whatever ran during setup (e.g.
        # profiling/warm-up) — confirm this ordering is intentional.
        llm.llm_engine.model_executor._run_workers("setup_smooth_hook", is_save_moe_info=True)
        llm.llm_engine.model_executor._run_workers("remove_hooks")
        act_range = llm.llm_engine.model_executor._run_workers("get_act_range")
        print(f"len(act_range)={len(act_range)}")

    # Generate texts from the prompts. The output is a list of RequestOutput
    # objects that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")