"""Offline vLLM inference example for a MoE model.

Builds an ``LLM`` with tensor-parallel and MoE expert-parallel settings,
optionally asks the workers for their activation ranges, then generates
completions for a handful of sample prompts and prints them.
"""
import os

# Must be set before ``vllm`` is imported. Presumably consumed by the vLLM
# build in use to switch on expert parallelism -- TODO confirm where it is read.
os.environ['EXPERT_PARALLEL_EN'] = "True"

from vllm import LLM, SamplingParams  # noqa: E402  (import follows env setup)

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Model location and parallelism layout.
model_dir = "/data/AE/llm/models/Qwen1.5-MoE-A2.7B"
tp_size = 2
moe_ep_size = 2

# When true, the activation-range hook sequence runs before generation.
is_check_act_range = True

# Sequence / batch sizing used to derive the engine limits below.
input_seq_len = 64
output_seq_len = 1
batch = 1

# The model length covers the prompt plus the requested output length.
# (A disabled variant of this script raised it to max_position_embedding=1024.)
max_model_len = input_seq_len + output_seq_len

# The batched-token budget is never allowed to drop below max_model_len.
max_num_batched_tokens = max(input_seq_len * batch, max_model_len)
max_num_seqs = batch


def main():
    """Create the engine, optionally collect activation ranges, and generate."""
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8)

    # Create an LLM.
    llm = LLM(
        model=model_dir,
        trust_remote_code=True,
        enforce_eager=True,
        dtype='bfloat16',
        max_model_len=max_model_len,
        max_num_batched_tokens=max_num_batched_tokens,
        max_num_seqs=max_num_seqs,
        tensor_parallel_size=tp_size,
        moe_ep_size=moe_ep_size,
    )

    if is_check_act_range:
        executor = llm.llm_engine.model_executor
        # NOTE(review): the hooks are installed and removed before generate()
        # is called; confirm this ordering is intended.
        executor._run_workers("setup_smooth_hook", is_save_moe_info=True)
        executor._run_workers("remove_hooks")
        act_range = executor._run_workers("get_act_range")
        print(f"len(act_range)={len(act_range)}")

    # Generate texts from the prompts. The output is a list of RequestOutput
    # objects that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for result in outputs:
        prompt = result.prompt
        generated_text = result.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == '__main__':
    main()