diff --git a/examples/offline_distributed_inference_npu.py b/examples/offline_distributed_inference_npu.py
index 8e503ad..8853378 100644
--- a/examples/offline_distributed_inference_npu.py
+++ b/examples/offline_distributed_inference_npu.py
@@ -30,9 +30,9 @@ prompts = [
 sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
 # Create an LLM.
 llm = LLM(
-    model="facebook/opt-125m",
+    model="Qwen/Qwen2.5-0.5B-Instruct",
     tensor_parallel_size=2,
-    distributed_executor_backend="ray",
+    distributed_executor_backend="mp",
     trust_remote_code=True,
 )
 
diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py
index 785492c..25fabfd 100644
--- a/examples/offline_inference_audio_language.py
+++ b/examples/offline_inference_audio_language.py
@@ -24,7 +24,6 @@
 For most models, the prompt format should follow
 corresponding examples on HuggingFace model repository.
 """
-from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams
 from vllm.assets.audio import AudioAsset
 from vllm.utils import FlexibleArgumentParser
@@ -41,28 +40,6 @@ question_per_audio_count = {
 
 # Unless specified, these settings have been tested to work on a single L4.
 
-# Ultravox 0.3
-def run_ultravox(question: str, audio_count: int):
-    model_name = "fixie-ai/ultravox-v0_3"
-
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    messages = [{
-        'role': 'user',
-        'content': "<|audio|>\n" * audio_count + question
-    }]
-    prompt = tokenizer.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
-
-    llm = LLM(model=model_name,
-              max_model_len=4096,
-              max_num_seqs=5,
-              trust_remote_code=True,
-              limit_mm_per_prompt={"audio": audio_count})
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
-
-
 # Qwen2-Audio
 def run_qwen2_audio(question: str, audio_count: int):
     model_name = "Qwen/Qwen2-Audio-7B-Instruct"
@@ -85,11 +62,7 @@ def run_qwen2_audio(question: str, audio_count: int):
     return llm, prompt, stop_token_ids
 
 
-# TODO (cmq): test ultravox
-model_example_map = {
-    # "ultravox": run_ultravox,
-    "qwen2_audio": run_qwen2_audio
-}
+model_example_map = {"qwen2_audio": run_qwen2_audio}
 
 
 def main(args):
diff --git a/examples/offline_inference_npu.py b/examples/offline_inference_npu.py
index 10c2c6e..cb39639 100644
--- a/examples/offline_inference_npu.py
+++ b/examples/offline_inference_npu.py
@@ -29,7 +29,7 @@ prompts = [
 # Create a sampling params object.
 sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
 # Create an LLM.
-llm = LLM(model="facebook/opt-125m")
+llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
 
 # Generate texts from the prompts.
 outputs = llm.generate(prompts, sampling_params)