From 47f688a2f0063b1d142eff8493722bc270915895 Mon Sep 17 00:00:00 2001 From: yangqinghao-cmss Date: Sat, 2 Aug 2025 16:51:22 +0800 Subject: [PATCH] Change retrieving remote files to local retrieval. (#2141) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? Using vLLM's AudioAsset class to retrieve remote audio files (https://vllm-public-assets.s3.us-west-2.amazonaws.com) is not feasible in some cases, so this example now also supports loading the audio from local files. ### How was this patch tested? vllm:main vllm:ascend:main results: ```bash Adding requests: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00, 4.62s/it] Processed prompts: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00, 3.01s/it, est. speed input: 79.03 toks/s, output: 6.31 toks/s] generated_text: The sport referenced is soccer, and the nursery rhyme is 'Hey Diddle Diddle'. ``` - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad57f23f6a528ab01066998b41796a44340fd43d --------- Signed-off-by: yangqinghao-cmss --- examples/offline_inference_audio_language.py | 40 ++++++++++++++------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py index 03bb1cb..99a565b 100644 --- a/examples/offline_inference_audio_language.py +++ b/examples/offline_inference_audio_language.py @@ -25,21 +25,32 @@ on HuggingFace model repository. 
""" import os +import argparse + +from vllm.assets.audio import AudioAsset +try: + import librosa +except ImportError: + raise Exception("Can't import librosa, please ensure it's installed") from vllm import LLM, SamplingParams -from vllm.assets.audio import AudioAsset os.environ["VLLM_USE_MODELSCOPE"] = "True" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] -question_per_audio_count = { - 1: "What is recited in the audio?", - 2: "What sport and what nursery rhyme are referenced?" -} +def prepare_inputs(audio_count: int, audio_path1: str, audio_path2: str): + use_vllm_audio_assert = True if audio_path1 == "mary_had_lamb" and audio_path2 == "winning_call" else False + if use_vllm_audio_assert: + audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] + else: + audio_assets = [librosa.load(audio_path1, sr=None), librosa.load(audio_path2, sr=None)] + + question_per_audio_count = { + 1: "What is recited in the audio?", + 2: "What sport and what nursery rhyme are referenced?" + } -def prepare_inputs(audio_count: int): audio_in_prompt = "".join([ f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count) @@ -52,7 +63,7 @@ def prepare_inputs(audio_count: int): mm_data = { "audio": - [asset.audio_and_sample_rate for asset in audio_assets[:audio_count]] + audio_assets if not use_vllm_audio_assert else [asset.audio_and_sample_rate for asset in audio_assets[:audio_count]] } # Merge text prompt and audio data into inputs @@ -60,7 +71,7 @@ def prepare_inputs(audio_count: int): return inputs -def main(audio_count: int): +def main(audio_count: int, audio_path1: str, audio_path2: str): # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on # lower-end GPUs. # Unless specified, these settings have been tested to work on a single L4. 
@@ -71,7 +82,7 @@ def main(audio_count: int): limit_mm_per_prompt={"audio": audio_count}, enforce_eager=True) - inputs = prepare_inputs(audio_count) + inputs = prepare_inputs(audio_count, audio_path1, audio_path2) sampling_params = SamplingParams(temperature=0.2, max_tokens=64, @@ -81,9 +92,14 @@ def main(audio_count: int): for o in outputs: generated_text = o.outputs[0].text - print(generated_text) + print("generated_text:", generated_text) if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Arguments of rank table generator", ) + parser.add_argument("--audio-path1", type=str, default="mary_had_lamb") + parser.add_argument("--audio-path2", type=str, default="winning_call") + args = parser.parse_args() + audio_count = 2 - main(audio_count) + main(audio_count, args.audio_path1, args.audio_path2)