Change retrieving remote files to local retrieval. (#2141)
### What this PR does / why we need it?
Using vllm's AudioAsset class to retrieve remote audio
files(https://vllm-public-assets.s3.us-west-2.amazonaws.com) is not
feasible in some cases; it is recommended to switch to local retrieval.
### How was this patch tested?
vllm:main
vllm:ascend:main
results:
```bash
Adding requests: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00, 4.62s/it]
Processed prompts: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00, 3.01s/it, est. speed input: 79.03 toks/s, output: 6.31 toks/s]
generated_text: The sport referenced is soccer, and the nursery rhyme is 'Hey Diddle Diddle'.
```
- vLLM version: v0.10.0
- vLLM main:
ad57f23f6a
---------
Signed-off-by: yangqinghao-cmss <yangqinghao_yewu@cmss.chinamobile.com>
This commit is contained in:
@@ -25,21 +25,32 @@ on HuggingFace model repository.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from vllm.assets.audio import AudioAsset
|
||||||
|
try:
|
||||||
|
import librosa
|
||||||
|
except ImportError:
|
||||||
|
raise Exception("Can't import librosa, please ensure it's installed")
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.assets.audio import AudioAsset
|
|
||||||
|
|
||||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||||
|
|
||||||
audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
|
|
||||||
question_per_audio_count = {
|
|
||||||
1: "What is recited in the audio?",
|
|
||||||
2: "What sport and what nursery rhyme are referenced?"
|
|
||||||
}
|
|
||||||
|
|
||||||
|
def prepare_inputs(audio_count: int, audio_path1: str, audio_path2: str):
|
||||||
|
use_vllm_audio_assert = True if audio_path1 == "mary_had_lamb" and audio_path2 == "winning_call" else False
|
||||||
|
if use_vllm_audio_assert:
|
||||||
|
audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
|
||||||
|
else:
|
||||||
|
audio_assets = [librosa.load(audio_path1, sr=None), librosa.load(audio_path2, sr=None)]
|
||||||
|
|
||||||
|
question_per_audio_count = {
|
||||||
|
1: "What is recited in the audio?",
|
||||||
|
2: "What sport and what nursery rhyme are referenced?"
|
||||||
|
}
|
||||||
|
|
||||||
def prepare_inputs(audio_count: int):
|
|
||||||
audio_in_prompt = "".join([
|
audio_in_prompt = "".join([
|
||||||
f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
|
f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
|
||||||
for idx in range(audio_count)
|
for idx in range(audio_count)
|
||||||
@@ -52,7 +63,7 @@ def prepare_inputs(audio_count: int):
|
|||||||
|
|
||||||
mm_data = {
|
mm_data = {
|
||||||
"audio":
|
"audio":
|
||||||
[asset.audio_and_sample_rate for asset in audio_assets[:audio_count]]
|
audio_assets if not use_vllm_audio_assert else [asset.audio_and_sample_rate for asset in audio_assets[:audio_count]]
|
||||||
}
|
}
|
||||||
|
|
||||||
# Merge text prompt and audio data into inputs
|
# Merge text prompt and audio data into inputs
|
||||||
@@ -60,7 +71,7 @@ def prepare_inputs(audio_count: int):
|
|||||||
return inputs
|
return inputs
|
||||||
|
|
||||||
|
|
||||||
def main(audio_count: int):
|
def main(audio_count: int, audio_path1: str, audio_path2: str):
|
||||||
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
||||||
# lower-end GPUs.
|
# lower-end GPUs.
|
||||||
# Unless specified, these settings have been tested to work on a single L4.
|
# Unless specified, these settings have been tested to work on a single L4.
|
||||||
@@ -71,7 +82,7 @@ def main(audio_count: int):
|
|||||||
limit_mm_per_prompt={"audio": audio_count},
|
limit_mm_per_prompt={"audio": audio_count},
|
||||||
enforce_eager=True)
|
enforce_eager=True)
|
||||||
|
|
||||||
inputs = prepare_inputs(audio_count)
|
inputs = prepare_inputs(audio_count, audio_path1, audio_path2)
|
||||||
|
|
||||||
sampling_params = SamplingParams(temperature=0.2,
|
sampling_params = SamplingParams(temperature=0.2,
|
||||||
max_tokens=64,
|
max_tokens=64,
|
||||||
@@ -81,9 +92,14 @@ def main(audio_count: int):
|
|||||||
|
|
||||||
for o in outputs:
|
for o in outputs:
|
||||||
generated_text = o.outputs[0].text
|
generated_text = o.outputs[0].text
|
||||||
print(generated_text)
|
print("generated_text:", generated_text)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser(description="Arguments of rank table generator", )
|
||||||
|
parser.add_argument("--audio-path1", type=str, default="mary_had_lamb")
|
||||||
|
parser.add_argument("--audio-path2", type=str, default="winning_call")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
audio_count = 2
|
audio_count = 2
|
||||||
main(audio_count)
|
main(audio_count, args.audio_path1, args.audio_path2)
|
||||||
|
|||||||
Reference in New Issue
Block a user