[CI] drop ascend scheduler test (#4582)
Let's drop the ascend scheduler test first to ensure all functions work without it. - vLLM version: v0.11.2 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
1
.github/workflows/_e2e_test.yaml
vendored
1
.github/workflows/_e2e_test.yaml
vendored
@@ -94,7 +94,6 @@ jobs:
|
|||||||
pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
|
pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
|
||||||
pytest -sv tests/e2e/singlecard/test_bge_model.py
|
pytest -sv tests/e2e/singlecard/test_bge_model.py
|
||||||
pytest -sv tests/e2e/singlecard/test_camem.py
|
pytest -sv tests/e2e/singlecard/test_camem.py
|
||||||
pytest -sv tests/e2e/singlecard/test_chunked.py
|
|
||||||
pytest -sv tests/e2e/singlecard/test_embedding.py
|
pytest -sv tests/e2e/singlecard/test_embedding.py
|
||||||
# pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
|
# pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
|
||||||
pytest -sv tests/e2e/singlecard/test_guided_decoding.py
|
pytest -sv tests/e2e/singlecard/test_guided_decoding.py
|
||||||
|
|||||||
@@ -29,9 +29,6 @@ ADDITIONAL_CONFIG = [{
|
|||||||
"additional_config": {
|
"additional_config": {
|
||||||
"torchair_graph_config": {
|
"torchair_graph_config": {
|
||||||
"enabled": True
|
"enabled": True
|
||||||
},
|
|
||||||
"ascend_scheduler_config": {
|
|
||||||
"enabled": True,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}]
|
}]
|
||||||
|
|||||||
@@ -15,23 +15,14 @@ def test_e2e_ep_correctness(model_name):
|
|||||||
max_tokens = 5
|
max_tokens = 5
|
||||||
|
|
||||||
# FIXME: Really strange that chunked prefill might lead to different results, investigate further
|
# FIXME: Really strange that chunked prefill might lead to different results, investigate further
|
||||||
with VllmRunner(
|
with VllmRunner(model_name, tensor_parallel_size=2,
|
||||||
model_name,
|
enforce_eager=False) as vllm_model:
|
||||||
tensor_parallel_size=2,
|
|
||||||
additional_config={"ascend_scheduler_config": {
|
|
||||||
"enabled": True
|
|
||||||
}},
|
|
||||||
enforce_eager=False) as vllm_model:
|
|
||||||
tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||||
|
|
||||||
with VllmRunner(
|
with VllmRunner(model_name,
|
||||||
model_name,
|
tensor_parallel_size=2,
|
||||||
tensor_parallel_size=2,
|
enable_expert_parallel=True,
|
||||||
enable_expert_parallel=True,
|
enforce_eager=False) as vllm_model:
|
||||||
additional_config={"ascend_scheduler_config": {
|
|
||||||
"enabled": True
|
|
||||||
}},
|
|
||||||
enforce_eager=False) as vllm_model:
|
|
||||||
ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||||
|
|
||||||
check_outputs_equal(
|
check_outputs_equal(
|
||||||
|
|||||||
@@ -49,13 +49,7 @@ def test_generate_with_allgather():
|
|||||||
tensor_parallel_size=2,
|
tensor_parallel_size=2,
|
||||||
max_model_len=1024,
|
max_model_len=1024,
|
||||||
dtype="auto",
|
dtype="auto",
|
||||||
enable_expert_parallel=True,
|
enable_expert_parallel=True) as vllm_model:
|
||||||
additional_config={
|
|
||||||
"ascend_scheduler_config": {
|
|
||||||
"enabled": True,
|
|
||||||
"chunked_prefill_enabled": False,
|
|
||||||
},
|
|
||||||
}) as vllm_model:
|
|
||||||
vllm_model.generate(example_prompts, sampling_params)
|
vllm_model.generate(example_prompts, sampling_params)
|
||||||
|
|
||||||
|
|
||||||
@@ -76,11 +70,5 @@ def test_generate_with_alltoall():
|
|||||||
tensor_parallel_size=2,
|
tensor_parallel_size=2,
|
||||||
max_model_len=1024,
|
max_model_len=1024,
|
||||||
dtype="auto",
|
dtype="auto",
|
||||||
enable_expert_parallel=True,
|
enable_expert_parallel=True) as vllm_model:
|
||||||
additional_config={
|
|
||||||
"ascend_scheduler_config": {
|
|
||||||
"enabled": True,
|
|
||||||
"chunked_prefill_enabled": False,
|
|
||||||
},
|
|
||||||
}) as vllm_model:
|
|
||||||
vllm_model.generate(example_prompts, sampling_params)
|
vllm_model.generate(example_prompts, sampling_params)
|
||||||
|
|||||||
@@ -82,9 +82,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
|
|||||||
"enabled": True,
|
"enabled": True,
|
||||||
},
|
},
|
||||||
"enable_multistream_moe": True,
|
"enable_multistream_moe": True,
|
||||||
"ascend_scheduler_config": {
|
|
||||||
"enabled": True,
|
|
||||||
},
|
|
||||||
"refresh": True,
|
"refresh": True,
|
||||||
},
|
},
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
@@ -154,14 +151,9 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
|
|||||||
quantization="ascend",
|
quantization="ascend",
|
||||||
enforce_eager=True,
|
enforce_eager=True,
|
||||||
enable_expert_parallel=True,
|
enable_expert_parallel=True,
|
||||||
additional_config={
|
additional_config={"torchair_graph_config": {
|
||||||
"torchair_graph_config": {
|
"enabled": False,
|
||||||
"enabled": False,
|
}},
|
||||||
},
|
|
||||||
"ascend_scheduler_config": {
|
|
||||||
"enabled": True,
|
|
||||||
}
|
|
||||||
},
|
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
vllm_model.generate_greedy(prompts, max_tokens)
|
vllm_model.generate_greedy(prompts, max_tokens)
|
||||||
|
|
||||||
|
|||||||
@@ -84,67 +84,3 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
|
|||||||
name_0="vllm_output",
|
name_0="vllm_output",
|
||||||
name_1="prefix_cache_output",
|
name_1="prefix_cache_output",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip(reason="Fix me, the accuracy is not correct")
|
|
||||||
@pytest.mark.parametrize("model", MODELS)
|
|
||||||
@pytest.mark.parametrize("max_tokens", [50])
|
|
||||||
def test_prefix_cache_with_ascend_scheduler(model: str,
|
|
||||||
max_tokens: int) -> None:
|
|
||||||
|
|
||||||
with VllmRunner(model,
|
|
||||||
additional_config={
|
|
||||||
'ascend_scheduler_config': {
|
|
||||||
'enabled': True,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
enforce_eager=False,
|
|
||||||
max_model_len=2048,
|
|
||||||
tensor_parallel_size=2,
|
|
||||||
gpu_memory_utilization=0.7) as vllm_model:
|
|
||||||
vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
|
|
||||||
|
|
||||||
with VllmRunner(model,
|
|
||||||
additional_config={
|
|
||||||
'ascend_scheduler_config': {
|
|
||||||
'enabled': True,
|
|
||||||
'enable_prefix_caching': True,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
enforce_eager=False,
|
|
||||||
max_model_len=2048,
|
|
||||||
tensor_parallel_size=2,
|
|
||||||
gpu_memory_utilization=0.7) as vllm_model:
|
|
||||||
prefix_cache_output = vllm_model.generate_greedy(
|
|
||||||
INPUT_PROMPTS, max_tokens)
|
|
||||||
|
|
||||||
# TODO: enable apc and chunked prefill with ascend scheduler will lead accuracy problem.
|
|
||||||
# Disable it now. Fix it or drop the ascend scheduler in the future.
|
|
||||||
# with VllmRunner(model,
|
|
||||||
# additional_config={
|
|
||||||
# 'ascend_scheduler_config': {
|
|
||||||
# 'enabled': True,
|
|
||||||
# 'enable_prefix_caching': True,
|
|
||||||
# "enable_chunked_prefill": True,
|
|
||||||
# },
|
|
||||||
# },
|
|
||||||
# enforce_eager=True,
|
|
||||||
# max_model_len=2048,
|
|
||||||
# tensor_parallel_size=2,
|
|
||||||
# gpu_memory_utilization=0.7) as vllm_model:
|
|
||||||
# chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
|
|
||||||
# INPUT_PROMPTS, max_tokens)
|
|
||||||
|
|
||||||
check_outputs_equal(
|
|
||||||
outputs_0_lst=vllm_output,
|
|
||||||
outputs_1_lst=prefix_cache_output,
|
|
||||||
name_0="vllm_output",
|
|
||||||
name_1="prefix_cache_output",
|
|
||||||
)
|
|
||||||
|
|
||||||
# check_outputs_equal(
|
|
||||||
# outputs_0_lst=chunk_prefill_prefix_cache_output,
|
|
||||||
# outputs_1_lst=prefix_cache_output,
|
|
||||||
# name_0="chunk_prefill_prefix_cache_output",
|
|
||||||
# name_1="prefix_cache_output",
|
|
||||||
# )
|
|
||||||
|
|||||||
@@ -89,12 +89,6 @@ def test_models_distributed_Qwen3_NEXT_MTP_TP4_SIMILARITY():
|
|||||||
gpu_memory_utilization=0.8,
|
gpu_memory_utilization=0.8,
|
||||||
distributed_executor_backend="mp",
|
distributed_executor_backend="mp",
|
||||||
enforce_eager=True,
|
enforce_eager=True,
|
||||||
additional_config={
|
|
||||||
"ascend_scheduler_config": {
|
|
||||||
"enabled": True,
|
|
||||||
"enable_chunked_prefill": False
|
|
||||||
}
|
|
||||||
},
|
|
||||||
speculative_config={
|
speculative_config={
|
||||||
"method": "qwen3_next_mtp",
|
"method": "qwen3_next_mtp",
|
||||||
"num_speculative_tokens": 1
|
"num_speculative_tokens": 1
|
||||||
|
|||||||
@@ -44,9 +44,6 @@ def _deepseek_torchair_test_fixture(
|
|||||||
kwargs = {}
|
kwargs = {}
|
||||||
if not use_v1_schduler:
|
if not use_v1_schduler:
|
||||||
kwargs = {
|
kwargs = {
|
||||||
"ascend_scheduler_config": {
|
|
||||||
"enabled": True,
|
|
||||||
},
|
|
||||||
"refresh": True,
|
"refresh": True,
|
||||||
}
|
}
|
||||||
additional_config.update(**kwargs)
|
additional_config.update(**kwargs)
|
||||||
@@ -121,9 +118,6 @@ def _pangu_torchair_test_fixture(
|
|||||||
|
|
||||||
# torchair is only work without chunked-prefill now
|
# torchair is only work without chunked-prefill now
|
||||||
kwargs = {
|
kwargs = {
|
||||||
"ascend_scheduler_config": {
|
|
||||||
"enabled": True,
|
|
||||||
},
|
|
||||||
"refresh": True,
|
"refresh": True,
|
||||||
}
|
}
|
||||||
additional_config.update(**kwargs)
|
additional_config.update(**kwargs)
|
||||||
@@ -186,9 +180,6 @@ def _qwen_torchair_test_fixture(
|
|||||||
"torchair_graph_config": {
|
"torchair_graph_config": {
|
||||||
"enabled": False,
|
"enabled": False,
|
||||||
},
|
},
|
||||||
"ascend_scheduler_config": {
|
|
||||||
"enabled": True,
|
|
||||||
},
|
|
||||||
"refresh": True,
|
"refresh": True,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -245,9 +236,6 @@ def _deepseek_v2_lite_torchair_test_fixure(
|
|||||||
kwargs = {}
|
kwargs = {}
|
||||||
if not use_v1_schduler:
|
if not use_v1_schduler:
|
||||||
kwargs = {
|
kwargs = {
|
||||||
"ascend_scheduler_config": {
|
|
||||||
"enable": True,
|
|
||||||
},
|
|
||||||
"refresh": True,
|
"refresh": True,
|
||||||
}
|
}
|
||||||
additional_config.update(**kwargs)
|
additional_config.update(**kwargs)
|
||||||
|
|||||||
@@ -73,11 +73,7 @@ async def test_models(model: str, mode: str) -> None:
|
|||||||
"VLLM_RPC_TIMEOUT": "3600000",
|
"VLLM_RPC_TIMEOUT": "3600000",
|
||||||
"VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000"
|
"VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000"
|
||||||
}
|
}
|
||||||
additional_config: dict[str, Any] = {
|
additional_config: dict[str, Any] = {}
|
||||||
"ascend_scheduler_config": {
|
|
||||||
"enabled": False
|
|
||||||
},
|
|
||||||
}
|
|
||||||
speculative_config = {
|
speculative_config = {
|
||||||
"num_speculative_tokens": 2,
|
"num_speculative_tokens": 2,
|
||||||
"method": "deepseek_mtp"
|
"method": "deepseek_mtp"
|
||||||
|
|||||||
@@ -74,9 +74,6 @@ async def test_models(model: str) -> None:
|
|||||||
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
|
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
|
||||||
}
|
}
|
||||||
additional_config = {
|
additional_config = {
|
||||||
"ascend_scheduler_config": {
|
|
||||||
"enabled": False
|
|
||||||
},
|
|
||||||
"torchair_graph_config": {
|
"torchair_graph_config": {
|
||||||
"enabled": True,
|
"enabled": True,
|
||||||
"enable_multistream_moe": False,
|
"enable_multistream_moe": False,
|
||||||
|
|||||||
@@ -68,12 +68,7 @@ aisbench_cases75 = [{
|
|||||||
async def test_models(model: str) -> None:
|
async def test_models(model: str) -> None:
|
||||||
port = get_open_port()
|
port = get_open_port()
|
||||||
env_dict = {"TASK_QUEUE_ENABLE": "1", "HCCL_OP_EXPANSION_MODE": "AIV"}
|
env_dict = {"TASK_QUEUE_ENABLE": "1", "HCCL_OP_EXPANSION_MODE": "AIV"}
|
||||||
additional_config = {
|
additional_config = {"enable_weight_nz_layout": True}
|
||||||
"ascend_scheduler_config": {
|
|
||||||
"enabled": False
|
|
||||||
},
|
|
||||||
"enable_weight_nz_layout": True
|
|
||||||
}
|
|
||||||
server_args = [
|
server_args = [
|
||||||
"--quantization", "ascend", "--reasoning-parser", "qwen3",
|
"--quantization", "ascend", "--reasoning-parser", "qwen3",
|
||||||
"--tensor-parallel-size", "4", "--port",
|
"--tensor-parallel-size", "4", "--port",
|
||||||
|
|||||||
@@ -83,8 +83,7 @@ async def test_models(model: str, tp_size: int) -> None:
|
|||||||
"0.9", "--block-size", "128", "--max-num-seqs", "256",
|
"0.9", "--block-size", "128", "--max-num-seqs", "256",
|
||||||
"--enforce-eager", "--max-model-len", "35840",
|
"--enforce-eager", "--max-model-len", "35840",
|
||||||
"--max-num-batched-tokens", "35840", "--additional-config",
|
"--max-num-batched-tokens", "35840", "--additional-config",
|
||||||
'{"ascend_scheduler_config":{"enabled":true},"enable_weight_nz_layout":true}',
|
'{"enable_weight_nz_layout":true}', "--compilation-config",
|
||||||
"--compilation-config",
|
|
||||||
'{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,8,24,48,60]}'
|
'{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,8,24,48,60]}'
|
||||||
]
|
]
|
||||||
with RemoteOpenAIServer(model,
|
with RemoteOpenAIServer(model,
|
||||||
|
|||||||
@@ -33,7 +33,6 @@ MODES = [
|
|||||||
"single",
|
"single",
|
||||||
"aclgraph",
|
"aclgraph",
|
||||||
"aclgraph_mlapo",
|
"aclgraph_mlapo",
|
||||||
"no_chunkprefill",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
prompts = [
|
prompts = [
|
||||||
@@ -82,9 +81,6 @@ async def test_models(model: str, mode: str) -> None:
|
|||||||
"method": "deepseek_mtp"
|
"method": "deepseek_mtp"
|
||||||
}
|
}
|
||||||
additional_config = {
|
additional_config = {
|
||||||
"ascend_scheduler_config": {
|
|
||||||
"enabled": False
|
|
||||||
},
|
|
||||||
"torchair_graph_config": {
|
"torchair_graph_config": {
|
||||||
"enabled": True,
|
"enabled": True,
|
||||||
"enable_multistream_moe": False,
|
"enable_multistream_moe": False,
|
||||||
@@ -112,10 +108,6 @@ async def test_models(model: str, mode: str) -> None:
|
|||||||
if mode == "aclgraph_mlapo":
|
if mode == "aclgraph_mlapo":
|
||||||
env_dict["VLLM_ASCEND_ENABLE_MLAPO"] = "1"
|
env_dict["VLLM_ASCEND_ENABLE_MLAPO"] = "1"
|
||||||
additional_config["torchair_graph_config"] = {"enabled": False}
|
additional_config["torchair_graph_config"] = {"enabled": False}
|
||||||
if mode == "no_chunkprefill":
|
|
||||||
additional_config["ascend_scheduler_config"] = {"enabled": True}
|
|
||||||
i = server_args.index("--max-num-batched-tokens") + 1
|
|
||||||
server_args[i] = "36864"
|
|
||||||
server_args.extend(["--additional-config", json.dumps(additional_config)])
|
server_args.extend(["--additional-config", json.dumps(additional_config)])
|
||||||
request_keyword_args: dict[str, Any] = {
|
request_keyword_args: dict[str, Any] = {
|
||||||
**api_keyword_args,
|
**api_keyword_args,
|
||||||
@@ -134,7 +126,7 @@ async def test_models(model: str, mode: str) -> None:
|
|||||||
choices: list[openai.types.CompletionChoice] = batch.choices
|
choices: list[openai.types.CompletionChoice] = batch.choices
|
||||||
assert choices[0].text, "empty response"
|
assert choices[0].text, "empty response"
|
||||||
print(choices)
|
print(choices)
|
||||||
if mode in ["single", "no_chunkprefill"]:
|
if mode in ["single"]:
|
||||||
return
|
return
|
||||||
# aisbench test
|
# aisbench test
|
||||||
run_aisbench_cases(model,
|
run_aisbench_cases(model,
|
||||||
|
|||||||
@@ -71,9 +71,6 @@ async def test_models(model: str) -> None:
|
|||||||
"cudagraph_mode": "FULL_DECODE_ONLY"
|
"cudagraph_mode": "FULL_DECODE_ONLY"
|
||||||
}
|
}
|
||||||
additional_config: dict[str, Any] = {
|
additional_config: dict[str, Any] = {
|
||||||
"ascend_scheduler_config": {
|
|
||||||
"enabled": False
|
|
||||||
},
|
|
||||||
"torchair_graph_config": {
|
"torchair_graph_config": {
|
||||||
"enabled": True
|
"enabled": True
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -92,8 +92,7 @@ async def test_models(model: str, tp_size: int, dp_size: int,
|
|||||||
"--gpu-memory-utilization",
|
"--gpu-memory-utilization",
|
||||||
"0.9",
|
"0.9",
|
||||||
"--additional-config",
|
"--additional-config",
|
||||||
'{"ascend_scheduler_config":{"enabled":true},'
|
'{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}',
|
||||||
'"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}',
|
|
||||||
]
|
]
|
||||||
if full_graph:
|
if full_graph:
|
||||||
server_args += [
|
server_args += [
|
||||||
|
|||||||
@@ -85,9 +85,8 @@ async def test_models(model: str, tp_size: int) -> None:
|
|||||||
str(tp_size), "--port",
|
str(tp_size), "--port",
|
||||||
str(port), "--max-model-len", "30000", "--max-num-batched-tokens",
|
str(port), "--max-model-len", "30000", "--max-num-batched-tokens",
|
||||||
"40000", "--max-num-seqs", "400", "--trust-remote-code",
|
"40000", "--max-num-seqs", "400", "--trust-remote-code",
|
||||||
"--gpu-memory-utilization", "0.8", "--additional-config",
|
"--gpu-memory-utilization", "0.8", "--compilation_config",
|
||||||
'{"ascend_scheduler_config":{"enabled":false}}',
|
'{"cudagraph_mode": "FULL_DECODE_ONLY"}'
|
||||||
"--compilation_config", '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
|
|
||||||
]
|
]
|
||||||
request_keyword_args: dict[str, Any] = {
|
request_keyword_args: dict[str, Any] = {
|
||||||
**api_keyword_args,
|
**api_keyword_args,
|
||||||
|
|||||||
@@ -60,11 +60,7 @@ async def test_models(model: str) -> None:
|
|||||||
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
|
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
|
||||||
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
|
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
|
||||||
}
|
}
|
||||||
additional_config: dict[str, Any] = {
|
additional_config: dict[str, Any] = {}
|
||||||
"ascend_scheduler_config": {
|
|
||||||
"enabled": False
|
|
||||||
},
|
|
||||||
}
|
|
||||||
compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
|
compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
|
||||||
server_args = [
|
server_args = [
|
||||||
"--quantization", "ascend", "--async-scheduling",
|
"--quantization", "ascend", "--async-scheduling",
|
||||||
|
|||||||
@@ -63,11 +63,6 @@ async def test_models(model: str, mode: str) -> None:
|
|||||||
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
|
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
|
||||||
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
|
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
|
||||||
}
|
}
|
||||||
additional_config: dict[str, Any] = {
|
|
||||||
"ascend_scheduler_config": {
|
|
||||||
"enabled": False
|
|
||||||
},
|
|
||||||
}
|
|
||||||
compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
|
compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
|
||||||
server_args = [
|
server_args = [
|
||||||
"--quantization", "ascend", "--async-scheduling",
|
"--quantization", "ascend", "--async-scheduling",
|
||||||
@@ -82,7 +77,6 @@ async def test_models(model: str, mode: str) -> None:
|
|||||||
server_args.extend(
|
server_args.extend(
|
||||||
["--compilation-config",
|
["--compilation-config",
|
||||||
json.dumps(compilation_config)])
|
json.dumps(compilation_config)])
|
||||||
server_args.extend(["--additional-config", json.dumps(additional_config)])
|
|
||||||
request_keyword_args: dict[str, Any] = {
|
request_keyword_args: dict[str, Any] = {
|
||||||
**api_keyword_args,
|
**api_keyword_args,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -93,8 +93,6 @@ async def test_models(model: str, mode: str, tp_size: int) -> None:
|
|||||||
server_args.remove(
|
server_args.remove(
|
||||||
'{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}'
|
'{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}'
|
||||||
)
|
)
|
||||||
server_args.append("--additional-config")
|
|
||||||
server_args.append('{"ascend_scheduler_config":{"enabled":true}}')
|
|
||||||
server_args.append("--enforce-eager")
|
server_args.append("--enforce-eager")
|
||||||
request_keyword_args: dict[str, Any] = {
|
request_keyword_args: dict[str, Any] = {
|
||||||
**api_keyword_args,
|
**api_keyword_args,
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ deployment:
|
|||||||
--quantization ascend
|
--quantization ascend
|
||||||
--gpu-memory-utilization 0.9
|
--gpu-memory-utilization 0.9
|
||||||
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
|
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
|
||||||
--additional-config '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
--additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
||||||
|
|
||||||
-
|
-
|
||||||
server_cmd: >
|
server_cmd: >
|
||||||
@@ -51,7 +51,7 @@ deployment:
|
|||||||
--quantization ascend
|
--quantization ascend
|
||||||
--gpu-memory-utilization 0.9
|
--gpu-memory-utilization 0.9
|
||||||
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
|
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
|
||||||
--additional-config '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
--additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
||||||
benchmarks:
|
benchmarks:
|
||||||
acc:
|
acc:
|
||||||
case_type: accuracy
|
case_type: accuracy
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ deployment:
|
|||||||
--gpu-memory-utilization 0.9
|
--gpu-memory-utilization 0.9
|
||||||
--enforce-eager
|
--enforce-eager
|
||||||
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
|
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
|
||||||
--additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
--additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
||||||
|
|
||||||
-
|
-
|
||||||
server_cmd: >
|
server_cmd: >
|
||||||
@@ -53,5 +53,5 @@ deployment:
|
|||||||
--gpu-memory-utilization 0.9
|
--gpu-memory-utilization 0.9
|
||||||
--enforce-eager
|
--enforce-eager
|
||||||
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
|
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
|
||||||
--additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
--additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
||||||
benchmarks:
|
benchmarks:
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ deployment:
|
|||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||||
}'
|
}'
|
||||||
--additional-config
|
--additional-config
|
||||||
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||||
|
|
||||||
-
|
-
|
||||||
server_cmd: >
|
server_cmd: >
|
||||||
@@ -80,7 +80,7 @@ deployment:
|
|||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||||
}'
|
}'
|
||||||
--additional-config
|
--additional-config
|
||||||
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||||
-
|
-
|
||||||
server_cmd: >
|
server_cmd: >
|
||||||
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
|
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
|
||||||
@@ -111,7 +111,7 @@ deployment:
|
|||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||||
}'
|
}'
|
||||||
--additional-config
|
--additional-config
|
||||||
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||||
-
|
-
|
||||||
server_cmd: >
|
server_cmd: >
|
||||||
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
|
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
|
||||||
@@ -141,7 +141,7 @@ deployment:
|
|||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||||
}'
|
}'
|
||||||
--additional-config
|
--additional-config
|
||||||
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
|
||||||
benchmarks:
|
benchmarks:
|
||||||
perf:
|
perf:
|
||||||
case_type: performance
|
case_type: performance
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ deployment:
|
|||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||||
}'
|
}'
|
||||||
--additional-config
|
--additional-config
|
||||||
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
|
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
|
||||||
|
|
||||||
-
|
-
|
||||||
server_cmd: >
|
server_cmd: >
|
||||||
@@ -79,7 +79,7 @@ deployment:
|
|||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||||
}'
|
}'
|
||||||
--additional-config
|
--additional-config
|
||||||
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
|
'{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
|
||||||
-
|
-
|
||||||
server_cmd: >
|
server_cmd: >
|
||||||
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
|
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
|
||||||
@@ -110,7 +110,7 @@ deployment:
|
|||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||||
}'
|
}'
|
||||||
--additional-config
|
--additional-config
|
||||||
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
|
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
|
||||||
-
|
-
|
||||||
server_cmd: >
|
server_cmd: >
|
||||||
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
|
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
|
||||||
@@ -140,7 +140,7 @@ deployment:
|
|||||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||||
}'
|
}'
|
||||||
--additional-config
|
--additional-config
|
||||||
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
|
'{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
|
||||||
benchmarks:
|
benchmarks:
|
||||||
perf:
|
perf:
|
||||||
case_type: performance
|
case_type: performance
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ deployment:
|
|||||||
--trust-remote-code
|
--trust-remote-code
|
||||||
--no-enable-prefix-caching
|
--no-enable-prefix-caching
|
||||||
--gpu-memory-utilization 0.9
|
--gpu-memory-utilization 0.9
|
||||||
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
|
--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
|
||||||
|
|
||||||
-
|
-
|
||||||
server_cmd: >
|
server_cmd: >
|
||||||
@@ -49,5 +49,5 @@ deployment:
|
|||||||
--trust-remote-code
|
--trust-remote-code
|
||||||
--no-enable-prefix-caching
|
--no-enable-prefix-caching
|
||||||
--gpu-memory-utilization 0.92
|
--gpu-memory-utilization 0.92
|
||||||
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
|
--additional-config '{"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
|
||||||
benchmarks:
|
benchmarks:
|
||||||
|
|||||||
@@ -48,27 +48,26 @@ def mtp_correctness(sampling_config: SamplingParams,
|
|||||||
if graph_mode == CUDAGraphMode.FULL:
|
if graph_mode == CUDAGraphMode.FULL:
|
||||||
graph_mode_str = "FULL_DECODE_ONLY"
|
graph_mode_str = "FULL_DECODE_ONLY"
|
||||||
|
|
||||||
with VllmRunner(
|
with VllmRunner(model_name,
|
||||||
model_name,
|
tensor_parallel_size=1,
|
||||||
tensor_parallel_size=1,
|
max_num_seqs=256,
|
||||||
max_num_seqs=256,
|
gpu_memory_utilization=0.7,
|
||||||
gpu_memory_utilization=0.7,
|
distributed_executor_backend="mp",
|
||||||
distributed_executor_backend="mp",
|
enable_expert_parallel=True,
|
||||||
enable_expert_parallel=True,
|
speculative_config={
|
||||||
speculative_config={
|
"method":
|
||||||
"method": "deepseek_mtp",
|
"deepseek_mtp",
|
||||||
"num_speculative_tokens": num_speculative_tokens,
|
"num_speculative_tokens":
|
||||||
"disable_padded_drafter_batch": disable_padded_drafter_batch,
|
num_speculative_tokens,
|
||||||
},
|
"disable_padded_drafter_batch":
|
||||||
enforce_eager=enforce_eager,
|
disable_padded_drafter_batch,
|
||||||
max_model_len=2000,
|
},
|
||||||
compilation_config=CompilationConfig(
|
enforce_eager=enforce_eager,
|
||||||
cudagraph_mode=graph_mode_str,
|
max_model_len=2000,
|
||||||
cudagraph_capture_sizes=[12],
|
compilation_config=CompilationConfig(
|
||||||
),
|
cudagraph_mode=graph_mode_str,
|
||||||
additional_config={"ascend_scheduler_config": {
|
cudagraph_capture_sizes=[12],
|
||||||
"enabled": False
|
)) as spec_llm:
|
||||||
}}) as spec_llm:
|
|
||||||
spec_outputs = spec_llm.generate(example_prompts, sampling_config)
|
spec_outputs = spec_llm.generate(example_prompts, sampling_config)
|
||||||
|
|
||||||
matches = 0
|
matches = 0
|
||||||
|
|||||||
@@ -12,11 +12,6 @@ MODEL = "Qwen/Qwen3-0.6B"
|
|||||||
@pytest.mark.parametrize("enforce_eager", [True, False])
|
@pytest.mark.parametrize("enforce_eager", [True, False])
|
||||||
def test_concurrent_partial_prefill(enforce_eager):
|
def test_concurrent_partial_prefill(enforce_eager):
|
||||||
with VllmRunner(MODEL,
|
with VllmRunner(MODEL,
|
||||||
additional_config={
|
|
||||||
'ascend_scheduler_config': {
|
|
||||||
'enabled': True,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
max_num_seqs=3,
|
max_num_seqs=3,
|
||||||
max_num_batched_tokens=8192,
|
max_num_batched_tokens=8192,
|
||||||
enforce_eager=enforce_eager,
|
enforce_eager=enforce_eager,
|
||||||
@@ -31,11 +26,6 @@ def test_concurrent_partial_prefill(enforce_eager):
|
|||||||
@pytest.mark.parametrize("enforce_eager", [True, False])
|
@pytest.mark.parametrize("enforce_eager", [True, False])
|
||||||
def test_prefix_cache_stats_is_recorded(enforce_eager):
|
def test_prefix_cache_stats_is_recorded(enforce_eager):
|
||||||
with VllmRunner(MODEL,
|
with VllmRunner(MODEL,
|
||||||
additional_config={
|
|
||||||
'ascend_scheduler_config': {
|
|
||||||
'enabled': True,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
max_num_seqs=3,
|
max_num_seqs=3,
|
||||||
max_num_batched_tokens=8192,
|
max_num_batched_tokens=8192,
|
||||||
enforce_eager=enforce_eager,
|
enforce_eager=enforce_eager,
|
||||||
@@ -47,48 +37,6 @@ def test_prefix_cache_stats_is_recorded(enforce_eager):
|
|||||||
assert outputs[0].num_cached_tokens == 128
|
assert outputs[0].num_cached_tokens == 128
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("max_tokens",
|
|
||||||
[4]) # cannot align results when max_tokens > 4
|
|
||||||
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
|
|
||||||
def test_chunked_prefill_with_ascend_scheduler(
|
|
||||||
max_tokens: int, chunked_prefill_token_size: int) -> None:
|
|
||||||
example_prompts = [
|
|
||||||
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
|
|
||||||
]
|
|
||||||
max_num_seqs = chunked_prefill_token_size
|
|
||||||
max_num_batched_tokens = chunked_prefill_token_size
|
|
||||||
with VllmRunner(MODEL,
|
|
||||||
additional_config={
|
|
||||||
'ascend_scheduler_config': {
|
|
||||||
'enabled': True,
|
|
||||||
'enable_chunked_prefill': True,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
max_num_seqs=max_num_seqs,
|
|
||||||
max_num_batched_tokens=max_num_batched_tokens,
|
|
||||||
max_model_len=2048,
|
|
||||||
gpu_memory_utilization=0.7) as vllm_model:
|
|
||||||
chunked_prefill_output = vllm_model.generate_greedy(
|
|
||||||
example_prompts, max_tokens)
|
|
||||||
|
|
||||||
with VllmRunner(MODEL,
|
|
||||||
additional_config={
|
|
||||||
'ascend_scheduler_config': {
|
|
||||||
'enabled': True,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
max_model_len=2048,
|
|
||||||
gpu_memory_utilization=0.7) as vllm_model:
|
|
||||||
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
|
||||||
|
|
||||||
check_outputs_equal(
|
|
||||||
outputs_0_lst=vllm_output,
|
|
||||||
outputs_1_lst=chunked_prefill_output,
|
|
||||||
name_0="vllm_output",
|
|
||||||
name_1="chunked_prefill_output",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("max_tokens",
|
@pytest.mark.parametrize("max_tokens",
|
||||||
[4]) # cannot align results when max_tokens > 4
|
[4]) # cannot align results when max_tokens > 4
|
||||||
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
|
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
|
||||||
|
|||||||
@@ -1,82 +0,0 @@
|
|||||||
#
|
|
||||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
|
||||||
# Copyright 2023 The vLLM team.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
#
|
|
||||||
"""
|
|
||||||
Compare the outputs of vLLM with and without aclgraph.
|
|
||||||
|
|
||||||
Run `pytest tests/compile/test_aclgraph.py`.
|
|
||||||
"""
|
|
||||||
import gc
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import torch
|
|
||||||
from vllm import SamplingParams
|
|
||||||
|
|
||||||
from tests.e2e.conftest import VllmRunner
|
|
||||||
|
|
||||||
MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", MODELS)
|
|
||||||
@pytest.mark.parametrize("max_tokens", [1])
|
|
||||||
def test_models(
|
|
||||||
model: str,
|
|
||||||
max_tokens: int,
|
|
||||||
) -> None:
|
|
||||||
prompts = ["The president of the United States is"]
|
|
||||||
|
|
||||||
sampling_params = SamplingParams(
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
temperature=0.0,
|
|
||||||
)
|
|
||||||
|
|
||||||
with VllmRunner(model,
|
|
||||||
long_prefill_token_threshold=20,
|
|
||||||
enforce_eager=False) as vllm_model:
|
|
||||||
output1 = vllm_model.generate(prompts, sampling_params)
|
|
||||||
|
|
||||||
with VllmRunner(model,
|
|
||||||
enforce_eager=False,
|
|
||||||
additional_config={
|
|
||||||
'ascend_scheduler_config': {
|
|
||||||
'enabled': True
|
|
||||||
},
|
|
||||||
}) as vllm_model:
|
|
||||||
output2 = vllm_model.generate(prompts, sampling_params)
|
|
||||||
|
|
||||||
# Extract the generated token IDs for comparison
|
|
||||||
token_ids1 = output1[0][0][0]
|
|
||||||
token_ids2 = output2[0][0][0]
|
|
||||||
|
|
||||||
print(f"Token IDs 1: {token_ids1}")
|
|
||||||
print(f"Token IDs 2: {token_ids2}")
|
|
||||||
|
|
||||||
# Convert token IDs to tensors and calculate cosine similarity
|
|
||||||
# Take the length of a shorter sequence to ensure consistent dimensions
|
|
||||||
min_len = min(len(token_ids1), len(token_ids2))
|
|
||||||
|
|
||||||
tensor1 = torch.tensor(token_ids1[:min_len], dtype=torch.float32)
|
|
||||||
tensor2 = torch.tensor(token_ids2[:min_len], dtype=torch.float32)
|
|
||||||
|
|
||||||
# Calculate similarity using torch.cosine_similarity
|
|
||||||
similarity = torch.cosine_similarity(tensor1, tensor2, dim=0)
|
|
||||||
print(f"Token IDs cosine similarity: {similarity.item()}")
|
|
||||||
|
|
||||||
assert similarity > 0.95
|
|
||||||
|
|
||||||
gc.collect()
|
|
||||||
torch.npu.empty_cache()
|
|
||||||
torch.npu.reset_peak_memory_stats()
|
|
||||||
@@ -20,7 +20,6 @@
|
|||||||
|
|
||||||
Run `pytest tests/test_offline_inference.py`.
|
Run `pytest tests/test_offline_inference.py`.
|
||||||
"""
|
"""
|
||||||
import pytest
|
|
||||||
from vllm import SamplingParams
|
from vllm import SamplingParams
|
||||||
from vllm.assets.audio import AudioAsset
|
from vllm.assets.audio import AudioAsset
|
||||||
from vllm.assets.image import ImageAsset
|
from vllm.assets.image import ImageAsset
|
||||||
@@ -55,40 +54,6 @@ def test_multimodal_vl(prompt_template):
|
|||||||
assert output_str, "Generated output should not be empty."
|
assert output_str, "Generated output should not be empty."
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip(reason="This e2e test will stuck in multi-batch scenario. "
|
|
||||||
"Add this back after fixing the issue.")
|
|
||||||
def test_multimodal_ascend_scheduler(prompt_template):
|
|
||||||
image = ImageAsset("cherry_blossom") \
|
|
||||||
.pil_image.convert("RGB")
|
|
||||||
img_questions = [
|
|
||||||
"What is the content of this image?",
|
|
||||||
"Describe the content of this image in detail.",
|
|
||||||
"What's in the image?",
|
|
||||||
"Where is this image taken?",
|
|
||||||
]
|
|
||||||
images = [image] * len(img_questions)
|
|
||||||
prompts = prompt_template(img_questions)
|
|
||||||
with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct",
|
|
||||||
max_model_len=4096,
|
|
||||||
additional_config={
|
|
||||||
'ascend_scheduler_config': {
|
|
||||||
'enabled': True,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
mm_processor_kwargs={
|
|
||||||
"min_pixels": 28 * 28,
|
|
||||||
"max_pixels": 1280 * 28 * 28,
|
|
||||||
"fps": 1,
|
|
||||||
},
|
|
||||||
enforce_eager=True) as vllm_model:
|
|
||||||
outputs = vllm_model.generate_greedy(prompts=prompts,
|
|
||||||
images=images,
|
|
||||||
max_tokens=64)
|
|
||||||
assert len(outputs) == len(prompts)
|
|
||||||
for _, output_str in outputs:
|
|
||||||
assert output_str, "Generated output should not be empty."
|
|
||||||
|
|
||||||
|
|
||||||
def test_multimodal_audio():
|
def test_multimodal_audio():
|
||||||
audio_prompt = "".join([
|
audio_prompt = "".join([
|
||||||
f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
|
f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
|
||||||
|
|||||||
Reference in New Issue
Block a user