Revert "drop ascend scheduler" (#4580)

Reverts vllm-project/vllm-ascend#4498
- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2
This commit is contained in:
Mengqing Cao
2025-11-29 22:20:48 +08:00
committed by GitHub
parent 4dbe4fd123
commit 517fd9272d
52 changed files with 2948 additions and 85 deletions

View File

@@ -48,26 +48,27 @@ def mtp_correctness(sampling_config: SamplingParams,
if graph_mode == CUDAGraphMode.FULL:
graph_mode_str = "FULL_DECODE_ONLY"
with VllmRunner(model_name,
tensor_parallel_size=1,
max_num_seqs=256,
gpu_memory_utilization=0.7,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method":
"deepseek_mtp",
"num_speculative_tokens":
num_speculative_tokens,
"disable_padded_drafter_batch":
disable_padded_drafter_batch,
},
enforce_eager=enforce_eager,
max_model_len=2000,
compilation_config=CompilationConfig(
cudagraph_mode=graph_mode_str,
cudagraph_capture_sizes=[12],
)) as spec_llm:
with VllmRunner(
model_name,
tensor_parallel_size=1,
max_num_seqs=256,
gpu_memory_utilization=0.7,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": num_speculative_tokens,
"disable_padded_drafter_batch": disable_padded_drafter_batch,
},
enforce_eager=enforce_eager,
max_model_len=2000,
compilation_config=CompilationConfig(
cudagraph_mode=graph_mode_str,
cudagraph_capture_sizes=[12],
),
additional_config={"ascend_scheduler_config": {
"enabled": False
}}) as spec_llm:
spec_outputs = spec_llm.generate(example_prompts, sampling_config)
matches = 0