[BugFix] Fix some issues caused by the ascending order of cudagraph_capture_sizes (#4338)
### What this PR does / why we need it?
In [#26016](https://github.com/vllm-project/vllm/pull/26016), vLLM
changed `cudagraph_capture_sizes` to be in ascending order. This PR
fixes the related issues caused by that change.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
- vLLM version: v0.11.0
- vLLM main:
2918c1b49c
---------
Signed-off-by: Angazenn <supperccell@163.com>
This commit is contained in:
@@ -109,7 +109,7 @@ def _run_worker_process(
|
|||||||
llm = LLM(
|
llm = LLM(
|
||||||
model=model_path,
|
model=model_path,
|
||||||
quantization="ascend" if "W8A8" in model_path else None,
|
quantization="ascend" if "W8A8" in model_path else None,
|
||||||
# enable_expert_parallel=True if "DeepSeek" in model_path else False,
|
enable_expert_parallel=True if "DeepSeek" in model_path else False,
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -433,7 +433,13 @@ def _is_default_capture_sizes(vllm_config: VllmConfig) -> bool:
|
|||||||
cudagraph_capture_sizes += list(
|
cudagraph_capture_sizes += list(
|
||||||
range(256, max_cudagraph_capture_size + 1, 16))
|
range(256, max_cudagraph_capture_size + 1, 16))
|
||||||
|
|
||||||
if sorted(cudagraph_capture_sizes, reverse=True) == \
|
if vllm_version_is("0.11.0"):
|
||||||
|
target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes,
|
||||||
|
reverse=True)
|
||||||
|
else:
|
||||||
|
# in newer versions, vLLM uses ascending order for cudagraph_capture_sizes.
|
||||||
|
target_cudagraph_capture_sizes = sorted(cudagraph_capture_sizes)
|
||||||
|
if target_cudagraph_capture_sizes == \
|
||||||
vllm_config.compilation_config.cudagraph_capture_sizes:
|
vllm_config.compilation_config.cudagraph_capture_sizes:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|||||||
@@ -664,7 +664,11 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
# tokens is less than or equal to mc2_tokens_capacity. According to _set_cudagraph_sizes,
|
# tokens is less than or equal to mc2_tokens_capacity. According to _set_cudagraph_sizes,
|
||||||
# the max number of tokens in graph is min(max_num_seqs * uniform_decode_query_len, 512).
|
# the max number of tokens in graph is min(max_num_seqs * uniform_decode_query_len, 512).
|
||||||
if self.compilation_config.cudagraph_capture_sizes:
|
if self.compilation_config.cudagraph_capture_sizes:
|
||||||
max_num_tokens = self.compilation_config.cudagraph_capture_sizes[0]
|
if vllm_version_is("0.11.0"):
|
||||||
|
max_num_tokens = self.compilation_config.cudagraph_capture_sizes[
|
||||||
|
0]
|
||||||
|
else:
|
||||||
|
max_num_tokens = self.compilation_config.max_cudagraph_capture_size
|
||||||
else:
|
else:
|
||||||
# NOTE: To save memory, we cap the max number of tokens to 512.
|
# NOTE: To save memory, we cap the max number of tokens to 512.
|
||||||
max_num_tokens = min(
|
max_num_tokens = min(
|
||||||
|
|||||||
Reference in New Issue
Block a user