[BugFix] Fix num_pcp_pads Assignment Issues (#5273)
### What this PR does / why we need it?
The variable `self.num_pcp_pads` was incorrectly truncated during
assignment, causing errors in certain scenarios such as PD
disaggregation. This PR resolves the issue.
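For context, this is the usual failure mode for padded context-parallel layouts: each request's sequence is padded so it splits evenly across the PCP ranks, and truncating the per-request pad counts during assignment breaks any consumer that expects the full batch. A minimal sketch of the idea, where all names (`compute_num_pcp_pads`, `seq_lens`, `pcp_size`) are hypothetical and not taken from the vllm-ascend source:

```python
# Hypothetical illustration; not the actual vllm-ascend code.
def compute_num_pcp_pads(seq_lens: list[int], pcp_size: int) -> list[int]:
    """Per-request pad counts so each sequence length becomes a
    multiple of pcp_size and shards evenly across the PCP ranks."""
    return [(-n) % pcp_size for n in seq_lens]

# With pcp_size=2, lengths [5, 8, 3] need [1, 0, 1] pad tokens.
# Truncating such a list during assignment (e.g. slicing it to a
# stale request count) silently drops the padding for later
# requests, which is the class of bug described for
# `self.num_pcp_pads`.
assert compute_num_pcp_pads([5, 8, 3], 2) == [1, 0, 1]
```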
### Does this PR introduce _any_ user-facing change?
NO
### How was this patch tested?
New e2e tests covering PCP and DCP configurations (`test_pcp_*` and `test_dcp_*`, see the diff below) are added in this patch.

Co-authored-by: QiuChunshuo <qiuchunshuo@huawei.com>
- vLLM version: release/v0.13.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: daishixun <dsxsteven@sina.com>
Co-authored-by: weijinqian0 <1184188277@qq.com>
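The patch also extends the e2e suite with six new cases exercising prefill context parallelism (PCP) and decode context parallelism (DCP) under eager execution, full decode-only graph capture, and piece-wise compilation: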
```diff
@@ -135,3 +135,137 @@ def test_models_pcp_dcp_piece_wise():
                     block_size=128,
                     quantization="ascend") as runner:
         runner.model.generate(prompts, sampling_params)


 @pytest.mark.skipif(vllm_version_is('0.12.0'),
                     reason="0.12.0 is not supported for context sequence.")
 def test_pcp_basic():
     prompts = [
         "The capital of France is", "Hello, my name is Tom, I am",
         "The president of United States is", "AI future is"
     ]
     model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
     with VllmRunner(model,
                     enforce_eager=True,
                     max_model_len=1024,
                     tensor_parallel_size=2,
                     prefill_context_parallel_size=2,
                     decode_context_parallel_size=1,
                     max_num_batched_tokens=1024,
                     enable_expert_parallel=True,
                     block_size=128) as runner:
         runner.model.generate(prompts, sampling_params)


 @pytest.mark.skipif(vllm_version_is('0.12.0'),
                     reason="0.12.0 is not supported for context sequence.")
 def test_pcp_full_graph():
     prompts = [
         "The capital of France is", "Hello, my name is Tom, I am",
         "The president of United States is", "AI future is"
     ]
     model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
     with VllmRunner(model,
                     enforce_eager=False,
                     max_model_len=1024,
                     tensor_parallel_size=2,
                     prefill_context_parallel_size=2,
                     decode_context_parallel_size=1,
                     max_num_batched_tokens=1024,
                     enable_expert_parallel=True,
                     block_size=128,
                     compilation_config={
                         "cudagraph_mode": "FULL_DECODE_ONLY",
                         "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
                     }) as runner:
         runner.model.generate(prompts, sampling_params)


 @pytest.mark.skipif(vllm_version_is('0.12.0'),
                     reason="0.12.0 is not supported for context sequence.")
 def test_pcp_piece_wise():
     prompts = [
         "The capital of France is", "Hello, my name is Tom, I am",
         "The president of United States is", "AI future is"
     ]
     model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
     with VllmRunner(model,
                     enforce_eager=False,
                     max_model_len=1024,
                     tensor_parallel_size=2,
                     prefill_context_parallel_size=2,
                     decode_context_parallel_size=1,
                     max_num_batched_tokens=1024,
                     enable_expert_parallel=True,
                     block_size=128) as runner:
         runner.model.generate(prompts, sampling_params)


 @pytest.mark.skipif(vllm_version_is('0.12.0'),
                     reason="0.12.0 is not supported for context sequence.")
 def test_dcp_basic():
     prompts = [
         "The capital of France is", "Hello, my name is Tom, I am",
         "The president of United States is", "AI future is"
     ]
     model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
     with VllmRunner(model,
                     enforce_eager=True,
                     max_model_len=1024,
                     tensor_parallel_size=4,
                     prefill_context_parallel_size=1,
                     decode_context_parallel_size=2,
                     max_num_batched_tokens=1024,
                     enable_expert_parallel=True,
                     block_size=128) as runner:
         runner.model.generate(prompts, sampling_params)


 @pytest.mark.skipif(vllm_version_is('0.12.0'),
                     reason="0.12.0 is not supported for context sequence.")
 def test_dcp_full_graph():
     prompts = [
         "The capital of France is", "Hello, my name is Tom, I am",
         "The president of United States is", "AI future is"
     ]
     model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
     with VllmRunner(model,
                     enforce_eager=False,
                     max_model_len=1024,
                     tensor_parallel_size=4,
                     prefill_context_parallel_size=1,
                     decode_context_parallel_size=2,
                     max_num_batched_tokens=1024,
                     enable_expert_parallel=True,
                     block_size=128,
                     compilation_config={
                         "cudagraph_mode": "FULL_DECODE_ONLY",
                         "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
                     }) as runner:
         runner.model.generate(prompts, sampling_params)


 @pytest.mark.skipif(vllm_version_is('0.12.0'),
                     reason="0.12.0 is not supported for context sequence.")
 def test_dcp_piece_wise():
     prompts = [
         "The capital of France is", "Hello, my name is Tom, I am",
         "The president of United States is", "AI future is"
     ]
     model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
     sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
     with VllmRunner(model,
                     enforce_eager=False,
                     max_model_len=1024,
                     tensor_parallel_size=4,
                     prefill_context_parallel_size=1,
                     decode_context_parallel_size=2,
                     max_num_batched_tokens=1024,
                     enable_expert_parallel=True,
                     block_size=128) as runner:
         runner.model.generate(prompts, sampling_params)
```
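Note the pairing in the matrix: the PCP cases run `tensor_parallel_size=2` with `prefill_context_parallel_size=2`, while the DCP cases run `tensor_parallel_size=4` with `decode_context_parallel_size=2`, and each pairing is exercised in eager mode, with a `FULL_DECODE_ONLY` graph, and with the default piece-wise compilation, which covers the padding fix across all compilation paths.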