fix fullgraph in ds. (#4016)
### What this PR does / why we need it?
DeepSeek models do not have an 'AscendAttentionMetadataBuilder' class, so they fail in fullgraph mode.
We resolved the issue by modifying the code to check only for
'GDNAttentionMetadataBuilder', while all other attention cases follow
the default branch.
- vLLM version: v0.11.0
- vLLM main:
83f478bb19
Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
This commit is contained in:
@@ -148,8 +148,6 @@ def test_external_launcher_and_sleepmode():

    print(output)

    assert "TP RANKS: [0]" in output
    assert "TP RANKS: [1]" in output
    assert "Generated text:" in output
    assert "Sleep and wake up successfully!!" in output
    assert proc.returncode == 0
@@ -198,8 +196,6 @@ def test_external_launcher_and_sleepmode_level2():

    print(output)

    assert "TP RANKS: [0]" in output
    assert "TP RANKS: [1]" in output
    assert "Generated text:" in output
    assert "Sleep and wake up successfully!!" in output
    assert proc.returncode == 0

@@ -100,8 +100,6 @@ def test_models_with_aclgraph(
    )


@pytest.mark.skip("Skipping this test for now, "
                  "it fails intermittently and needs investigation.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [5])
def test_models_with_aclgraph_full_decode_only(
@@ -172,7 +170,10 @@ def test_models_with_aclgraph_full_decode_only(
            model,
            max_model_len=1024,
            enforce_eager=False,
-           compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
+           compilation_config={
+               "cudagraph_capture_sizes": [4, 8, 32, 64],
+               "cudagraph_mode": "FULL_DECODE_ONLY"
+           },
    ) as runner:
        vllm_aclgraph_outputs = runner.model.generate(
            prompts, sampling_params)
@@ -180,7 +181,9 @@ def test_models_with_aclgraph_full_decode_only(
    with VllmRunner(
            model,
            max_model_len=1024,
            enforce_eager=True,
+           compilation_config={
+               "cudagraph_capture_sizes": [4, 8, 32, 64],
+           },
    ) as runner:
        vllm_eager_outputs = runner.model.generate(prompts,
                                                   sampling_params)

Reference in New Issue
Block a user