diff --git a/tests/e2e/multicard/test_external_launcher.py b/tests/e2e/multicard/test_external_launcher.py index d5441691..05851db1 100644 --- a/tests/e2e/multicard/test_external_launcher.py +++ b/tests/e2e/multicard/test_external_launcher.py @@ -148,8 +148,6 @@ def test_external_launcher_and_sleepmode(): print(output) - assert "TP RANKS: [0]" in output - assert "TP RANKS: [1]" in output assert "Generated text:" in output assert "Sleep and wake up successfully!!" in output assert proc.returncode == 0 @@ -198,8 +196,6 @@ def test_external_launcher_and_sleepmode_level2(): print(output) - assert "TP RANKS: [0]" in output - assert "TP RANKS: [1]" in output assert "Generated text:" in output assert "Sleep and wake up successfully!!" in output assert proc.returncode == 0 diff --git a/tests/e2e/singlecard/test_aclgraph.py b/tests/e2e/singlecard/test_aclgraph.py index efa6cb39..86ce4795 100644 --- a/tests/e2e/singlecard/test_aclgraph.py +++ b/tests/e2e/singlecard/test_aclgraph.py @@ -100,8 +100,6 @@ def test_models_with_aclgraph( ) -@pytest.mark.skip("Skipping this test for now, " - "it fails intermittently and needs investigation.") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [5]) def test_models_with_aclgraph_full_decode_only( @@ -172,7 +170,10 @@ def test_models_with_aclgraph_full_decode_only( model, max_model_len=1024, enforce_eager=False, - compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"}, + compilation_config={ + "cudagraph_capture_sizes": [4, 8, 32, 64], + "cudagraph_mode": "FULL_DECODE_ONLY" + }, ) as runner: vllm_aclgraph_outputs = runner.model.generate( prompts, sampling_params) @@ -180,7 +181,9 @@ def test_models_with_aclgraph_full_decode_only( with VllmRunner( model, max_model_len=1024, - enforce_eager=True, + compilation_config={ + "cudagraph_capture_sizes": [4, 8, 32, 64], + }, ) as runner: vllm_eager_outputs = runner.model.generate(prompts, sampling_params) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 167345bf..7f8fe1e1 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -108,8 +108,7 @@ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ascend_forward_context import (MoECommType, set_ascend_forward_context) from vllm_ascend.attention.attention_mask import AttentionMaskBuilder -from vllm_ascend.attention.attention_v1 import (AscendAttentionMetadataBuilder, - AscendAttentionState) +from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata, AscendPrefillContextParallelMetadata) # yapf: disable @@ -2887,12 +2886,12 @@ class NPUModelRunner(LoRAModelRunnerMixin): for attn_group in self.attn_groups[kv_cache_group_id]: builder = attn_group.get_metadata_builder() - if isinstance(builder, AscendAttentionMetadataBuilder): - attn_metadata_full_attention = builder.build_for_graph_capture( - common_attn_metadata, attn_state, self.get_model()) - elif isinstance(builder, GDNAttentionMetadataBuilder): + if isinstance(builder, GDNAttentionMetadataBuilder): attn_metadata_gdn_attention = builder.build_for_cudagraph_capture( common_metadata) + else: + attn_metadata_full_attention = builder.build_for_graph_capture( + common_attn_metadata, attn_state, self.get_model()) for layer_name in kv_cache_group_spec.layer_names: if "linear_attn" in layer_name: attn_metadata[