diff --git a/tests/ut/test_ascend_config.py b/tests/ut/test_ascend_config.py
index 6146960..3d0cee8 100644
--- a/tests/ut/test_ascend_config.py
+++ b/tests/ut/test_ascend_config.py
@@ -5,8 +5,8 @@ from unittest import mock
 
 from transformers import PretrainedConfig
 from vllm.config import ModelConfig, VllmConfig
 
-from vllm_ascend.ascend_config import (check_ascend_config,
-                                       check_torchair_supported,
+from vllm_ascend.ascend_config import (_check_torchair_supported,
+                                       check_ascend_config,
                                        clear_ascend_config, get_ascend_config,
                                        init_ascend_config)
@@ -248,5 +248,5 @@ class TestAscendConfig(unittest.TestCase):
         test_cases = [('deepseek_v3', True), ('PanguProMoE', True),
                       ('qwen', False), ('llama', False)]
         for model_type, expected_output in test_cases:
-            self.assertEqual(check_torchair_supported(model_type),
+            self.assertEqual(_check_torchair_supported(model_type),
                              expected_output)
diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py
index c5c4d12..82ac32a 100644
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -21,7 +21,7 @@ from vllm.logger import logger
 TORCHAIR_MODEL_LIST = ["deepseek", "pangu"]
 
 
-def check_torchair_supported(model_type: str):
+def _check_torchair_supported(model_type: str):
     for supported_model in TORCHAIR_MODEL_LIST:
         if supported_model in model_type.lower():
             return True
@@ -147,10 +147,10 @@ def check_ascend_config(vllm_config, enforce_eager):
     else:
         # torchair_graph case
         if ascend_config.torchair_graph_config.enabled:
-            # torchair_graph is supported for deepseek model only currently.
+            # torchair_graph is supported for deepseek/pangu model only.
             if vllm_config.model_config:
                 model_type = vllm_config.model_config.hf_config.model_type
-                if not check_torchair_supported(model_type):
+                if not _check_torchair_supported(model_type):
                     raise NotImplementedError(
                         "Torchair graph mode only works with following model types:"
                         f"{TORCHAIR_MODEL_LIST}.")
diff --git a/vllm_ascend/attention/attention_v1_torchair.py b/vllm_ascend/attention/attention_v1_torchair.py
index ef810ba..46f1708 100644
--- a/vllm_ascend/attention/attention_v1_torchair.py
+++ b/vllm_ascend/attention/attention_v1_torchair.py
@@ -27,7 +27,6 @@ from vllm.attention.backends.utils import PAD_SLOT_ID, CommonAttentionState
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.worker.gpu_input_batch import InputBatch
 
-from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
                                nd_to_nz_2d)
@@ -160,8 +159,6 @@ class AscendAttentionTorchairMetadataBuilder:
 
     def __init__(self, runner):
         self.runner = runner
-        self.torchair_graph_enabled = get_ascend_config(
-        ).torchair_graph_config.enabled
 
     def reorder_batch(self, input_batch: "InputBatch",
                       scheduler_output: "SchedulerOutput") -> bool:
diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py
index 725ebf7..c9fd8f2 100644
--- a/vllm_ascend/ops/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe.py
@@ -26,8 +26,7 @@ from vllm.config import get_current_vllm_config
 from vllm.distributed import (GroupCoordinator, get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
                               tensor_model_parallel_all_reduce)
-from vllm.distributed.parallel_state import (get_dp_group, get_tp_group,
-                                             get_world_group)
+from vllm.distributed.parallel_state import get_dp_group, get_tp_group
 from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)
@@ -1119,21 +1118,12 @@ class AscendFusedMoE(FusedMoE):
 
         vllm_config = get_current_vllm_config()
 
-        if vllm_version_is("0.9.1"):
-            self.moe_parallel_config = FusedMoEParallelConfig.make(
-                tp_size_=(tp_size if tp_size is not None else
-                          get_tensor_model_parallel_world_size()),
-                dp_size_=(dp_size if dp_size is not None else
-                          get_dp_group().world_size),
-                vllm_parallel_config=vllm_config.parallel_config)
-        else:
-            self.moe_parallel_config = FusedMoEParallelConfig.make(
-                tp_size_=(tp_size if tp_size is not None else
-                          get_tensor_model_parallel_world_size()),
-                dp_size_=(dp_size if dp_size is not None else
-                          get_dp_group().world_size),
-                world_size_=get_world_group().world_size,
-                vllm_parallel_config=vllm_config.parallel_config)
+        self.moe_parallel_config = FusedMoEParallelConfig.make(
+            tp_size_=(tp_size if tp_size is not None else
+                      get_tensor_model_parallel_world_size()),
+            dp_size_=(dp_size
+                      if dp_size is not None else get_dp_group().world_size),
+            vllm_parallel_config=vllm_config.parallel_config)
 
         self.top_k = top_k
         self.num_experts = num_experts
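
For reviewers, a self-contained sketch of what the renamed helper does and the behavior the updated unit test asserts. `TORCHAIR_MODEL_LIST` and the test cases are taken verbatim from the hunks above; the loop body is collapsed here into an equivalent `any(...)` expression for brevity:

```python
# Sketch of the renamed helper from vllm_ascend/ascend_config.py. The leading
# underscore marks it as module-private; matching is a case-insensitive
# substring test, so "deepseek_v3" and "PanguProMoE" both qualify.
TORCHAIR_MODEL_LIST = ["deepseek", "pangu"]


def _check_torchair_supported(model_type: str) -> bool:
    return any(supported in model_type.lower()
               for supported in TORCHAIR_MODEL_LIST)


# The cases asserted by tests/ut/test_ascend_config.py:
for model_type, expected in [('deepseek_v3', True), ('PanguProMoE', True),
                             ('qwen', False), ('llama', False)]:
    assert _check_torchair_supported(model_type) is expected
```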