diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py
index 529cc952..a13276bb 100644
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -138,7 +138,6 @@ def test_models_distributed_Qwen3_W4A8DYNAMIC_new_version(model):
 
 
 @pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
-@patch.dict(os.environ, {"VLLM_ASCEND_MLA_PA": "1"})
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
 def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
     prompts = [
diff --git a/tests/ut/kv_connector/test_mooncake_connector.py b/tests/ut/kv_connector/test_mooncake_connector.py
index 27f2e1e3..3b305e9e 100644
--- a/tests/ut/kv_connector/test_mooncake_connector.py
+++ b/tests/ut/kv_connector/test_mooncake_connector.py
@@ -1055,7 +1055,6 @@ class MockTransferEngine:
 
 class MockEnvsAscend:
     MOONCAKE_CONNECTOR_PROTOCOL = "mock_protocol"
-    PHYSICAL_DEVICES = "10,11"
 
 
 def mock_get_tensor_model_parallel_rank():
diff --git a/tests/ut/kv_connector/test_mooncake_layerwise_connector.py b/tests/ut/kv_connector/test_mooncake_layerwise_connector.py
index ca4f975c..e2f84d9f 100644
--- a/tests/ut/kv_connector/test_mooncake_layerwise_connector.py
+++ b/tests/ut/kv_connector/test_mooncake_layerwise_connector.py
@@ -893,12 +893,3 @@ class TestMooncakeLayerwiseConnectorWorker(unittest.TestCase):
         worker.register_kv_caches(mla_caches)
         self.assertTrue(worker.use_mla)
         self.assertEqual(len(worker.block_len), 2)
-
-    def test_device_id_selection_with_physical_devices(self):
-        worker = MooncakeLayerwiseConnectorWorker(self.vllm_config,
-                                                  self.engine_id)
-        self.assertIsNotNone(worker.engine)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py
index 8191c82d..72db1791 100644
--- a/vllm_ascend/envs.py
+++ b/vllm_ascend/envs.py
@@ -68,9 +68,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
     # that the correct package is installed.
     "VLLM_VERSION":
     lambda: os.getenv("VLLM_VERSION", None),
-    # Whether to enable the trace recompiles from pytorch.
-    "VLLM_ASCEND_TRACE_RECOMPILES":
-    lambda: bool(int(os.getenv("VLLM_ASCEND_TRACE_RECOMPILES", '0'))),
     # Whether to enable fused_experts_allgather_ep. MoeInitRoutingV3 and
     # GroupedMatmulFinalizeRouting operators are combined to implement EP.
     "VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP":
@@ -86,16 +83,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
     # value to False to disable the optimized model.
     "USE_OPTIMIZED_MODEL":
     lambda: bool(int(os.getenv('USE_OPTIMIZED_MODEL', '1'))),
-    # The tolerance of the kv cache size, if the difference between the
-    # actual kv cache size and the cached kv cache size is less than this value,
-    # then the cached kv cache size will be used.
-    "VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE":
-    lambda: int(
-        os.getenv("VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE", 64)),
-    # Whether to enable mla_pa for deepseek mla decode, this flag will be removed after its available torch_npu is public accessible
-    # and the mla_pa will be the default path of deepseek decode path.
-    "VLLM_ASCEND_MLA_PA":
-    lambda: int(os.getenv("VLLM_ASCEND_MLA_PA", 0)),
     # Whether to enable MatmulAllReduce fusion kernel when tensor parallel is enabled.
     # this feature is supported in A2, and eager mode will get better performance.
     "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE":
@@ -130,10 +117,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
     # this feature in eager mode will get better performance.
     "VLLM_ASCEND_ENABLE_MLP_OPTIMIZE":
     lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLP_OPTIMIZE", '0'))),
-    # Determine the number of physical devices in a non-full-use scenario
-    # caused by the initialization of the Mooncake connector.
-    "PHYSICAL_DEVICES":
-    lambda: os.getenv("PHYSICAL_DEVICES", None),
     # Whether to enable msMonitor tool to monitor the performance of vllm-ascend.
     "MSMONITOR_USE_DAEMON":
     lambda: bool(int(os.getenv("MSMONITOR_USE_DAEMON", '0'))),