Remove useless env (#4858)

Clean up unused environment variables. The following variables are no longer referenced anywhere in the codebase:

`VLLM_ASCEND_TRACE_RECOMPILES`,
`VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE`,
`VLLM_ASCEND_MLA_PA`, `PHYSICAL_DEVICES`

- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-12-11 06:51:07 +08:00
committed by GitHub
parent 08441baedd
commit f917d5edcf
4 changed files with 0 additions and 28 deletions

View File

@@ -138,7 +138,6 @@ def test_models_distributed_Qwen3_W4A8DYNAMIC_new_version(model):
@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_MLA_PA": "1"})
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
prompts = [

View File

@@ -1055,7 +1055,6 @@ class MockTransferEngine:
class MockEnvsAscend:
MOONCAKE_CONNECTOR_PROTOCOL = "mock_protocol"
PHYSICAL_DEVICES = "10,11"
def mock_get_tensor_model_parallel_rank():

View File

@@ -893,12 +893,3 @@ class TestMooncakeLayerwiseConnectorWorker(unittest.TestCase):
worker.register_kv_caches(mla_caches)
self.assertTrue(worker.use_mla)
self.assertEqual(len(worker.block_len), 2)
def test_device_id_selection_with_physical_devices(self):
worker = MooncakeLayerwiseConnectorWorker(self.vllm_config,
self.engine_id)
self.assertIsNotNone(worker.engine)
if __name__ == '__main__':
unittest.main()

View File

@@ -68,9 +68,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
# that the correct package is installed.
"VLLM_VERSION":
lambda: os.getenv("VLLM_VERSION", None),
# Whether to enable the trace recompiles from pytorch.
"VLLM_ASCEND_TRACE_RECOMPILES":
lambda: bool(int(os.getenv("VLLM_ASCEND_TRACE_RECOMPILES", '0'))),
# Whether to enable fused_experts_allgather_ep. MoeInitRoutingV3 and
# GroupedMatmulFinalizeRouting operators are combined to implement EP.
"VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP":
@@ -86,16 +83,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
# value to False to disable the optimized model.
"USE_OPTIMIZED_MODEL":
lambda: bool(int(os.getenv('USE_OPTIMIZED_MODEL', '1'))),
# The tolerance of the kv cache size, if the difference between the
# actual kv cache size and the cached kv cache size is less than this value,
# then the cached kv cache size will be used.
"VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE":
lambda: int(
os.getenv("VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE", 64)),
# Whether to enable mla_pa for deepseek mla decode, this flag will be removed after its available torch_npu is public accessible
# and the mla_pa will be the default path of deepseek decode path.
"VLLM_ASCEND_MLA_PA":
lambda: int(os.getenv("VLLM_ASCEND_MLA_PA", 0)),
# Whether to enable MatmulAllReduce fusion kernel when tensor parallel is enabled.
# this feature is supported in A2, and eager mode will get better performance.
"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE":
@@ -130,10 +117,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
# this feature in eager mode will get better performance.
"VLLM_ASCEND_ENABLE_MLP_OPTIMIZE":
lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLP_OPTIMIZE", '0'))),
# Determine the number of physical devices in a non-full-use scenario
# caused by the initialization of the Mooncake connector.
"PHYSICAL_DEVICES":
lambda: os.getenv("PHYSICAL_DEVICES", None),
# Whether to enable msMonitor tool to monitor the performance of vllm-ascend.
"MSMONITOR_USE_DAEMON":
lambda: bool(int(os.getenv("MSMONITOR_USE_DAEMON", '0'))),