Remove useless env (#4858)
Clean up unused envs. The following envs are not used anymore:
`VLLM_ASCEND_TRACE_RECOMPILES`,
`VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE`,
`VLLM_ASCEND_MLA_PA`, `PHYSICAL_DEVICES`
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
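For anyone upgrading, a quick way to spot stale settings is to check whether any of the removed names are still exported. A minimal, purely illustrative check (not part of this change):

```python
import os
import warnings

# Env vars removed by this commit; setting them no longer has any effect.
REMOVED_ENVS = (
    "VLLM_ASCEND_TRACE_RECOMPILES",
    "VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE",
    "VLLM_ASCEND_MLA_PA",
    "PHYSICAL_DEVICES",
)

for name in REMOVED_ENVS:
    if name in os.environ:
        warnings.warn(f"{name} is set but is no longer read by vllm-ascend")
```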
@@ -138,7 +138,6 @@ def test_models_distributed_Qwen3_W4A8DYNAMIC_new_version(model):
@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_MLA_PA": "1"})
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
    prompts = [
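For context, `@patch.dict(os.environ, ...)` comes from `unittest.mock` and overlays the given entries on `os.environ` only while the decorated test runs, restoring the original mapping afterwards. A minimal, self-contained sketch of the same mechanism (not taken from the test file):

```python
import os
from unittest.mock import patch

# patch.dict overlays the given entries on os.environ inside the block
# and restores the original mapping when the block exits.
with patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"}):
    assert os.environ["HCCL_BUFFSIZE"] == "1024"
# Outside the block the overlay is gone (unless it was already set before).
```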
@@ -1055,7 +1055,6 @@ class MockTransferEngine:
class MockEnvsAscend:
    MOONCAKE_CONNECTOR_PROTOCOL = "mock_protocol"
    PHYSICAL_DEVICES = "10,11"


def mock_get_tensor_model_parallel_rank():
@@ -893,12 +893,3 @@ class TestMooncakeLayerwiseConnectorWorker(unittest.TestCase):
        worker.register_kv_caches(mla_caches)
        self.assertTrue(worker.use_mla)
        self.assertEqual(len(worker.block_len), 2)

    def test_device_id_selection_with_physical_devices(self):
        worker = MooncakeLayerwiseConnectorWorker(self.vllm_config,
                                                  self.engine_id)
        self.assertIsNotNone(worker.engine)


if __name__ == '__main__':
    unittest.main()
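Fakes such as `MockEnvsAscend` are typically swapped in for the real envs object with `unittest.mock.patch` so a test can pin configuration values. A minimal sketch under that assumption; the `connector` namespace and attribute name below are illustrative, not taken from the repository:

```python
import types
import unittest
from unittest.mock import patch


# A stand-in for the real envs module; values are pinned for the test.
class MockEnvsAscend:
    MOONCAKE_CONNECTOR_PROTOCOL = "mock_protocol"


# Hypothetical module-like object standing in for the code under test.
connector = types.SimpleNamespace(envs_ascend=None)


class ExampleTest(unittest.TestCase):
    def test_uses_mocked_envs(self):
        # patch.object swaps the attribute for the duration of the block.
        with patch.object(connector, "envs_ascend", MockEnvsAscend()):
            self.assertEqual(
                connector.envs_ascend.MOONCAKE_CONNECTOR_PROTOCOL,
                "mock_protocol")


if __name__ == "__main__":
    unittest.main()
```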
@@ -68,9 +68,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
    # that the correct package is installed.
    "VLLM_VERSION":
    lambda: os.getenv("VLLM_VERSION", None),
    # Whether to enable the trace recompiles from pytorch.
    "VLLM_ASCEND_TRACE_RECOMPILES":
    lambda: bool(int(os.getenv("VLLM_ASCEND_TRACE_RECOMPILES", '0'))),
    # Whether to enable fused_experts_allgather_ep. MoeInitRoutingV3 and
    # GroupedMatmulFinalizeRouting operators are combined to implement EP.
    "VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP":
@@ -86,16 +83,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
    # value to False to disable the optimized model.
    "USE_OPTIMIZED_MODEL":
    lambda: bool(int(os.getenv('USE_OPTIMIZED_MODEL', '1'))),
    # The tolerance of the kv cache size, if the difference between the
    # actual kv cache size and the cached kv cache size is less than this value,
    # then the cached kv cache size will be used.
    "VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE":
    lambda: int(
        os.getenv("VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE", 64)),
    # Whether to enable mla_pa for deepseek mla decode, this flag will be removed after its available torch_npu is public accessible
    # and the mla_pa will be the default path of deepseek decode path.
    "VLLM_ASCEND_MLA_PA":
    lambda: int(os.getenv("VLLM_ASCEND_MLA_PA", 0)),
    # Whether to enable MatmulAllReduce fusion kernel when tensor parallel is enabled.
    # this feature is supported in A2, and eager mode will get better performance.
    "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE":
@@ -130,10 +117,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
    # this feature in eager mode will get better performance.
    "VLLM_ASCEND_ENABLE_MLP_OPTIMIZE":
    lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLP_OPTIMIZE", '0'))),
    # Determine the number of physical devices in a non-full-use scenario
    # caused by the initialization of the Mooncake connector.
    "PHYSICAL_DEVICES":
    lambda: os.getenv("PHYSICAL_DEVICES", None),
    # Whether to enable msMonitor tool to monitor the performance of vllm-ascend.
    "MSMONITOR_USE_DAEMON":
    lambda: bool(int(os.getenv("MSMONITOR_USE_DAEMON", '0'))),
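Registries of this dict-of-lambdas shape are usually exposed lazily through a module-level `__getattr__`, so deleting an entry also removes the corresponding attribute. A minimal sketch of that pattern, assuming the same layout (not the actual vllm-ascend implementation):

```python
import os
from typing import Any, Callable, Dict

# Each entry is evaluated only when the name is accessed, so the value
# reflects os.environ at lookup time rather than at import time.
env_variables: Dict[str, Callable[[], Any]] = {
    "MSMONITOR_USE_DAEMON":
    lambda: bool(int(os.getenv("MSMONITOR_USE_DAEMON", '0'))),
}


def __getattr__(name: str) -> Any:
    # Module-level __getattr__ (PEP 562): names removed from the dict now
    # raise AttributeError instead of silently returning a default.
    if name in env_variables:
        return env_variables[name]()
    raise AttributeError(f"module has no attribute {name!r}")
```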