[CI] Fix OOM of deepseek-eplb nightly test. (#3884)
### What this PR does / why we need it?
Fix the OOM (out-of-memory) failure of the deepseek-eplb nightly test.
- vLLM version: v0.11.0rc3
- vLLM main:
83f478bb19
---------
Signed-off-by: offline0806 <3337230449@qq.com>
Co-authored-by: offline0806 <3337230449@qq.com>
This commit is contained in:
@@ -85,7 +85,7 @@ async def test_models(model: str, tp_size: int, dp_size: int) -> None:
|
|||||||
"--quantization", "ascend", "--gpu-memory-utilization", "0.9",
|
"--quantization", "ascend", "--gpu-memory-utilization", "0.9",
|
||||||
"--additional-config", '{"enable_weight_nz_layout":true, '
|
"--additional-config", '{"enable_weight_nz_layout":true, '
|
||||||
'"torch_air_graph_config":{"enabled": true, "enable_multistream_mla": true, "graph_batch_size": [16], "use_cached_graph": true},'
|
'"torch_air_graph_config":{"enabled": true, "enable_multistream_mla": true, "graph_batch_size": [16], "use_cached_graph": true},'
|
||||||
'"dynamic_eplb": true, "num_iterations_eplb_update": 1000, "num_wait_worker_iterations": 200, "init_redundancy_expert": 16}'
|
'"dynamic_eplb": true, "num_iterations_eplb_update": 1000, "num_wait_worker_iterations": 200'
|
||||||
]
|
]
|
||||||
request_keyword_args: dict[str, Any] = {
|
request_keyword_args: dict[str, Any] = {
|
||||||
**api_keyword_args,
|
**api_keyword_args,
|
||||||
|
|||||||
@@ -82,8 +82,7 @@ async def test_models(model: str, tp_size: int) -> None:
|
|||||||
"--quantization", "ascend", "--gpu-memory-utilization", "0.9",
|
"--quantization", "ascend", "--gpu-memory-utilization", "0.9",
|
||||||
"--additional-config",
|
"--additional-config",
|
||||||
'{"enable_weight_nz_layout":true, "dynamic_eplb": true, '
|
'{"enable_weight_nz_layout":true, "dynamic_eplb": true, '
|
||||||
'"num_iterations_eplb_update": 1000, "num_wait_worker_iterations": 200, '
|
'"num_iterations_eplb_update": 1000, "num_wait_worker_iterations": 200}'
|
||||||
'"init_redundancy_expert": 16}'
|
|
||||||
]
|
]
|
||||||
request_keyword_args: dict[str, Any] = {
|
request_keyword_args: dict[str, Any] = {
|
||||||
**api_keyword_args,
|
**api_keyword_args,
|
||||||
|
|||||||
@@ -126,7 +126,7 @@ class D2DExpertWeightLoader:
|
|||||||
local_expert_to_replace,
|
local_expert_to_replace,
|
||||||
buffer_tensor_id)
|
buffer_tensor_id)
|
||||||
|
|
||||||
logger.info(
|
logger.debug(
|
||||||
f"[EPLB] finished update expert weight for layer: {self.layer_id}")
|
f"[EPLB] finished update expert weight for layer: {self.layer_id}")
|
||||||
|
|
||||||
self.recv_expert_list = []
|
self.recv_expert_list = []
|
||||||
|
|||||||
@@ -77,6 +77,7 @@ class EplbUpdator:
|
|||||||
self.cur_iterations += 1
|
self.cur_iterations += 1
|
||||||
if self.cur_iterations == (self.num_iterations_eplb_update + \
|
if self.cur_iterations == (self.num_iterations_eplb_update + \
|
||||||
self.num_wait_worker_iterations + self.num_moe_layers):
|
self.num_wait_worker_iterations + self.num_moe_layers):
|
||||||
|
logger.info("Finish expert parallel load balancing.")
|
||||||
if self.expert_map_record_path is not None:
|
if self.expert_map_record_path is not None:
|
||||||
self.adaptor._export_tensor_to_file(
|
self.adaptor._export_tensor_to_file(
|
||||||
self.shared_dict["expert_maps"],
|
self.shared_dict["expert_maps"],
|
||||||
|
|||||||
Reference in New Issue
Block a user