Fix Qwen3 MoE EPLB prefill bug (#6617)
This commit is contained in:
@@ -136,7 +136,7 @@ class ExpertLocationMetadata:
|
|||||||
num_physical_experts = common["num_physical_experts"]
|
num_physical_experts = common["num_physical_experts"]
|
||||||
|
|
||||||
phase = server_args.disaggregation_mode
|
phase = server_args.disaggregation_mode
|
||||||
if phase == "null":
|
if phase == "null" or model_config_for_expert_location.num_groups is None:
|
||||||
phase = "decode"
|
phase = "decode"
|
||||||
|
|
||||||
physical_to_logical_map, logical_to_all_physical_map, expert_count = (
|
physical_to_logical_map, logical_to_all_physical_map, expert_count = (
|
||||||
|
|||||||
@@ -12,6 +12,7 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
import logging
|
import logging
|
||||||
|
from datetime import timedelta
|
||||||
from typing import Dict, List, Tuple
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
@@ -340,7 +341,7 @@ def update_expert_weights_single_layer(
|
|||||||
reqs = torch.distributed.batch_isend_irecv(p2p_ops)
|
reqs = torch.distributed.batch_isend_irecv(p2p_ops)
|
||||||
try:
|
try:
|
||||||
for req in reqs:
|
for req in reqs:
|
||||||
req.wait(timeout=30)
|
req.wait(timeout=timedelta(seconds=30))
|
||||||
except RuntimeError:
|
except RuntimeError:
|
||||||
logger.error(
|
logger.error(
|
||||||
f"Context: {rank=} {old_physical_to_logical_map=} {new_physical_to_logical_map=} {num_local_physical_experts=} {num_gpu_per_node=}"
|
f"Context: {rank=} {old_physical_to_logical_map=} {new_physical_to_logical_map=} {num_local_physical_experts=} {num_gpu_per_node=}"
|
||||||
|
|||||||
Reference in New Issue
Block a user