fix: force synchronization between TP workers when updating weights (#6626)

Co-authored-by: dangkai.dk <dangkai.dk@alibaba-inc.com>
This commit is contained in:
DangKai
2025-06-25 16:35:59 +08:00
committed by GitHub
parent 3abc30364d
commit bc2e5645c4

View File

@@ -2235,6 +2235,7 @@ class Scheduler(
assert flash_cache_success, "Cache flush failed after updating weights"
else:
logger.error(message)
barrier(group=self.tp_cpu_group)
return UpdateWeightsFromTensorReqOutput(success, message)
def get_weights_by_name(self, recv_req: GetWeightsByNameReqInput):