fix: force synchronization between TP workers during update_weights (#6626)
Co-authored-by: dangkai.dk <dangkai.dk@alibaba-inc.com>
This commit is contained in:
@@ -2235,6 +2235,7 @@ class Scheduler(
|
||||
assert flash_cache_success, "Cache flush failed after updating weights"
|
||||
else:
|
||||
logger.error(message)
|
||||
barrier(group=self.tp_cpu_group)
|
||||
return UpdateWeightsFromTensorReqOutput(success, message)
|
||||
|
||||
def get_weights_by_name(self, recv_req: GetWeightsByNameReqInput):
|
||||
|
||||
Reference in New Issue
Block a user