fix: force synchronization between TP workers during update_weights (#6626)
Co-authored-by: dangkai.dk <dangkai.dk@alibaba-inc.com>
This commit is contained in:
@@ -2235,6 +2235,7 @@ class Scheduler(
|
|||||||
assert flash_cache_success, "Cache flush failed after updating weights"
|
assert flash_cache_success, "Cache flush failed after updating weights"
|
||||||
else:
|
else:
|
||||||
logger.error(message)
|
logger.error(message)
|
||||||
|
barrier(group=self.tp_cpu_group)
|
||||||
return UpdateWeightsFromTensorReqOutput(success, message)
|
return UpdateWeightsFromTensorReqOutput(success, message)
|
||||||
|
|
||||||
def get_weights_by_name(self, recv_req: GetWeightsByNameReqInput):
|
def get_weights_by_name(self, recv_req: GetWeightsByNameReqInput):
|
||||||
|
|||||||
Reference in New Issue
Block a user