[Auto Sync] Update parallel_state.py, few_shot_gsm8k.py (20250903) (#9986)
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Leon Gao <leon.gao19@gmail.com>
This commit is contained in:
@@ -879,17 +879,16 @@ class GroupCoordinator:
|
|||||||
size_tensor = torch.tensor(
|
size_tensor = torch.tensor(
|
||||||
[object_tensor.numel()],
|
[object_tensor.numel()],
|
||||||
dtype=torch.long,
|
dtype=torch.long,
|
||||||
device=torch.cuda.current_device(),
|
device="cpu",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Send object size
|
# Send object size
|
||||||
torch.distributed.send(
|
torch.distributed.send(size_tensor, dst=self.ranks[dst], group=self.cpu_group)
|
||||||
size_tensor, dst=self.ranks[dst], group=self.device_group
|
|
||||||
)
|
|
||||||
|
|
||||||
# Send object
|
# Send object
|
||||||
torch.distributed.send(
|
torch.distributed.send(
|
||||||
object_tensor, dst=self.ranks[dst], group=self.device_group
|
object_tensor,
|
||||||
|
dst=self.ranks[dst],
|
||||||
|
group=self.device_group,
|
||||||
)
|
)
|
||||||
|
|
||||||
return None
|
return None
|
||||||
@@ -904,13 +903,11 @@ class GroupCoordinator:
|
|||||||
src != self.rank_in_group
|
src != self.rank_in_group
|
||||||
), "Invalid source rank. Source rank is the same as the current rank."
|
), "Invalid source rank. Source rank is the same as the current rank."
|
||||||
|
|
||||||
size_tensor = torch.empty(
|
size_tensor = torch.empty(1, dtype=torch.long, device="cpu")
|
||||||
1, dtype=torch.long, device=torch.cuda.current_device()
|
|
||||||
)
|
|
||||||
|
|
||||||
# Receive object size
|
# Receive object size
|
||||||
rank_size = torch.distributed.recv(
|
rank_size = torch.distributed.recv(
|
||||||
size_tensor, src=self.ranks[src], group=self.device_group
|
size_tensor, src=self.ranks[src], group=self.cpu_group
|
||||||
)
|
)
|
||||||
|
|
||||||
# Tensor to receive serialized objects into.
|
# Tensor to receive serialized objects into.
|
||||||
@@ -928,7 +925,7 @@ class GroupCoordinator:
|
|||||||
rank_object == rank_size
|
rank_object == rank_size
|
||||||
), "Received object sender rank does not match the size sender rank."
|
), "Received object sender rank does not match the size sender rank."
|
||||||
|
|
||||||
obj = pickle.loads(object_tensor.cpu().numpy().tobytes())
|
obj = pickle.loads(object_tensor.cpu().numpy())
|
||||||
|
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
|
|||||||
@@ -129,6 +129,7 @@ def run_eval(args):
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
"accuracy": acc,
|
"accuracy": acc,
|
||||||
|
"invalid": invalid,
|
||||||
"latency": latency,
|
"latency": latency,
|
||||||
"output_throughput": output_throughput,
|
"output_throughput": output_throughput,
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user