Fix different device type adjustment in PP (#7760)

This commit is contained in:
Qiaolin Yu
2025-07-15 19:37:14 -07:00
committed by GitHub
parent 7498522f7d
commit 3bc43c683e
4 changed files with 25 additions and 27 deletions

View File

@@ -699,14 +699,14 @@ class GroupCoordinator:
         )
         # Serialize object to tensor and get the size as well
-        object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8).cuda(
-            device=torch.cuda.current_device()
+        object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8).to(
+            device=self.device
         )
         size_tensor = torch.tensor(
             [object_tensor.numel()],
             dtype=torch.long,
-            device=torch.cuda.current_device(),
+            device=self.device,
         )
         # Send object size
@@ -731,9 +731,7 @@ class GroupCoordinator:
             src != self.rank_in_group
         ), "Invalid source rank. Source rank is the same as the current rank."
-        size_tensor = torch.empty(
-            1, dtype=torch.long, device=torch.cuda.current_device()
-        )
+        size_tensor = torch.empty(1, dtype=torch.long, device=self.device)
         # Receive object size
         rank_size = torch.distributed.recv(
@@ -744,7 +742,7 @@ class GroupCoordinator:
         object_tensor = torch.empty(  # type: ignore[call-overload]
             size_tensor.item(),  # type: ignore[arg-type]
             dtype=torch.uint8,
-            device=torch.cuda.current_device(),
+            device=self.device,
         )
         rank_object = torch.distributed.recv(