[fix] remove cuda_device_count_stateless (#5060)

This commit is contained in:
JieXin Liang
2025-04-04 15:18:26 +08:00
committed by GitHub
parent 31035dda44
commit a995a773a0
3 changed files with 5 additions and 46 deletions

View File

@@ -18,7 +18,7 @@ from sglang.srt.distributed.device_communicators.custom_all_reduce_utils import
gpu_p2p_access_check,
)
from sglang.srt.distributed.parallel_state import in_the_same_node_as
-from sglang.srt.utils import cuda_device_count_stateless, is_cuda, is_hip
+from sglang.srt.utils import is_cuda, is_hip
logger = logging.getLogger(__name__)
@@ -217,7 +217,7 @@ class CustomAllreduce:
if cuda_visible_devices:
device_ids = list(map(int, cuda_visible_devices.split(",")))
else:
-device_ids = list(range(cuda_device_count_stateless()))
+device_ids = list(range(torch.cuda.device_count()))
physical_device_id = device_ids[device.index]
tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu")

View File

@@ -11,11 +11,11 @@ import tempfile
from itertools import product
from typing import Dict, List, Optional, Sequence
+import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from sglang.srt.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
-from sglang.srt.utils import cuda_device_count_stateless
logger = logging.getLogger(__name__)
@@ -218,7 +218,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
is_distributed = dist.is_initialized()
-num_dev = cuda_device_count_stateless()
+num_dev = torch.cuda.device_count()
cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
if cuda_visible_devices is None:
cuda_visible_devices = ",".join(str(i) for i in range(num_dev))