forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
This commit is contained in:
0
vllm-v0.6.2/tests/distributed/__init__.py
Normal file
0
vllm-v0.6.2/tests/distributed/__init__.py
Normal file
59
vllm-v0.6.2/tests/distributed/test_ca_buffer_sharing.py
Normal file
59
vllm-v0.6.2/tests/distributed/test_ca_buffer_sharing.py
Normal file
@@ -0,0 +1,59 @@
|
||||
# can only run on machines with p2p access across GPUs
|
||||
# can only run with torchrun:
|
||||
# torchrun --nproc_per_node=2 tests/distributed/test_ca_buffer_sharing.py
|
||||
|
||||
import ctypes
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
|
||||
from vllm.distributed.device_communicators.custom_all_reduce import ( # noqa
|
||||
CustomAllreduce)
|
||||
|
||||
# create a cpu process group for communicating metadata (ipc handle)
|
||||
dist.init_process_group(backend="gloo")
|
||||
rank = local_rank = dist.get_rank()
|
||||
world_size = dist.get_world_size()
|
||||
|
||||
# every process sets its own device (differently)
|
||||
lib = CudaRTLibrary()
|
||||
lib.cudaSetDevice(rank)
|
||||
|
||||
buffer_size_in_bytes = 1024
|
||||
byte_value = 2 # the value we write to the buffer for verification
|
||||
|
||||
pointers = CustomAllreduce.create_shared_buffer(buffer_size_in_bytes)
|
||||
|
||||
print(f"Rank {rank} has pointers {pointers}")
|
||||
|
||||
dist.barrier()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
if rank == 0:
|
||||
# the first rank tries to write to all buffers
|
||||
for p in pointers:
|
||||
pointer = ctypes.c_void_p(p)
|
||||
lib.cudaMemset(pointer, byte_value, buffer_size_in_bytes)
|
||||
|
||||
dist.barrier()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
host_data = (ctypes.c_char * buffer_size_in_bytes)()
|
||||
|
||||
# all ranks read from all buffers, and check if the data is correct
|
||||
for p in pointers:
|
||||
pointer = ctypes.c_void_p(p)
|
||||
lib.cudaMemcpy(host_data, pointer, buffer_size_in_bytes)
|
||||
for i in range(buffer_size_in_bytes):
|
||||
assert ord(host_data[i]) == byte_value, (
|
||||
f"Rank {rank} failed"
|
||||
f" to verify buffer {p}. Expected {byte_value}, "
|
||||
f"got {ord(host_data[i])}")
|
||||
|
||||
print(f"Rank {rank} verified all buffers")
|
||||
|
||||
dist.barrier()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
CustomAllreduce.free_shared_buffer(pointers)
|
||||
200
vllm-v0.6.2/tests/distributed/test_comm_ops.py
Normal file
200
vllm-v0.6.2/tests/distributed/test_comm_ops.py
Normal file
@@ -0,0 +1,200 @@
|
||||
"""Test the communication operators.
|
||||
|
||||
Run `pytest tests/distributed/test_comm_ops.py`.
|
||||
"""
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
import torch
|
||||
|
||||
from vllm.distributed import (broadcast_tensor_dict, get_pp_group,
|
||||
tensor_model_parallel_all_gather,
|
||||
tensor_model_parallel_all_reduce)
|
||||
|
||||
from ..utils import init_test_distributed_environment, multi_process_parallel
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
|
||||
distributed_init_port: str):
|
||||
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
|
||||
# so that each worker can see all the GPUs
|
||||
# they will be able to set the device to the correct GPU
|
||||
del os.environ["MLU_VISIBLE_DEVICES"]
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
num_elements = 8
|
||||
all_tensors = [
|
||||
torch.arange(num_elements, dtype=torch.float32, device="cuda") *
|
||||
(r + 1) for r in range(tp_size)
|
||||
]
|
||||
expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
|
||||
t = all_tensors[rank % tp_size]
|
||||
t = tensor_model_parallel_all_reduce(t)
|
||||
torch.testing.assert_close(t, expected)
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
|
||||
distributed_init_port: str):
|
||||
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
|
||||
# so that each worker can see all the GPUs
|
||||
# they will be able to set the device to the correct GPU
|
||||
del os.environ["MLU_VISIBLE_DEVICES"]
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
num_dimensions = 3
|
||||
tensor_size = list(range(2, num_dimensions + 2))
|
||||
total_size = 1
|
||||
for s in tensor_size:
|
||||
total_size *= s
|
||||
for all_gather_dimension in range(num_dimensions):
|
||||
all_tensors = [
|
||||
torch.arange(total_size, dtype=torch.float32,
|
||||
device="cuda").reshape(tensor_size) * (r + 1)
|
||||
for r in range(tp_size)
|
||||
]
|
||||
expected = torch.cat(all_tensors, dim=all_gather_dimension)
|
||||
t = all_tensors[rank % tp_size]
|
||||
t = tensor_model_parallel_all_gather(t, all_gather_dimension)
|
||||
torch.testing.assert_close(t, expected)
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
|
||||
distributed_init_port: str):
|
||||
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
|
||||
# so that each worker can see all the GPUs
|
||||
# they will be able to set the device to the correct GPU
|
||||
del os.environ["MLU_VISIBLE_DEVICES"]
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
test_dict = {
|
||||
# device tensor
|
||||
"a": torch.arange(8, dtype=torch.float32, device="cuda"),
|
||||
# CPU tensor
|
||||
"b": torch.arange(16, dtype=torch.int8, device="cpu"),
|
||||
"c": "test",
|
||||
"d": [1, 2, 3],
|
||||
"e": {
|
||||
"a": 1,
|
||||
"b": 2
|
||||
},
|
||||
# empty tensor
|
||||
"f": torch.tensor([], dtype=torch.float32, device="cuda"),
|
||||
}
|
||||
|
||||
if (rank % tp_size) == 0:
|
||||
broadcast_tensor_dict(test_dict, src=0)
|
||||
else:
|
||||
recv_dict = broadcast_tensor_dict(src=0)
|
||||
assert len(recv_dict) == len(test_dict)
|
||||
torch.testing.assert_close(recv_dict["a"], test_dict["a"])
|
||||
torch.testing.assert_close(recv_dict["b"], test_dict["b"])
|
||||
assert recv_dict["c"] == test_dict["c"]
|
||||
assert recv_dict["d"] == test_dict["d"]
|
||||
assert recv_dict["e"] == test_dict["e"]
|
||||
torch.testing.assert_close(recv_dict["f"], test_dict["f"])
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
|
||||
distributed_init_port: str):
|
||||
del os.environ["MLU_VISIBLE_DEVICES"]
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
|
||||
test_dict = {
|
||||
# device tensor
|
||||
"a": torch.arange(8, dtype=torch.float32, device="cuda"),
|
||||
# CPU tensor
|
||||
"b": torch.arange(16, dtype=torch.int8, device="cpu"),
|
||||
"c": "test",
|
||||
"d": [1, 2, 3],
|
||||
"e": {
|
||||
"a": 1,
|
||||
"b": 2
|
||||
},
|
||||
# empty tensor
|
||||
"f": torch.tensor([], dtype=torch.float32, device="cuda"),
|
||||
}
|
||||
|
||||
if not get_pp_group().is_first_rank:
|
||||
recv_dict = get_pp_group().recv_tensor_dict()
|
||||
|
||||
if not get_pp_group().is_last_rank:
|
||||
get_pp_group().send_tensor_dict(test_dict)
|
||||
|
||||
if not get_pp_group().is_first_rank:
|
||||
assert len(recv_dict) == len(test_dict)
|
||||
torch.testing.assert_close(recv_dict["a"], test_dict["a"])
|
||||
torch.testing.assert_close(recv_dict["b"], test_dict["b"])
|
||||
assert recv_dict["c"] == test_dict["c"]
|
||||
assert recv_dict["d"] == test_dict["d"]
|
||||
assert recv_dict["e"] == test_dict["e"]
|
||||
torch.testing.assert_close(recv_dict["f"], test_dict["f"])
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
|
||||
distributed_init_port: str):
|
||||
del os.environ["MLU_VISIBLE_DEVICES"]
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
|
||||
size = 64
|
||||
test_tensor = torch.arange(64, dtype=torch.float32, device="cuda")
|
||||
|
||||
if not get_pp_group().is_first_rank:
|
||||
recv_tensor = get_pp_group().recv(size, dtype=torch.float32)
|
||||
|
||||
if not get_pp_group().is_last_rank:
|
||||
get_pp_group().send(test_tensor)
|
||||
|
||||
if not get_pp_group().is_first_rank:
|
||||
torch.testing.assert_close(test_tensor, recv_tensor)
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.parametrize("tp_size", [2])
|
||||
@pytest.mark.parametrize("test_target", [
|
||||
all_reduce_test_worker, all_gather_test_worker,
|
||||
broadcast_tensor_dict_test_worker
|
||||
])
|
||||
def test_multi_process_tensor_parallel(tp_size, test_target):
|
||||
multi_process_parallel(tp_size, 1, test_target)
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.parametrize("pp_size", [2])
|
||||
@pytest.mark.parametrize(
|
||||
"test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
|
||||
def test_multi_process_pipeline_parallel(pp_size, test_target):
|
||||
multi_process_parallel(1, pp_size, test_target)
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
||||
reason="Need at least 4 GPUs to run the test.")
|
||||
@pytest.mark.parametrize("tp_size", [2])
|
||||
@pytest.mark.parametrize("pp_size", [2])
|
||||
@pytest.mark.parametrize("test_target", [
|
||||
send_recv_test_worker, send_recv_tensor_dict_test_worker,
|
||||
all_reduce_test_worker, all_gather_test_worker,
|
||||
broadcast_tensor_dict_test_worker
|
||||
])
|
||||
def test_multi_process_tensor_parallel_pipeline_parallel(
|
||||
tp_size, pp_size, test_target):
|
||||
multi_process_parallel(tp_size, pp_size, test_target)
|
||||
115
vllm-v0.6.2/tests/distributed/test_custom_all_reduce.py
Normal file
115
vllm-v0.6.2/tests/distributed/test_custom_all_reduce.py
Normal file
@@ -0,0 +1,115 @@
|
||||
import os
|
||||
import random
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
from vllm.distributed.communication_op import ( # noqa
|
||||
tensor_model_parallel_all_reduce)
|
||||
from vllm.distributed.parallel_state import (get_tensor_model_parallel_group,
|
||||
get_tp_group, graph_capture)
|
||||
|
||||
from ..utils import (ensure_model_parallel_initialized,
|
||||
init_test_distributed_environment, multi_process_parallel)
|
||||
|
||||
random.seed(42)
|
||||
test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)]
|
||||
for i, v in enumerate(test_sizes):
|
||||
test_sizes[i] -= v % 8
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
|
||||
del os.environ["CUDA_VISIBLE_DEVICES"]
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
ensure_model_parallel_initialized(tp_size, pp_size)
|
||||
group = get_tensor_model_parallel_group().device_group
|
||||
|
||||
# A small all_reduce for warmup.
|
||||
# this is needed because device communicators might be created lazily
|
||||
# (e.g. NCCL). This will ensure that the communicator is initialized
|
||||
# before any communication happens, so that this group can be used for
|
||||
# graph capture immediately.
|
||||
data = torch.zeros(1)
|
||||
data = data.to(device=device)
|
||||
torch.distributed.all_reduce(data, group=group)
|
||||
torch.cuda.synchronize()
|
||||
del data
|
||||
|
||||
# we use the first group to communicate once
|
||||
# and the second group to communicate twice
|
||||
# and so on
|
||||
# this is used to demonstrate that each group can
|
||||
# communicate independently
|
||||
num_communication = rank // tp_size + 1
|
||||
|
||||
for sz in test_sizes:
|
||||
for dtype in [torch.float32, torch.float16, torch.bfloat16]:
|
||||
with graph_capture() as graph_capture_context:
|
||||
# use integers so result matches NCCL exactly
|
||||
inp1 = torch.randint(1,
|
||||
16, (sz, ),
|
||||
dtype=dtype,
|
||||
device=torch.cuda.current_device())
|
||||
inp2 = torch.randint(1,
|
||||
16, (sz, ),
|
||||
dtype=dtype,
|
||||
device=torch.cuda.current_device())
|
||||
torch.cuda.synchronize()
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(graph,
|
||||
stream=graph_capture_context.stream):
|
||||
for i in range(num_communication):
|
||||
out1 = tensor_model_parallel_all_reduce(inp1)
|
||||
# the input buffer is immediately modified to test
|
||||
# synchronization
|
||||
dist.all_reduce(inp1, group=group)
|
||||
out2 = tensor_model_parallel_all_reduce(inp2)
|
||||
dist.all_reduce(inp2, group=group)
|
||||
graph.replay()
|
||||
torch.testing.assert_close(out1, inp1)
|
||||
torch.testing.assert_close(out2, inp2)
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
|
||||
del os.environ["CUDA_VISIBLE_DEVICES"]
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
|
||||
# we use the first group to communicate once
|
||||
# and the second group to communicate twice
|
||||
# and so on
|
||||
# this is used to demonstrate that each group can
|
||||
# communicate independently
|
||||
num_communication = rank // tp_size + 1
|
||||
sz = 1024
|
||||
fa = get_tp_group().ca_comm
|
||||
inp = torch.ones(sz, dtype=torch.float32, device=device)
|
||||
out = inp
|
||||
for _ in range(num_communication):
|
||||
out = fa.all_reduce(out, registered=False)
|
||||
torch.testing.assert_close(out, inp * (tp_size**num_communication))
|
||||
|
||||
inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
|
||||
out = inp
|
||||
for _ in range(num_communication):
|
||||
out = fa.all_reduce(out, registered=False)
|
||||
torch.testing.assert_close(out, inp * (tp_size**num_communication))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp_size", [2])
|
||||
@pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
|
||||
@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
|
||||
def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target):
|
||||
world_size = tp_size * pipeline_parallel_size
|
||||
if world_size > torch.cuda.device_count():
|
||||
pytest.skip("Not enough GPUs to run the test.")
|
||||
multi_process_parallel(tp_size, pipeline_parallel_size, test_target)
|
||||
6
vllm-v0.6.2/tests/distributed/test_distributed_oot.py
Normal file
6
vllm-v0.6.2/tests/distributed/test_distributed_oot.py
Normal file
@@ -0,0 +1,6 @@
|
||||
from ..entrypoints.openai.test_oot_registration import (
|
||||
run_and_test_dummy_opt_api_server)
|
||||
|
||||
|
||||
def test_distributed_oot(dummy_opt_path: str):
|
||||
run_and_test_dummy_opt_api_server(dummy_opt_path, tp=2)
|
||||
64
vllm-v0.6.2/tests/distributed/test_multi_node_assignment.py
Normal file
64
vllm-v0.6.2/tests/distributed/test_multi_node_assignment.py
Normal file
@@ -0,0 +1,64 @@
|
||||
"""Make sure ray assigns GPU workers to the correct node.
|
||||
|
||||
Run:
|
||||
```sh
|
||||
cd $VLLM_PATH/tests
|
||||
|
||||
pytest distributed/test_multi_node_assignment.py
|
||||
```
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
|
||||
|
||||
from vllm import initialize_ray_cluster
|
||||
from vllm.config import ParallelConfig
|
||||
from vllm.executor.ray_utils import _wait_until_pg_removed
|
||||
from vllm.utils import get_ip
|
||||
|
||||
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
|
||||
|
||||
|
||||
@pytest.mark.skipif(not VLLM_MULTI_NODE,
|
||||
reason="Need at least 2 nodes to run the test.")
|
||||
def test_multi_node_assignment() -> None:
|
||||
|
||||
# NOTE: important to keep this class definition here
|
||||
# to let ray use cloudpickle to serialize it.
|
||||
class Actor:
|
||||
|
||||
def get_ip(self):
|
||||
return get_ip()
|
||||
|
||||
for _ in range(10):
|
||||
config = ParallelConfig(1, 2)
|
||||
initialize_ray_cluster(config)
|
||||
|
||||
current_ip = get_ip()
|
||||
workers = []
|
||||
for bundle_id, bundle in enumerate(
|
||||
config.placement_group.bundle_specs):
|
||||
if not bundle.get("GPU", 0):
|
||||
continue
|
||||
scheduling_strategy = PlacementGroupSchedulingStrategy(
|
||||
placement_group=config.placement_group,
|
||||
placement_group_capture_child_tasks=True,
|
||||
placement_group_bundle_index=bundle_id,
|
||||
)
|
||||
|
||||
worker = ray.remote(
|
||||
num_cpus=0,
|
||||
num_gpus=1,
|
||||
scheduling_strategy=scheduling_strategy,
|
||||
)(Actor).remote()
|
||||
worker_ip = ray.get(worker.get_ip.remote())
|
||||
assert worker_ip == current_ip
|
||||
workers.append(worker)
|
||||
|
||||
for worker in workers:
|
||||
ray.kill(worker)
|
||||
|
||||
_wait_until_pg_removed(config.placement_group)
|
||||
414
vllm-v0.6.2/tests/distributed/test_pipeline_parallel.py
Normal file
414
vllm-v0.6.2/tests/distributed/test_pipeline_parallel.py
Normal file
@@ -0,0 +1,414 @@
|
||||
"""
|
||||
WARNING: This test runs in both single-node (4 GPUs) and multi-node
|
||||
(2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
|
||||
important to set the distributed backend to "mp" to avoid Ray scheduling
|
||||
all workers in a node other than the head node, which can cause the test
|
||||
to fail.
|
||||
"""
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Literal, NamedTuple, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import TaskOption
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from ..utils import compare_two_settings, fork_new_process_for_each_test
|
||||
|
||||
logger = init_logger("test_pipeline_parallel")
|
||||
|
||||
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
|
||||
|
||||
|
||||
class ParallelSetup(NamedTuple):
|
||||
tp_size: int
|
||||
pp_size: int
|
||||
eager_mode: bool
|
||||
chunked_prefill: bool
|
||||
|
||||
|
||||
class PPTestOptions(NamedTuple):
|
||||
multi_node_only: bool
|
||||
trust_remote_code: bool
|
||||
tokenizer_mode: Optional[str]
|
||||
load_format: Optional[str] = None
|
||||
hf_overrides: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class PPTestSettings:
|
||||
parallel_setups: List[ParallelSetup]
|
||||
distributed_backends: List[str]
|
||||
task: TaskOption
|
||||
test_options: PPTestOptions
|
||||
|
||||
@staticmethod
|
||||
def detailed(
|
||||
*,
|
||||
tp_base: int = 1,
|
||||
pp_base: int = 2,
|
||||
multi_node_only: bool = False,
|
||||
task: TaskOption = "auto",
|
||||
trust_remote_code: bool = False,
|
||||
tokenizer_mode: Optional[str] = None,
|
||||
load_format: Optional[str] = None,
|
||||
hf_overrides: Optional[str] = None,
|
||||
):
|
||||
return PPTestSettings(
|
||||
parallel_setups=[
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
pp_size=pp_base,
|
||||
eager_mode=False,
|
||||
chunked_prefill=False),
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
pp_size=2 * pp_base,
|
||||
eager_mode=False,
|
||||
chunked_prefill=True),
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
pp_size=2 * pp_base,
|
||||
eager_mode=True,
|
||||
chunked_prefill=False),
|
||||
ParallelSetup(tp_size=2 * tp_base,
|
||||
pp_size=pp_base,
|
||||
eager_mode=False,
|
||||
chunked_prefill=True),
|
||||
ParallelSetup(tp_size=2 * tp_base,
|
||||
pp_size=pp_base,
|
||||
eager_mode=True,
|
||||
chunked_prefill=False),
|
||||
],
|
||||
distributed_backends=["mp", "ray"],
|
||||
task=task,
|
||||
test_options=PPTestOptions(multi_node_only=multi_node_only,
|
||||
trust_remote_code=trust_remote_code,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
load_format=load_format,
|
||||
hf_overrides=hf_overrides),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def fast(
|
||||
*,
|
||||
tp_base: int = 1,
|
||||
pp_base: int = 2,
|
||||
task: TaskOption = "auto",
|
||||
multi_node_only: bool = False,
|
||||
trust_remote_code: bool = False,
|
||||
tokenizer_mode: Optional[str] = None,
|
||||
load_format: Optional[str] = None,
|
||||
hf_overrides: Optional[str] = None,
|
||||
):
|
||||
return PPTestSettings(
|
||||
parallel_setups=[
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
pp_size=pp_base,
|
||||
eager_mode=True,
|
||||
chunked_prefill=False),
|
||||
],
|
||||
distributed_backends=["mp"],
|
||||
task=task,
|
||||
test_options=PPTestOptions(multi_node_only=multi_node_only,
|
||||
trust_remote_code=trust_remote_code,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
load_format=load_format,
|
||||
hf_overrides=hf_overrides),
|
||||
)
|
||||
|
||||
def iter_params(self, model_name: str):
|
||||
opts = self.test_options
|
||||
|
||||
for parallel_setup in self.parallel_setups:
|
||||
for distributed_backend in self.distributed_backends:
|
||||
yield (model_name, parallel_setup, distributed_backend,
|
||||
self.task, opts)
|
||||
|
||||
|
||||
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
|
||||
# The values displayed here are only a rough indicator of the size of the model
|
||||
|
||||
# yapf: disable
|
||||
TEXT_GENERATION_MODELS = {
|
||||
# [Decoder-only]
|
||||
# Uses Llama
|
||||
# "BAAI/AquilaChat-7B": PPTestSettings.fast(),
|
||||
"Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(tp_base=8, trust_remote_code=True), # noqa: E501
|
||||
"baichuan-inc/Baichuan-7B": PPTestSettings.fast(trust_remote_code=True),
|
||||
"baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
|
||||
"bigscience/bloomz-1b1": PPTestSettings.fast(),
|
||||
"THUDM/chatglm3-6b": PPTestSettings.fast(trust_remote_code=True),
|
||||
"CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(tp_base=2, trust_remote_code=True), # noqa: E501
|
||||
"databricks/dbrx-instruct": PPTestSettings.fast(tp_base=8),
|
||||
"Deci/DeciLM-7B-instruct": PPTestSettings.fast(trust_remote_code=True),
|
||||
"deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
|
||||
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
|
||||
"tiiuae/falcon-7b": PPTestSettings.fast(),
|
||||
"google/gemma-2b": PPTestSettings.fast(),
|
||||
"google/gemma-2-9b": PPTestSettings.fast(),
|
||||
"gpt2": PPTestSettings.fast(),
|
||||
"bigcode/starcoder": PPTestSettings.fast(),
|
||||
"EleutherAI/gpt-j-6b": PPTestSettings.fast(),
|
||||
"EleutherAI/pythia-12b": PPTestSettings.fast(),
|
||||
"ibm/PowerLM-3b": PPTestSettings.fast(),
|
||||
"ibm/PowerMoE-3b": PPTestSettings.fast(),
|
||||
# Uses Llama
|
||||
# "internlm/internlm-chat-7b": PPTestSettings.fast(),
|
||||
"internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True),
|
||||
"inceptionai/jais-13b-chat": PPTestSettings.fast(),
|
||||
# TODO: Implement PP
|
||||
# "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(),
|
||||
"meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
|
||||
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True),
|
||||
"openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True),
|
||||
# Uses Llama
|
||||
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
|
||||
"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4),
|
||||
"mosaicml/mpt-7b": PPTestSettings.fast(),
|
||||
"nvidia/Minitron-8B-Base": PPTestSettings.fast(),
|
||||
"allenai/OLMo-1B-hf": PPTestSettings.fast(),
|
||||
"allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
|
||||
"facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
|
||||
"OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
|
||||
"adept/persimmon-8b-chat": PPTestSettings.fast(),
|
||||
"microsoft/phi-2": PPTestSettings.fast(),
|
||||
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
|
||||
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True, load_format="dummy", hf_overrides='{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'), # noqa: E501
|
||||
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
|
||||
"Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(),
|
||||
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
|
||||
"stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
|
||||
"bigcode/starcoder2-3b": PPTestSettings.fast(),
|
||||
"upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2),
|
||||
# FIXME: Cannot load tokenizer in latest transformers version.
|
||||
# Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
|
||||
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
|
||||
# [Encoder-only]
|
||||
# TODO: Implement PP
|
||||
# "facebook/bart-base": PPTestSettings.fast(),
|
||||
}
|
||||
|
||||
EMBEDDING_MODELS = { # type: ignore[var-annotated]
|
||||
# [Text-only]
|
||||
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
|
||||
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
|
||||
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True), # noqa: E501
|
||||
}
|
||||
|
||||
MULTIMODAL_MODELS = {
|
||||
# [Decoder-only]
|
||||
"Salesforce/blip2-opt-2.7b": PPTestSettings.fast(),
|
||||
"facebook/chameleon-7b": PPTestSettings.fast(),
|
||||
"adept/fuyu-8b": PPTestSettings.fast(),
|
||||
"THUDM/glm-4v-9b": PPTestSettings.fast(trust_remote_code=True),
|
||||
"OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True),
|
||||
"llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
|
||||
"llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
|
||||
"llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
|
||||
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
|
||||
"openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True),
|
||||
"allenai/Molmo-7B-D-0924": PPTestSettings.fast(trust_remote_code=True),
|
||||
"microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
|
||||
"mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"), # noqa: E501
|
||||
"Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
|
||||
"Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
|
||||
"Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
|
||||
"fixie-ai/ultravox-v0_3": PPTestSettings.fast(),
|
||||
# [Encoder-decoder]
|
||||
# TODO: Implement PP
|
||||
# "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
|
||||
}
|
||||
# yapf: enable
|
||||
|
||||
# NOTE: You can update this on your local machine to run specific tests
|
||||
TEST_MODELS = [
|
||||
# [LANGUAGE GENERATION]
|
||||
# "microsoft/Phi-3.5-MoE-instruct",
|
||||
"meta-llama/Meta-Llama-3-8B",
|
||||
# "ibm/PowerLM-3b",
|
||||
# [LANGUAGE EMBEDDING]
|
||||
# "intfloat/e5-mistral-7b-instruct",
|
||||
# "BAAI/bge-multilingual-gemma2",
|
||||
# [MULTIMODAL GENERATION]
|
||||
# "OpenGVLab/InternVL2-1B",
|
||||
# "microsoft/Phi-3-vision-128k-instruct",
|
||||
# "fixie-ai/ultravox-v0_3",
|
||||
]
|
||||
|
||||
|
||||
def _compare_tp(
|
||||
model_name: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
task: TaskOption,
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available: int,
|
||||
*,
|
||||
method: Literal["generate", "encode"],
|
||||
):
|
||||
tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup
|
||||
multi_node_only, trust_remote_code, tokenizer_mode, \
|
||||
load_format, hf_overrides = test_options
|
||||
|
||||
if num_gpus_available < tp_size * pp_size:
|
||||
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
|
||||
if VLLM_MULTI_NODE and distributed_backend == "mp":
|
||||
pytest.skip("Skipping multi-node pipeline parallel test for "
|
||||
"multiprocessing distributed backend")
|
||||
if multi_node_only and not VLLM_MULTI_NODE:
|
||||
pytest.skip("Not in multi-node setting")
|
||||
|
||||
common_args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"float16",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"--max-num-seqs",
|
||||
"8",
|
||||
]
|
||||
if chunked_prefill:
|
||||
common_args.append("--enable-chunked-prefill")
|
||||
if eager_mode:
|
||||
common_args.append("--enforce-eager")
|
||||
if task != "auto":
|
||||
common_args.extend(["--task", task])
|
||||
if trust_remote_code:
|
||||
common_args.append("--trust-remote-code")
|
||||
if tokenizer_mode:
|
||||
common_args.extend(["--tokenizer-mode", tokenizer_mode])
|
||||
if load_format:
|
||||
common_args.extend(["--load-format", load_format])
|
||||
if hf_overrides:
|
||||
common_args.extend(["--hf-overrides", hf_overrides])
|
||||
|
||||
if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2
|
||||
and chunked_prefill):
|
||||
# Test Ray ADAG for a subset of the tests
|
||||
pp_env = {
|
||||
"VLLM_USE_RAY_COMPILED_DAG": "1",
|
||||
"VLLM_USE_RAY_SPMD_WORKER": "1",
|
||||
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
|
||||
}
|
||||
# Temporary. Currently when zeromq + SPMD is used, it does not properly
|
||||
# terminate because of aDAG issue.
|
||||
common_args.append("--disable-frontend-multiprocessing")
|
||||
else:
|
||||
pp_env = None
|
||||
|
||||
pp_args = [
|
||||
*common_args,
|
||||
"--pipeline-parallel-size",
|
||||
str(pp_size),
|
||||
"--tensor-parallel-size",
|
||||
str(tp_size),
|
||||
"--distributed-executor-backend",
|
||||
distributed_backend,
|
||||
]
|
||||
|
||||
# compare without pipeline parallelism
|
||||
# NOTE: use mp backend for TP
|
||||
# PP tests might involve multiple nodes, and ray might
|
||||
# schedule all workers in a node other than the head node,
|
||||
# which can cause the test to fail.
|
||||
tp_args = [
|
||||
*common_args,
|
||||
"--tensor-parallel-size",
|
||||
str(tp_size),
|
||||
"--distributed-executor-backend",
|
||||
"mp",
|
||||
]
|
||||
|
||||
try:
|
||||
compare_two_settings(model_name,
|
||||
pp_args,
|
||||
tp_args,
|
||||
pp_env,
|
||||
method=method)
|
||||
except Exception:
|
||||
if pp_env is None:
|
||||
raise
|
||||
else:
|
||||
# Ray ADAG tests are flaky, so we don't want to fail the test
|
||||
logger.exception("Ray ADAG tests failed")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_name", "parallel_setup", "distributed_backend", "task",
|
||||
"test_options"),
|
||||
[
|
||||
params for model_name, settings in TEXT_GENERATION_MODELS.items()
|
||||
for params in settings.iter_params(model_name)
|
||||
if model_name in TEST_MODELS
|
||||
],
|
||||
)
|
||||
@fork_new_process_for_each_test
|
||||
def test_tp_language_generation(
|
||||
model_name: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
task: TaskOption,
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
_compare_tp(model_name,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
task,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_name", "parallel_setup", "distributed_backend", "task",
|
||||
"test_options"),
|
||||
[
|
||||
params for model_name, settings in EMBEDDING_MODELS.items()
|
||||
for params in settings.iter_params(model_name)
|
||||
if model_name in TEST_MODELS
|
||||
],
|
||||
)
|
||||
@fork_new_process_for_each_test
|
||||
def test_tp_language_embedding(
|
||||
model_name: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
task: TaskOption,
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
_compare_tp(model_name,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
task,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="encode")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_name", "parallel_setup", "distributed_backend", "task",
|
||||
"test_options"),
|
||||
[
|
||||
params for model_name, settings in MULTIMODAL_MODELS.items()
|
||||
for params in settings.iter_params(model_name)
|
||||
if model_name in TEST_MODELS
|
||||
],
|
||||
)
|
||||
@fork_new_process_for_each_test
|
||||
def test_tp_multimodal_generation(
|
||||
model_name: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
task: TaskOption,
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
_compare_tp(model_name,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
task,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate")
|
||||
34
vllm-v0.6.2/tests/distributed/test_pipeline_partition.py
Normal file
34
vllm-v0.6.2/tests/distributed/test_pipeline_partition.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.distributed.utils import get_pp_indices
|
||||
|
||||
|
||||
def test_custom_layer_partition():
|
||||
|
||||
def _verify(partition_str, num_layers, pp_size, goldens):
|
||||
bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
|
||||
os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str
|
||||
for pp_rank, golden in enumerate(goldens):
|
||||
assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
|
||||
if bak is not None:
|
||||
os.environ["VLLM_PP_LAYER_PARTITION"] = bak
|
||||
|
||||
# Even partition
|
||||
_verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
# Balanced partition
|
||||
_verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
|
||||
# Put reminder somewhere
|
||||
_verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
|
||||
# Invalid partition strings
|
||||
with pytest.raises(ValueError):
|
||||
_verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
with pytest.raises(ValueError):
|
||||
_verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
# Wrong number of partitions
|
||||
with pytest.raises(ValueError):
|
||||
_verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
# Wrong number of layers
|
||||
with pytest.raises(ValueError):
|
||||
_verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
30
vllm-v0.6.2/tests/distributed/test_pp_cudagraph.py
Normal file
30
vllm-v0.6.2/tests/distributed/test_pp_cudagraph.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from ..utils import compare_two_settings, fork_new_process_for_each_test
|
||||
|
||||
|
||||
@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
|
||||
(2, "JackFram/llama-160m"),
|
||||
])
|
||||
@pytest.mark.parametrize("ATTN_BACKEND", [
|
||||
"FLASH_ATTN",
|
||||
"FLASHINFER",
|
||||
])
|
||||
@fork_new_process_for_each_test
|
||||
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
|
||||
cudagraph_args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"float16",
|
||||
"--pipeline-parallel-size",
|
||||
str(PP_SIZE),
|
||||
"--distributed-executor-backend",
|
||||
"mp",
|
||||
]
|
||||
os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND
|
||||
|
||||
eager_args = cudagraph_args + ["--enforce-eager"]
|
||||
|
||||
compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
|
||||
241
vllm-v0.6.2/tests/distributed/test_pynccl.py
Normal file
241
vllm-v0.6.2/tests/distributed/test_pynccl.py
Normal file
@@ -0,0 +1,241 @@
|
||||
import multiprocessing
|
||||
import os
|
||||
from typing import Dict, List
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.distributed
|
||||
|
||||
from vllm.distributed.communication_op import ( # noqa
|
||||
tensor_model_parallel_all_reduce)
|
||||
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
|
||||
from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary
|
||||
from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
|
||||
get_world_group, graph_capture,
|
||||
init_distributed_environment)
|
||||
from vllm.utils import update_environment_variables
|
||||
|
||||
|
||||
def distributed_run(fn, world_size):
|
||||
number_of_processes = world_size
|
||||
processes: List[multiprocessing.Process] = []
|
||||
for i in range(number_of_processes):
|
||||
env: Dict[str, str] = {}
|
||||
env['RANK'] = str(i)
|
||||
env['LOCAL_RANK'] = str(i)
|
||||
env['WORLD_SIZE'] = str(number_of_processes)
|
||||
env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
|
||||
env['MASTER_ADDR'] = 'localhost'
|
||||
env['MASTER_PORT'] = '12345'
|
||||
p = multiprocessing.Process(target=fn, args=(env, ))
|
||||
processes.append(p)
|
||||
p.start()
|
||||
|
||||
for p in processes:
|
||||
p.join()
|
||||
|
||||
for p in processes:
|
||||
assert p.exitcode == 0
|
||||
|
||||
|
||||
def worker_fn_wrapper(fn):
|
||||
# `multiprocessing.Process` cannot accept environment variables directly
|
||||
# so we need to pass the environment variables as arguments
|
||||
# and update the environment variables in the function
|
||||
def wrapped_fn(env):
|
||||
update_environment_variables(env)
|
||||
local_rank = os.environ['LOCAL_RANK']
|
||||
device = torch.device(f"cuda:{local_rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_distributed_environment()
|
||||
fn()
|
||||
|
||||
return wrapped_fn
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def worker_fn():
|
||||
pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
|
||||
device=get_world_group().device)
|
||||
tensor = torch.ones(16, 1024, 1024,
|
||||
dtype=torch.float32).cuda(pynccl_comm.rank)
|
||||
with pynccl_comm.change_state(enable=True):
|
||||
pynccl_comm.all_reduce(tensor)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == pynccl_comm.world_size
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
def test_pynccl():
|
||||
distributed_run(worker_fn, 2)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def multiple_allreduce_worker_fn():
|
||||
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
|
||||
groups = [
|
||||
torch.distributed.new_group(ranks=[0, 1], backend="gloo"),
|
||||
torch.distributed.new_group(ranks=[2, 3], backend="gloo")
|
||||
]
|
||||
group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1]
|
||||
pynccl_comm = PyNcclCommunicator(group=group, device=device)
|
||||
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
|
||||
with pynccl_comm.change_state(enable=True):
|
||||
# two groups can communicate independently
|
||||
if torch.distributed.get_rank() in [0, 1]:
|
||||
pynccl_comm.all_reduce(tensor)
|
||||
pynccl_comm.all_reduce(tensor)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == 4
|
||||
else:
|
||||
pynccl_comm.all_reduce(tensor)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == 2
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
||||
reason="Need at least 4 GPUs to run the test.")
|
||||
def test_pynccl_multiple_allreduce():
|
||||
# this tests pynccl for multiple tp groups, in a standalone way
|
||||
# i.e. call `pynccl_comm.all_reduce` directly
|
||||
distributed_run(multiple_allreduce_worker_fn, 4)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def multiple_allreduce_with_vllm_worker_fn():
|
||||
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
|
||||
ensure_model_parallel_initialized(2, 2)
|
||||
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
|
||||
with graph_capture():
|
||||
# two tp groups can communicate independently
|
||||
if torch.distributed.get_rank() in [0, 1]:
|
||||
tensor = tensor_model_parallel_all_reduce(tensor)
|
||||
tensor = tensor_model_parallel_all_reduce(tensor)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == 4
|
||||
else:
|
||||
tensor = tensor_model_parallel_all_reduce(tensor)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == 2
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
||||
reason="Need at least 4 GPUs to run the test.")
|
||||
def test_pynccl_multiple_allreduce_with_vllm():
|
||||
# this tests pynccl for multiple tp groups, together with vllm
|
||||
# i.e. call `tensor_model_parallel_all_reduce`
|
||||
distributed_run(multiple_allreduce_with_vllm_worker_fn, 4)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def worker_fn_with_cudagraph():
|
||||
with torch.no_grad():
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
|
||||
device=get_world_group().device)
|
||||
# run something in the default stream to initialize torch engine
|
||||
a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}')
|
||||
torch.cuda.synchronize()
|
||||
with torch.cuda.graph(
|
||||
graph, stream=pynccl_comm.stream), pynccl_comm.change_state(
|
||||
enable=True):
|
||||
# operation during the graph capture is recorded but not executed
|
||||
# see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creating-a-graph-using-stream-capture # noqa
|
||||
pynccl_comm.all_reduce(a)
|
||||
pynccl_comm.stream.synchronize()
|
||||
assert a.mean().cpu().item() == pynccl_comm.world_size**0
|
||||
graph.replay()
|
||||
pynccl_comm.stream.synchronize()
|
||||
assert a.mean().cpu().item() == pynccl_comm.world_size**1
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
def test_pynccl_with_cudagraph():
|
||||
distributed_run(worker_fn_with_cudagraph, 2)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def send_recv_worker_fn():
|
||||
pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
|
||||
device=get_world_group().device)
|
||||
if pynccl_comm.rank == 0:
|
||||
tensor = torch.ones(16, 1024, 1024,
|
||||
dtype=torch.float32).cuda(pynccl_comm.rank)
|
||||
else:
|
||||
tensor = torch.empty(16, 1024, 1024,
|
||||
dtype=torch.float32).cuda(pynccl_comm.rank)
|
||||
with pynccl_comm.change_state(enable=True):
|
||||
if pynccl_comm.rank == 0:
|
||||
pynccl_comm.send(tensor,
|
||||
dst=(pynccl_comm.rank + 1) %
|
||||
pynccl_comm.world_size)
|
||||
else:
|
||||
pynccl_comm.recv(tensor,
|
||||
src=(pynccl_comm.rank - 1) %
|
||||
pynccl_comm.world_size)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == 1
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
def test_pynccl_send_recv():
|
||||
distributed_run(send_recv_worker_fn, 2)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def multiple_send_recv_worker_fn():
|
||||
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
|
||||
groups = [
|
||||
torch.distributed.new_group(ranks=[0, 2], backend="gloo"),
|
||||
torch.distributed.new_group(ranks=[1, 3], backend="gloo")
|
||||
]
|
||||
group = groups[0] if torch.distributed.get_rank() in [0, 2] else groups[1]
|
||||
pynccl_comm = PyNcclCommunicator(group=group, device=device)
|
||||
if torch.distributed.get_rank() == 0:
|
||||
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
|
||||
elif torch.distributed.get_rank() == 1:
|
||||
tensor = 2 * torch.ones(
|
||||
16, 1024, 1024, dtype=torch.float32, device=device)
|
||||
else:
|
||||
tensor = torch.empty(16,
|
||||
1024,
|
||||
1024,
|
||||
dtype=torch.float32,
|
||||
device=device)
|
||||
with pynccl_comm.change_state(enable=True):
|
||||
if torch.distributed.get_rank() in [0, 1]:
|
||||
pynccl_comm.send(tensor,
|
||||
dst=(pynccl_comm.rank + 1) %
|
||||
pynccl_comm.world_size)
|
||||
else:
|
||||
pynccl_comm.recv(tensor,
|
||||
src=(pynccl_comm.rank - 1) %
|
||||
pynccl_comm.world_size)
|
||||
result = tensor.mean().cpu().item()
|
||||
if torch.distributed.get_rank() in [0, 2]:
|
||||
assert result == 1
|
||||
else:
|
||||
assert result == 2
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
||||
reason="Need at least 4 GPUs to run the test.")
|
||||
def test_pynccl_multiple_send_recv():
|
||||
distributed_run(multiple_send_recv_worker_fn, 4)
|
||||
|
||||
|
||||
def test_ncclGetUniqueId():
|
||||
lib = NCCLLibrary()
|
||||
unique_id = lib.ncclGetUniqueId()
|
||||
# `list(unique_id.internal)` is something like this:
|
||||
# [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0,
|
||||
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
|
||||
# as long as the function doesn't raise an exception, we're good
|
||||
assert unique_id is not None
|
||||
13
vllm-v0.6.2/tests/distributed/test_same_node.py
Normal file
13
vllm-v0.6.2/tests/distributed/test_same_node.py
Normal file
@@ -0,0 +1,13 @@
|
||||
import os
|
||||
|
||||
import torch.distributed as dist
|
||||
|
||||
from vllm.distributed.parallel_state import in_the_same_node_as
|
||||
|
||||
if __name__ == "__main__":
|
||||
dist.init_process_group(backend="gloo")
|
||||
test_result = all(in_the_same_node_as(dist.group.WORLD, source_rank=0))
|
||||
|
||||
expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
|
||||
assert test_result == expected, f"Expected {expected}, got {test_result}"
|
||||
print("Same node test passed!")
|
||||
88
vllm-v0.6.2/tests/distributed/test_shm_broadcast.py
Normal file
88
vllm-v0.6.2/tests/distributed/test_shm_broadcast.py
Normal file
@@ -0,0 +1,88 @@
|
||||
import multiprocessing
|
||||
import random
|
||||
import time
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
import torch.distributed as dist
|
||||
|
||||
from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
|
||||
from vllm.utils import update_environment_variables
|
||||
|
||||
|
||||
def get_arrays(n: int, seed: int = 0) -> List[np.ndarray]:
|
||||
np.random.seed(seed)
|
||||
sizes = np.random.randint(1, 10_000, n)
|
||||
# on average, each array will have 5k elements
|
||||
# with int64, each array will have 40kb
|
||||
return [np.random.randint(1, 100, i) for i in sizes]
|
||||
|
||||
|
||||
def distributed_run(fn, world_size):
|
||||
number_of_processes = world_size
|
||||
processes = []
|
||||
for i in range(number_of_processes):
|
||||
env = {}
|
||||
env['RANK'] = str(i)
|
||||
env['LOCAL_RANK'] = str(i)
|
||||
env['WORLD_SIZE'] = str(number_of_processes)
|
||||
env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
|
||||
env['MASTER_ADDR'] = 'localhost'
|
||||
env['MASTER_PORT'] = '12345'
|
||||
p = multiprocessing.Process(target=fn, args=(env, ))
|
||||
processes.append(p)
|
||||
p.start()
|
||||
|
||||
for p in processes:
|
||||
p.join()
|
||||
|
||||
for p in processes:
|
||||
assert p.exitcode == 0
|
||||
|
||||
|
||||
def worker_fn_wrapper(fn):
|
||||
# `multiprocessing.Process` cannot accept environment variables directly
|
||||
# so we need to pass the environment variables as arguments
|
||||
# and update the environment variables in the function
|
||||
def wrapped_fn(env):
|
||||
update_environment_variables(env)
|
||||
dist.init_process_group(backend="gloo")
|
||||
fn()
|
||||
|
||||
return wrapped_fn
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def worker_fn():
|
||||
writer_rank = 2
|
||||
broadcaster = MessageQueue.create_from_process_group(
|
||||
dist.group.WORLD, 40 * 1024, 2, writer_rank)
|
||||
if dist.get_rank() == writer_rank:
|
||||
seed = random.randint(0, 1000)
|
||||
dist.broadcast_object_list([seed], writer_rank)
|
||||
else:
|
||||
recv = [None]
|
||||
dist.broadcast_object_list(recv, writer_rank)
|
||||
seed = recv[0] # type: ignore
|
||||
dist.barrier()
|
||||
# in case we find a race condition
|
||||
# print the seed so that we can reproduce the error
|
||||
print(f"Rank {dist.get_rank()} got seed {seed}")
|
||||
# test broadcasting with about 400MB of data
|
||||
N = 10_000
|
||||
if dist.get_rank() == writer_rank:
|
||||
arrs = get_arrays(N, seed)
|
||||
for x in arrs:
|
||||
broadcaster.broadcast_object(x)
|
||||
time.sleep(random.random() / 1000)
|
||||
else:
|
||||
arrs = get_arrays(N, seed)
|
||||
for x in arrs:
|
||||
y = broadcaster.broadcast_object(None)
|
||||
assert np.array_equal(x, y)
|
||||
time.sleep(random.random() / 1000)
|
||||
dist.barrier()
|
||||
|
||||
|
||||
def test_shm_broadcast():
|
||||
distributed_run(worker_fn, 4)
|
||||
143
vllm-v0.6.2/tests/distributed/test_utils.py
Normal file
143
vllm-v0.6.2/tests/distributed/test_utils.py
Normal file
@@ -0,0 +1,143 @@
|
||||
import socket
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
|
||||
from vllm.distributed.utils import StatelessProcessGroup
|
||||
from vllm.utils import (mlu_device_count_stateless, get_open_port,
|
||||
update_environment_variables)
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
|
||||
|
||||
@ray.remote
|
||||
class _CUDADeviceCountStatelessTestActor:
|
||||
|
||||
def get_count(self):
|
||||
return mlu_device_count_stateless()
|
||||
|
||||
def set_cuda_visible_devices(self, cuda_visible_devices: str):
|
||||
update_environment_variables(
|
||||
{"MLU_VISIBLE_DEVICES": cuda_visible_devices})
|
||||
|
||||
def get_cuda_visible_devices(self):
|
||||
return envs.MLU_VISIBLE_DEVICES
|
||||
|
||||
|
||||
def test_cuda_device_count_stateless():
|
||||
"""Test that cuda_device_count_stateless changes return value if
|
||||
CUDA_VISIBLE_DEVICES is changed."""
|
||||
actor = _CUDADeviceCountStatelessTestActor.options( # type: ignore
|
||||
num_gpus=2).remote()
|
||||
assert len(
|
||||
sorted(ray.get(
|
||||
actor.get_cuda_visible_devices.remote()).split(","))) == 2
|
||||
assert ray.get(actor.get_count.remote()) == 2
|
||||
ray.get(actor.set_cuda_visible_devices.remote("0"))
|
||||
assert ray.get(actor.get_count.remote()) == 1
|
||||
ray.get(actor.set_cuda_visible_devices.remote(""))
|
||||
assert ray.get(actor.get_count.remote()) == 0
|
||||
|
||||
|
||||
def cpu_worker(rank, WORLD_SIZE, port1, port2):
|
||||
pg1 = StatelessProcessGroup.create(host="127.0.0.1",
|
||||
port=port1,
|
||||
rank=rank,
|
||||
world_size=WORLD_SIZE)
|
||||
if rank <= 2:
|
||||
pg2 = StatelessProcessGroup.create(host="127.0.0.1",
|
||||
port=port2,
|
||||
rank=rank,
|
||||
world_size=3)
|
||||
data = torch.tensor([rank])
|
||||
data = pg1.broadcast_obj(data, src=2)
|
||||
assert data.item() == 2
|
||||
if rank <= 2:
|
||||
data = torch.tensor([rank + 1])
|
||||
data = pg2.broadcast_obj(data, src=2)
|
||||
assert data.item() == 3
|
||||
pg2.barrier()
|
||||
pg1.barrier()
|
||||
|
||||
|
||||
def gpu_worker(rank, WORLD_SIZE, port1, port2):
|
||||
torch.cuda.set_device(rank)
|
||||
pg1 = StatelessProcessGroup.create(host="127.0.0.1",
|
||||
port=port1,
|
||||
rank=rank,
|
||||
world_size=WORLD_SIZE)
|
||||
pynccl1 = PyNcclCommunicator(pg1, device=rank)
|
||||
pynccl1.disabled = False
|
||||
if rank <= 2:
|
||||
pg2 = StatelessProcessGroup.create(host="127.0.0.1",
|
||||
port=port2,
|
||||
rank=rank,
|
||||
world_size=3)
|
||||
pynccl2 = PyNcclCommunicator(pg2, device=rank)
|
||||
pynccl2.disabled = False
|
||||
data = torch.tensor([rank]).cuda()
|
||||
pynccl1.all_reduce(data)
|
||||
pg1.barrier()
|
||||
torch.cuda.synchronize()
|
||||
if rank <= 2:
|
||||
pynccl2.all_reduce(data)
|
||||
pg2.barrier()
|
||||
torch.cuda.synchronize()
|
||||
item = data[0].item()
|
||||
print(f"rank: {rank}, item: {item}")
|
||||
if rank == 3:
|
||||
assert item == 6
|
||||
else:
|
||||
assert item == 18
|
||||
|
||||
|
||||
def broadcast_worker(rank, WORLD_SIZE, port1, port2):
|
||||
pg1 = StatelessProcessGroup.create(host="127.0.0.1",
|
||||
port=port1,
|
||||
rank=rank,
|
||||
world_size=WORLD_SIZE)
|
||||
if rank == 2:
|
||||
pg1.broadcast_obj("secret", src=2)
|
||||
else:
|
||||
obj = pg1.broadcast_obj(None, src=2)
|
||||
assert obj == "secret"
|
||||
pg1.barrier()
|
||||
|
||||
|
||||
def allgather_worker(rank, WORLD_SIZE, port1, port2):
|
||||
pg1 = StatelessProcessGroup.create(host="127.0.0.1",
|
||||
port=port1,
|
||||
rank=rank,
|
||||
world_size=WORLD_SIZE)
|
||||
data = pg1.all_gather_obj(rank)
|
||||
assert data == list(range(WORLD_SIZE))
|
||||
pg1.barrier()
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="This test is flaky and prone to hang.")
|
||||
# @multi_gpu_test(num_gpus=4)
|
||||
@pytest.mark.parametrize(
|
||||
"worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker])
|
||||
def test_stateless_process_group(worker):
|
||||
port1 = get_open_port()
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
||||
s.bind(("", port1))
|
||||
port2 = get_open_port()
|
||||
WORLD_SIZE = 4
|
||||
from multiprocessing import get_context
|
||||
ctx = get_context("fork")
|
||||
processes = []
|
||||
for i in range(WORLD_SIZE):
|
||||
rank = i
|
||||
processes.append(
|
||||
ctx.Process(target=worker, args=(rank, WORLD_SIZE, port1, port2)))
|
||||
for p in processes:
|
||||
p.start()
|
||||
for p in processes:
|
||||
p.join()
|
||||
for p in processes:
|
||||
assert not p.exitcode
|
||||
print("All processes finished.")
|
||||
Reference in New Issue
Block a user