Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

View File

@@ -0,0 +1,168 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import msgspec
import msgspec.msgpack
import pytest
import zmq
from vllm.config.kv_events import KVEventsConfig
from vllm.distributed.kv_events import EventPublisherFactory
from .test_events import SampleBatch
DP_RANK = 0
@pytest.fixture
def random_port():
"""Generate a random port number for testing"""
return random.randint(10000, 59900)
@pytest.fixture
def publisher_config(random_port, request):
"""Create a publisher config with inproc transport"""
how = request.param if hasattr(request, "param") else "inproc"
if how == "inproc":
endpoint = f"inproc://test-{random_port}"
replay_endpoint = endpoint + "-replay"
else:
endpoint = f"tcp://*:{random_port}"
replay_endpoint = f"tcp://*:{random_port + 100}"
return KVEventsConfig(
enable_kv_cache_events=True,
publisher="zmq",
endpoint=endpoint,
replay_endpoint=replay_endpoint,
buffer_steps=100,
hwm=1000,
topic="test",
)
@pytest.fixture
def publisher(publisher_config):
"""Create and return a publisher instance"""
pub = EventPublisherFactory.create(publisher_config, DP_RANK)
yield pub
pub.shutdown()
@pytest.fixture
def subscriber(publisher_config):
"""Create and return a subscriber for testing"""
endpoint = publisher_config.endpoint
replay_endpoint = publisher_config.replay_endpoint
if endpoint.startswith("tcp://*"):
endpoint = endpoint.replace("*", "127.0.0.1")
if replay_endpoint and replay_endpoint.startswith("tcp://*"):
replay_endpoint = replay_endpoint.replace("*", "127.0.0.1")
sub = MockSubscriber(
[endpoint],
[replay_endpoint] if replay_endpoint else None,
publisher_config.topic,
)
yield sub
sub.close()
class MockSubscriber:
"""Helper class to receive and verify published events"""
def __init__(
self,
pub_endpoints: str | list[str],
replay_endpoints: str | list[str] | None = None,
topic: str = "",
decode_type=SampleBatch,
):
self.ctx = zmq.Context.instance()
# Convert single endpoint to list for consistency
if isinstance(pub_endpoints, str):
pub_endpoints = [pub_endpoints]
if isinstance(replay_endpoints, str):
replay_endpoints = [replay_endpoints]
# Set up subscriber socket - connect to all endpoints
self.sub = self.ctx.socket(zmq.SUB)
self.sub.setsockopt(zmq.SUBSCRIBE, topic.encode("utf-8"))
for endpoint in pub_endpoints:
self.sub.connect(endpoint)
# Set up replay sockets if provided
self.replay_sockets = []
if replay_endpoints:
for replay_endpoint in replay_endpoints:
replay = self.ctx.socket(zmq.REQ)
replay.connect(replay_endpoint)
self.replay_sockets.append(replay)
self.topic = topic
self.topic_bytes = topic.encode("utf-8")
self.received_msgs: list[tuple[int, SampleBatch]] = []
self.last_seq = -1
self.decoder = msgspec.msgpack.Decoder(type=decode_type)
def receive_one(self, timeout=1000) -> tuple[int, SampleBatch] | None:
"""Receive a single message with timeout"""
if not self.sub.poll(timeout):
return None
topic_bytes, seq_bytes, payload = self.sub.recv_multipart()
assert topic_bytes == self.topic_bytes
seq = int.from_bytes(seq_bytes, "big")
data = self.decoder.decode(payload)
self.last_seq = seq
self.received_msgs.append((seq, data))
return seq, data
def request_replay(self, start_seq: int, socket_idx: int = 0) -> None:
"""Request replay of messages starting from start_seq"""
if not self.replay_sockets:
raise ValueError("Replay sockets not initialized")
if socket_idx >= len(self.replay_sockets):
raise ValueError(f"Invalid socket index {socket_idx}")
self.replay_sockets[socket_idx].send(start_seq.to_bytes(8, "big"))
def receive_replay(self, socket_idx: int = 0) -> list[tuple[int, SampleBatch]]:
"""Receive replayed messages from a specific replay socket"""
if not self.replay_sockets:
raise ValueError("Replay sockets not initialized")
if socket_idx >= len(self.replay_sockets):
raise ValueError(f"Invalid socket index {socket_idx}")
replay_socket = self.replay_sockets[socket_idx]
replayed: list[tuple[int, SampleBatch]] = []
while True:
try:
if not replay_socket.poll(1000):
break
frames = replay_socket.recv_multipart()
if not frames or not frames[-1]:
# End of replay marker
break
seq_bytes, payload = frames
seq = int.from_bytes(seq_bytes, "big")
data = self.decoder.decode(payload)
replayed.append((seq, data))
except zmq.ZMQError as _:
break
return replayed
def close(self):
"""Clean up resources"""
self.sub.close()
for replay in self.replay_sockets:
replay.close()

View File

@@ -0,0 +1,49 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import random
import torch
import torch.multiprocessing as mp
from vllm.distributed.parallel_state import (
init_distributed_environment,
)
from vllm.utils.system_utils import update_environment_variables
mp.set_start_method("spawn", force=True)
def distributed_run(fn, world_size, *args):
number_of_processes = world_size
processes: list[mp.Process] = []
for i in range(number_of_processes):
env: dict[str, str] = {}
env["RANK"] = str(i)
env["LOCAL_RANK"] = str(i)
env["WORLD_SIZE"] = str(number_of_processes)
env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
env["MASTER_ADDR"] = "localhost"
env["MASTER_PORT"] = "12345"
p = mp.Process(target=fn, args=(env, world_size, *args))
processes.append(p)
p.start()
for p in processes:
p.join()
for p in processes:
assert p.exitcode == 0
def set_env_vars_and_device(env: dict[str, str]) -> None:
update_environment_variables(env)
local_rank = os.environ["LOCAL_RANK"]
device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
init_distributed_environment()
# Ensure each worker process has the same random seed
random.seed(42)
torch.manual_seed(42)

View File

@@ -1,59 +0,0 @@
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
vLLM will allocate all the available memory, so we need to run the tests one
by one. The solution is to pass arguments (model name) by environment
variables.
Run:
```sh
TEST_DIST_MODEL=facebook/opt-125m pytest \
test_basic_distributed_correctness.py
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
test_basic_distributed_correctness.py
```
"""
import os
import pytest
import torch
MODELS = [
os.environ["TEST_DIST_MODEL"],
]
VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
) -> None:
enforce_eager = False
backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
if backend_by_env_var == "FLASHINFER":
enforce_eager = True
hf_model = hf_runner(model, dtype=dtype)
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
del hf_model
vllm_model = vllm_runner(model,
dtype=dtype,
tensor_parallel_size=2,
enforce_eager=enforce_eager)
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i]
assert hf_output_str == vllm_output_str, (
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")

View File

@@ -0,0 +1,64 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# can only run on machines with p2p access across GPUs
# can only run with torchrun:
# torchrun --nproc_per_node=2 tests/distributed/test_ca_buffer_sharing.py
import ctypes
import torch
import torch.distributed as dist
from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
from vllm.distributed.device_communicators.custom_all_reduce import ( # noqa
CustomAllreduce,
)
# create a cpu process group for communicating metadata (ipc handle)
dist.init_process_group(backend="gloo")
rank = local_rank = dist.get_rank()
world_size = dist.get_world_size()
# every process sets its own device (differently)
lib = CudaRTLibrary()
lib.cudaSetDevice(rank)
buffer_size_in_bytes = 1024
byte_value = 2 # the value we write to the buffer for verification
pointers = CustomAllreduce.create_shared_buffer(buffer_size_in_bytes)
print(f"Rank {rank} has pointers {pointers}")
dist.barrier()
torch.cuda.synchronize()
if rank == 0:
# the first rank tries to write to all buffers
for p in pointers:
pointer = ctypes.c_void_p(p)
lib.cudaMemset(pointer, byte_value, buffer_size_in_bytes)
dist.barrier()
torch.cuda.synchronize()
host_data = (ctypes.c_char * buffer_size_in_bytes)()
# all ranks read from all buffers, and check if the data is correct
for p in pointers:
pointer = ctypes.c_void_p(p)
lib.cudaMemcpy(host_data, pointer, buffer_size_in_bytes)
for i in range(buffer_size_in_bytes):
assert ord(host_data[i]) == byte_value, (
f"Rank {rank} failed"
f" to verify buffer {p}. Expected {byte_value}, "
f"got {ord(host_data[i])}"
)
print(f"Rank {rank} verified all buffers")
dist.barrier()
torch.cuda.synchronize()
CustomAllreduce.free_shared_buffer(pointers)

View File

@@ -1,66 +0,0 @@
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
vLLM will allocate all the available memory, so we need to run the tests one
by one. The solution is to pass arguments (model name) by environment
variables.
Run:
```sh
TEST_DIST_MODEL=facebook/opt-125m pytest \
test_chunked_prefill_distributed.py
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
test_chunked_prefill_distributed.py
```
"""
import os
import pytest
import torch
MODELS = [
os.environ["TEST_DIST_MODEL"],
]
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
chunked_prefill_token_size: int,
) -> None:
# Add a chunked prefill config.
max_num_seqs = min(chunked_prefill_token_size, 256)
assert chunked_prefill_token_size != -1
enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size
hf_model = hf_runner(model, dtype=dtype)
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
del hf_model
vllm_model = vllm_runner(
model,
dtype=dtype,
tensor_parallel_size=2,
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
)
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i]
assert hf_output_str == vllm_output_str, (
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")

View File

@@ -1,53 +1,105 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test the communication operators.
Run `pytest tests/distributed/test_comm_ops.py`.
"""
import os
from collections.abc import Callable
from typing import Any
import pytest
import ray
import torch
from vllm.distributed import (broadcast_tensor_dict,
tensor_model_parallel_all_gather,
tensor_model_parallel_all_reduce)
from vllm.test_utils import (init_test_distributed_environment,
multi_process_tensor_parallel)
from vllm.distributed import (
broadcast_tensor_dict,
get_pp_group,
tensor_model_parallel_all_gather,
tensor_model_parallel_all_reduce,
tensor_model_parallel_reduce_scatter,
)
from ..utils import (
init_test_distributed_environment,
multi_gpu_test,
multi_process_parallel,
)
@ray.remote(num_gpus=1, max_calls=1)
def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
distributed_init_port: str):
def all_reduce_test_worker(
monkeypatch: pytest.MonkeyPatch,
tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
del os.environ["CUDA_VISIBLE_DEVICES"]
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(1, tensor_parallel_size, rank,
distributed_init_port)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
num_elements = 8
all_tensors = [
torch.arange(num_elements, dtype=torch.float32, device="cuda") *
(r + 1) for r in range(tensor_parallel_size)
torch.arange(num_elements, dtype=torch.float32, device="cuda") * (r + 1)
for r in range(tp_size)
]
expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
t = all_tensors[rank]
t = all_tensors[rank % tp_size]
t = tensor_model_parallel_all_reduce(t)
assert torch.allclose(t, expected)
torch.testing.assert_close(t, expected)
@ray.remote(num_gpus=1, max_calls=1)
def all_gather_test_worker(tensor_parallel_size: int, rank: int,
distributed_init_port: str):
def reduce_scatter_test_worker(
monkeypatch: pytest.MonkeyPatch,
tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
del os.environ["CUDA_VISIBLE_DEVICES"]
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(1, tensor_parallel_size, rank,
distributed_init_port)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
num_elements = 8
all_tensors = [
torch.arange(num_elements, dtype=torch.float32, device="cuda") * (r + 1)
for r in range(tp_size)
]
index = rank % tp_size
partition_size = num_elements // tp_size
all_reduce = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
expected = all_reduce[index * partition_size : (index + 1) * partition_size]
t = all_tensors[index]
t = tensor_model_parallel_reduce_scatter(t, 0)
torch.testing.assert_close(t, expected)
@ray.remote(num_gpus=1, max_calls=1)
def all_gather_test_worker(
monkeypatch: pytest.MonkeyPatch,
tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
num_dimensions = 3
tensor_size = list(range(2, num_dimensions + 2))
total_size = 1
@@ -55,56 +107,169 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
total_size *= s
for all_gather_dimension in range(num_dimensions):
all_tensors = [
torch.arange(total_size, dtype=torch.float32,
device="cuda").reshape(tensor_size) * (r + 1)
for r in range(tensor_parallel_size)
torch.arange(total_size, dtype=torch.float32, device="cuda").reshape(
tensor_size
)
* (r + 1)
for r in range(tp_size)
]
expected = torch.cat(all_tensors, dim=all_gather_dimension)
t = all_tensors[rank]
t = all_tensors[rank % tp_size]
t = tensor_model_parallel_all_gather(t, all_gather_dimension)
assert torch.allclose(t, expected)
torch.testing.assert_close(t, expected)
@ray.remote(num_gpus=1, max_calls=1)
def broadcast_tensor_dict_test_worker(tensor_parallel_size: int, rank: int,
distributed_init_port: str):
def broadcast_tensor_dict_test_worker(
monkeypatch: pytest.MonkeyPatch,
tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
del os.environ["CUDA_VISIBLE_DEVICES"]
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(1, tensor_parallel_size, rank,
distributed_init_port)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
test_dict = {
# device tensor
"a": torch.arange(8, dtype=torch.float32, device="cuda"),
"b": torch.arange(16, dtype=torch.int8, device="cuda"),
# CPU tensor
"b": torch.arange(16, dtype=torch.int8, device="cpu"),
"c": "test",
"d": [1, 2, 3],
"e": {
"a": 1,
"b": 2
},
"e": {"a": 1, "b": 2},
# empty tensor
"f": torch.tensor([], dtype=torch.float32, device="cuda"),
}
if rank == 0:
if (rank % tp_size) == 0:
broadcast_tensor_dict(test_dict, src=0)
else:
recv_dict = broadcast_tensor_dict(src=0)
assert len(recv_dict) == len(test_dict)
assert torch.allclose(recv_dict["a"], test_dict["a"])
assert torch.allclose(recv_dict["b"], test_dict["b"])
torch.testing.assert_close(recv_dict["a"], test_dict["a"])
torch.testing.assert_close(recv_dict["b"], test_dict["b"])
assert recv_dict["c"] == test_dict["c"]
assert recv_dict["d"] == test_dict["d"]
assert recv_dict["e"] == test_dict["e"]
torch.testing.assert_close(recv_dict["f"], test_dict["f"])
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("tensor_parallel_size", [2])
@pytest.mark.parametrize("test_target", [
all_reduce_test_worker, all_gather_test_worker,
broadcast_tensor_dict_test_worker
])
def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
multi_process_tensor_parallel(tensor_parallel_size, test_target)
@ray.remote(num_gpus=1, max_calls=1)
def send_recv_tensor_dict_test_worker(
monkeypatch: pytest.MonkeyPatch,
tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
test_dict = {
# device tensor
"a": torch.arange(8, dtype=torch.float32, device="cuda"),
# CPU tensor
"b": torch.arange(16, dtype=torch.int8, device="cpu"),
"c": "test",
"d": [1, 2, 3],
"e": {"a": 1, "b": 2},
# empty tensor
"f": torch.tensor([], dtype=torch.float32, device="cuda"),
}
if not get_pp_group().is_first_rank:
recv_dict = get_pp_group().recv_tensor_dict()
if not get_pp_group().is_last_rank:
get_pp_group().send_tensor_dict(test_dict)
if not get_pp_group().is_first_rank:
assert len(recv_dict) == len(test_dict)
torch.testing.assert_close(recv_dict["a"], test_dict["a"])
torch.testing.assert_close(recv_dict["b"], test_dict["b"])
assert recv_dict["c"] == test_dict["c"]
assert recv_dict["d"] == test_dict["d"]
assert recv_dict["e"] == test_dict["e"]
torch.testing.assert_close(recv_dict["f"], test_dict["f"])
@ray.remote(num_gpus=1, max_calls=1)
def send_recv_test_worker(
monkeypatch: pytest.MonkeyPatch,
tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
size = 64
test_tensor = torch.arange(64, dtype=torch.float32, device="cuda")
if not get_pp_group().is_first_rank:
recv_tensor = get_pp_group().recv(size, dtype=torch.float32)
if not get_pp_group().is_last_rank:
get_pp_group().send(test_tensor)
if not get_pp_group().is_first_rank:
torch.testing.assert_close(test_tensor, recv_tensor)
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize(
"test_target",
[all_reduce_test_worker, all_gather_test_worker, broadcast_tensor_dict_test_worker],
)
def test_multi_process_tensor_parallel(
monkeypatch: pytest.MonkeyPatch,
tp_size: int,
test_target: Callable[..., Any],
):
multi_process_parallel(monkeypatch, tp_size, 1, test_target)
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize(
"test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker]
)
def test_multi_process_pipeline_parallel(
monkeypatch: pytest.MonkeyPatch,
pp_size: int,
test_target: Callable[..., Any],
):
multi_process_parallel(monkeypatch, 1, pp_size, test_target)
@multi_gpu_test(num_gpus=4)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize(
"test_target",
[
send_recv_test_worker,
send_recv_tensor_dict_test_worker,
all_reduce_test_worker,
all_gather_test_worker,
broadcast_tensor_dict_test_worker,
],
)
def test_multi_process_tensor_parallel_pipeline_parallel(
tp_size: int,
pp_size: int,
test_target: Callable[..., Any],
monkeypatch: pytest.MonkeyPatch,
):
multi_process_parallel(monkeypatch, tp_size, pp_size, test_target)

View File

@@ -0,0 +1,296 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
(2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
important to set the distributed backend to "mp" to avoid Ray scheduling
all workers in a node other than the head node, which can cause the test
to fail.
"""
import json
import os
from dataclasses import dataclass
from typing import Literal, NamedTuple
import pytest
import torch
from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k
from tests.utils import RemoteOpenAIServer, create_new_process_for_each_test
from vllm.config.model import RunnerOption
from vllm.logger import init_logger
from ..models.registry import HF_EXAMPLE_MODELS
logger = init_logger("test_context_parallel")
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
CP_TEST_MODELS = [
# TODO support other models
# [LANGUAGE GENERATION]
"deepseek-ai/DeepSeek-V2-Lite-Chat",
"Qwen/Qwen2.5-1.5B-Instruct",
]
# GSM8K eval configuration
NUM_QUESTIONS = 256 # Fast eval for CI
NUM_SHOTS = 5 # Few-shot examples
# tp accuracy with 2% buffer
MIN_ACCURACY = {
# .buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
"deepseek-ai/DeepSeek-V2-Lite-Chat": 0.64,
# .buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml
"Qwen/Qwen2.5-1.5B-Instruct": 0.52,
}
class ParallelSetup(NamedTuple):
tp_size: int
pp_size: int
dcp_size: int
cp_kv_cache_interleave_size: int
eager_mode: bool
chunked_prefill: bool
class CPTestOptions(NamedTuple):
multi_node_only: bool
attn_backend: str | None = None
@dataclass
class CPTestSettings:
parallel_setups: list[ParallelSetup]
distributed_backends: list[str]
runner: RunnerOption
test_options: CPTestOptions
@staticmethod
def detailed(
*,
tp_base: int = 4,
pp_base: int = 1,
dcp_multipliers: list[float] | None = None,
cp_kv_cache_interleave_size: int = 1,
multi_node_only: bool = False,
runner: RunnerOption = "auto",
attn_backend: str | None = None,
):
parallel_setups = []
if dcp_multipliers is None:
dcp_multipliers = [
0.5,
]
for eager_mode_val in [False]:
for pp_multiplier in [1]:
for dcp_multiplier in dcp_multipliers:
for chunked_prefill_val in [True]:
parallel_setups.append(
ParallelSetup(
tp_size=tp_base,
pp_size=pp_multiplier * pp_base,
dcp_size=int(dcp_multiplier * tp_base),
cp_kv_cache_interleave_size=cp_kv_cache_interleave_size,
eager_mode=eager_mode_val,
chunked_prefill=chunked_prefill_val,
)
)
return CPTestSettings(
parallel_setups=parallel_setups,
distributed_backends=["mp"],
runner=runner,
test_options=CPTestOptions(
multi_node_only=multi_node_only,
attn_backend=attn_backend,
),
)
def iter_params(self, model_id: str):
opts = self.test_options
for parallel_setup in self.parallel_setups:
for backend in self.distributed_backends:
yield (
model_id,
parallel_setup,
backend,
self.runner,
opts,
)
CP_TEXT_GENERATION_MODELS = {
"deepseek-ai/DeepSeek-V2-Lite-Chat": [
CPTestSettings.detailed(dcp_multipliers=[1]),
CPTestSettings.detailed(
dcp_multipliers=[0.5],
cp_kv_cache_interleave_size=64,
attn_backend="FLASHMLA",
),
],
"Qwen/Qwen2.5-1.5B-Instruct": [
CPTestSettings.detailed(
cp_kv_cache_interleave_size=16, attn_backend="FLASH_ATTN"
),
CPTestSettings.detailed(
cp_kv_cache_interleave_size=16, attn_backend="FLASHINFER"
),
],
}
def _test_cp_gsm8k(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
runner: RunnerOption,
test_options: CPTestOptions,
num_gpus_available: int,
*,
method: Literal["generate"],
is_multimodal: bool,
):
(
tp_size,
pp_size,
dcp_size,
cp_kv_cache_interleave_size,
eager_mode,
chunked_prefill,
) = parallel_setup
multi_node_only, attn_backend = test_options
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_transformers_version(on_fail="skip")
trust_remote_code = model_info.trust_remote_code
tokenizer_mode = model_info.tokenizer_mode
hf_overrides = model_info.hf_overrides
model_info.check_available_online(on_fail="skip")
if num_gpus_available < tp_size * pp_size:
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
if VLLM_MULTI_NODE and distributed_backend == "mp":
pytest.skip(
"Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend"
)
if multi_node_only and not VLLM_MULTI_NODE:
pytest.skip("Not in multi-node setting")
server_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"4096",
"--max-num-seqs",
"64",
]
if chunked_prefill:
server_args.append("--enable-chunked-prefill")
if eager_mode:
server_args.append("--enforce-eager")
if runner != "auto":
server_args.extend(["--runner", runner])
if trust_remote_code:
server_args.append("--trust-remote-code")
if tokenizer_mode:
server_args.extend(["--tokenizer-mode", tokenizer_mode])
if hf_overrides:
server_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
server_args.extend(
[
"--tensor-parallel-size",
str(tp_size),
"--pipeline-parallel-size",
str(pp_size),
"--decode-context-parallel-size",
str(dcp_size),
"--dcp-kv-cache-interleave-size",
str(cp_kv_cache_interleave_size),
"--distributed-executor-backend",
distributed_backend,
]
)
server_env = {}
if attn_backend:
server_env["VLLM_ATTENTION_BACKEND"] = attn_backend
with RemoteOpenAIServer(
model_id,
server_args,
env_dict=server_env,
max_wait_seconds=720,
) as remote_server:
host = f"http://{remote_server.host}"
port = remote_server.port
# Run GSM8K evaluation
results = evaluate_gsm8k(
num_questions=NUM_QUESTIONS,
num_shots=NUM_SHOTS,
host=host,
port=port,
)
# Validate accuracy is reasonable
accuracy = results["accuracy"]
min_accuracy = MIN_ACCURACY[model_id]
assert accuracy >= min_accuracy, (
f"TP+DCP accuracy too low: {accuracy:.3f} < {min_accuracy:.3f}"
)
@pytest.mark.parametrize(
(
"model_id",
"parallel_setup",
"distributed_backend",
"runner",
"test_options",
),
[
params
for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
for setting in settings
for params in setting.iter_params(model_id)
if model_id in CP_TEST_MODELS
],
)
@create_new_process_for_each_test()
def test_cp_generation(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
runner: RunnerOption,
test_options: CPTestOptions,
num_gpus_available,
):
if (
model_id == "deepseek-ai/DeepSeek-V2-Lite-Chat"
and torch.cuda.get_device_capability() < (9, 0)
):
pytest.skip(reason="MLA+DCP requires compute capability of 9.0 or higher")
if (
model_id == "Qwen/Qwen2.5-1.5B-Instruct"
and torch.cuda.get_device_capability() != (9, 0)
):
pytest.skip(reason="GQA+DCP currently requires compute capability of 9.0")
_test_cp_gsm8k(
model_id,
parallel_setup,
distributed_backend,
runner,
test_options,
num_gpus_available,
method="generate",
is_multimodal=False,
)

View File

@@ -1,4 +1,6 @@
import os
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import pytest
@@ -6,10 +8,14 @@ import ray
import torch
import torch.distributed as dist
from vllm.distributed import tensor_model_parallel_all_reduce
from vllm.distributed.device_communicators import custom_all_reduce
from vllm.test_utils import (init_test_distributed_environment,
multi_process_tensor_parallel)
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce # noqa
from vllm.distributed.parallel_state import get_tp_group, graph_capture
from ..utils import (
ensure_model_parallel_initialized,
init_test_distributed_environment,
multi_process_parallel,
)
random.seed(42)
test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)]
@@ -18,67 +24,109 @@ for i, v in enumerate(test_sizes):
@ray.remote(num_gpus=1, max_calls=1)
def graph_allreduce(world_size, rank, distributed_init_port):
del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(1, world_size, rank,
distributed_init_port)
def graph_allreduce(
monkeypatch: pytest.MonkeyPatch,
tp_size,
pp_size,
rank,
distributed_init_port,
):
with monkeypatch.context() as m:
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
ensure_model_parallel_initialized(tp_size, pp_size)
group = get_tp_group().device_group
custom_all_reduce.init_custom_all_reduce()
for sz in test_sizes:
for dtype in [torch.float32, torch.float16, torch.bfloat16]:
with custom_all_reduce.capture():
# use integers so result matches NCCL exactly
inp1 = torch.randint(1,
16, (sz, ),
dtype=dtype,
device=torch.cuda.current_device())
inp2 = torch.randint(1,
16, (sz, ),
dtype=dtype,
device=torch.cuda.current_device())
torch.cuda.synchronize()
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
out1 = tensor_model_parallel_all_reduce(inp1)
# the input buffer is immediately modified to test
# synchronization
dist.all_reduce(inp1)
out2 = tensor_model_parallel_all_reduce(inp2)
dist.all_reduce(inp2)
graph.replay()
assert torch.allclose(out1, inp1)
assert torch.allclose(out2, inp2)
# A small all_reduce for warmup.
# this is needed because device communicators might be created lazily
# (e.g. NCCL). This will ensure that the communicator is initialized
# before any communication happens, so that this group can be used for
# graph capture immediately.
data = torch.zeros(1)
data = data.to(device=device)
torch.distributed.all_reduce(data, group=group)
torch.cuda.synchronize()
del data
# we use the first group to communicate once
# and the second group to communicate twice
# and so on
# this is used to demonstrate that each group can
# communicate independently
num_communication = rank // tp_size + 1
for sz in test_sizes:
for dtype in [torch.float32, torch.float16, torch.bfloat16]:
with graph_capture(device=device) as graph_capture_context:
# use integers so result matches NCCL exactly
inp1 = torch.randint(
1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
)
inp2 = torch.randint(
1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
)
torch.cuda.synchronize()
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph, stream=graph_capture_context.stream):
for i in range(num_communication):
out1 = tensor_model_parallel_all_reduce(inp1)
# the input buffer is immediately modified to test
# synchronization
dist.all_reduce(inp1, group=group)
out2 = tensor_model_parallel_all_reduce(inp2)
dist.all_reduce(inp2, group=group)
graph.replay()
torch.testing.assert_close(out1, inp1)
torch.testing.assert_close(out2, inp2)
@ray.remote(num_gpus=1, max_calls=1)
def eager_allreduce(world_size, rank, distributed_init_port):
del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(1, world_size, rank,
distributed_init_port)
def eager_allreduce(
monkeypatch: pytest.MonkeyPatch,
tp_size,
pp_size,
rank,
distributed_init_port,
):
with monkeypatch.context() as m:
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
sz = 1024
custom_all_reduce.init_custom_all_reduce()
fa = custom_all_reduce.get_handle()
inp = torch.ones(sz, dtype=torch.float32, device=device)
out = fa.all_reduce_unreg(inp)
assert torch.allclose(out, inp * world_size)
# we use the first group to communicate once
# and the second group to communicate twice
# and so on
# this is used to demonstrate that each group can
# communicate independently
num_communication = rank // tp_size + 1
sz = 1024
fa = get_tp_group().device_communicator.ca_comm
inp = torch.ones(sz, dtype=torch.float32, device=device)
out = inp
for _ in range(num_communication):
out = fa.all_reduce(out, registered=False)
torch.testing.assert_close(out, inp * (tp_size**num_communication))
inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
out = fa.all_reduce_unreg(inp)
assert torch.allclose(out, inp * world_size)
inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
out = inp
for _ in range(num_communication):
out = fa.all_reduce(out, registered=False)
torch.testing.assert_close(out, inp * (tp_size**num_communication))
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("tensor_parallel_size", [2])
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
multi_process_tensor_parallel(tensor_parallel_size, test_target)
if __name__ == "__main__":
multi_process_tensor_parallel(2, graph_allreduce)
def test_custom_allreduce(
monkeypatch: pytest.MonkeyPatch,
tp_size,
pipeline_parallel_size,
test_target,
):
world_size = tp_size * pipeline_parallel_size
if world_size > torch.cuda.device_count():
pytest.skip("Not enough GPUs to run the test.")
multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target)

View File

@@ -0,0 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from ..entrypoints.openai.test_oot_registration import run_and_test_dummy_opt_api_server
def test_distributed_oot(dummy_opt_path: str):
run_and_test_dummy_opt_api_server(dummy_opt_path, tp=2)

View File

@@ -0,0 +1,312 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm.distributed.eplb.policy.default import DefaultEplbPolicy
def test_basic_rebalance():
"""Test basic rebalancing functionality"""
# Example from https://github.com/deepseek-ai/eplb
weight = torch.tensor(
[
[90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86],
[20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27],
]
)
num_layers = weight.shape[0]
num_replicas = 16
num_groups = 4
num_nodes = 2
num_gpus = 8
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
# Verify output shapes
assert phy2log.shape == (
2,
16,
), f"Expected `phy2log` shape (2, 16), got {phy2log.shape}"
assert log2phy.shape[0] == 2, (
f"Expected `log2phy` first dimension 2, got {log2phy.shape[0]}"
)
assert log2phy.shape[1] == 12, (
f"Expected `log2phy` second dimension 12, got {log2phy.shape[1]}"
)
assert logcnt.shape == (
2,
12,
), f"Expected `logcnt` shape (2, 12), got {logcnt.shape}"
# Verify physical to logical expert mapping range is correct
assert torch.all(phy2log >= 0) and torch.all(phy2log < 12), (
"Physical to logical mapping should be in range [0, 12)"
)
# Verify expert count reasonableness
assert torch.all(logcnt >= 1), "Each logical expert should have at least 1 replica"
assert torch.sum(logcnt, dim=1).sum() == num_replicas * num_layers, (
f"Total replicas should be {num_replicas * num_layers}"
)
# Verify expected output
expected_phy2log = torch.tensor(
[
[5, 6, 5, 7, 8, 4, 3, 4, 10, 9, 10, 2, 0, 1, 11, 1],
[7, 10, 6, 8, 6, 11, 8, 9, 2, 4, 5, 1, 5, 0, 3, 1],
]
)
assert torch.all(phy2log == expected_phy2log)
expected_logcnt = torch.tensor(
[[1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1], [1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1]]
)
assert torch.all(logcnt == expected_logcnt)
def test_single_gpu_case():
"""Test single GPU case"""
weight = torch.tensor([[10, 20, 30, 40]])
num_replicas = 4
num_groups = 1
num_nodes = 1
num_gpus = 1
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
# Verify shapes
assert phy2log.shape == (1, 4)
assert log2phy.shape[0] == 1
assert log2phy.shape[1] == 4
assert logcnt.shape == (1, 4)
# Verify all logical experts are mapped
assert set(phy2log[0].tolist()) == {0, 1, 2, 3}
def test_equal_weights():
"""Test case with equal weights"""
weight = torch.tensor([[50, 50, 50, 50, 50, 50, 50, 50]])
num_replicas = 8
num_groups = 2
num_nodes = 2
num_gpus = 4
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
# Verify shapes
assert phy2log.shape == (1, 8)
assert logcnt.shape == (1, 8)
# With equal weights, each expert should have exactly one replica
assert torch.all(logcnt == 1), (
"With equal weights and no replication, "
"each expert should have exactly 1 replica"
)
def test_extreme_weight_imbalance():
"""Test extreme weight imbalance case"""
weight = torch.tensor([[1000, 1, 1, 1, 1, 1, 1, 1]])
num_replicas = 12
num_groups = 2
num_nodes = 2
num_gpus = 4
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
# Verify shapes
assert phy2log.shape == (1, 12)
assert logcnt.shape == (1, 8)
# Expert with highest weight (index 0) should have more replicas
assert logcnt[0, 0] > logcnt[0, 1], (
"Expert with highest weight should have more replicas"
)
def test_multiple_layers():
"""Test multiple layers case"""
weight = torch.tensor(
[
[10, 20, 30, 40, 50, 60], # First layer
[60, 50, 40, 30, 20, 10], # Second layer (opposite weight pattern)
[25, 25, 25, 25, 25, 25], # Third layer (equal weights)
]
)
num_replicas = 8
num_groups = 2
num_nodes = 2
num_gpus = 4
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
# Verify shapes
assert phy2log.shape == (3, 8)
assert logcnt.shape == (3, 6)
# Verify expert allocation is reasonable for each layer
for layer in range(3):
assert torch.all(phy2log[layer] >= 0) and torch.all(phy2log[layer] < 6), (
f"Layer {layer} physical to logical mappingshould be in range [0, 6)"
)
assert torch.sum(logcnt[layer]) == num_replicas, (
f"Layer {layer} total replicas should be {num_replicas}"
)
def test_parameter_validation():
"""Test parameter validation"""
weight = torch.tensor([[10, 20, 30, 40]])
# Test non-divisible case - this should handle normally without throwing
# errors because the function will fall back to global load balancing
# strategy
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(weight, 8, 3, 2, 4)
assert phy2log.shape == (1, 8)
assert logcnt.shape == (1, 4)
# Test cases that will actually cause errors:
# num_physical_experts not divisible by num_gpus
with pytest.raises(AssertionError):
DefaultEplbPolicy.rebalance_experts(weight, 7, 2, 2, 4) # 7 not divisible by 4
def test_small_scale_hierarchical():
"""Test small-scale hierarchical load balancing"""
weight = torch.tensor(
[
[100, 50, 200, 75, 150, 25, 300, 80], # 8 experts
]
)
num_replicas = 12
num_groups = 4 # 4 groups, 2 experts each
num_nodes = 2 # 2 nodes
num_gpus = 4 # 4 GPUs
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
# Verify basic constraints
assert phy2log.shape == (1, 12)
assert logcnt.shape == (1, 8)
assert torch.sum(logcnt) == num_replicas
assert torch.all(logcnt >= 1)
# Expert with highest weight should have more replicas
max_weight_expert = torch.argmax(weight[0])
assert logcnt[0, max_weight_expert] >= 2, (
"Highest weight expert should have multiple replicas"
)
def test_global_load_balance_fallback():
"""Test global load balancing fallback case"""
# When num_groups % num_nodes != 0, should fall back to global load
# balancing
weight = torch.tensor([[10, 20, 30, 40, 50, 60]])
num_replicas = 8
num_groups = 3 # Cannot be divided evenly by num_nodes=2
num_nodes = 2
num_gpus = 4
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
# Should work normally, just using global load balancing strategy
assert phy2log.shape == (1, 8)
assert logcnt.shape == (1, 6)
assert torch.sum(logcnt) == num_replicas
@pytest.mark.parametrize("device", ["cpu", "cuda"])
def test_device_compatibility(device):
"""Test device compatibility"""
if device == "cuda" and not torch.cuda.is_available():
pytest.skip("CUDA not available")
weight = torch.tensor([[10, 20, 30, 40]], device=device)
num_replicas = 6
num_groups = 2
num_nodes = 1
num_gpus = 2
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
# Function will convert to CPU internally, but should handle different
# device inputs normally
assert phy2log.shape == (1, 6)
assert logcnt.shape == (1, 4)
def test_additional_cases():
"""Test more edge cases and different parameter combinations"""
# Test case 1: Large-scale distributed setup
weight1 = torch.tensor(
[[50, 100, 75, 120, 90, 60, 80, 110, 40, 70, 95, 85, 65, 55, 45, 35]]
)
phy2log1, log2phy1, logcnt1 = DefaultEplbPolicy.rebalance_experts(
weight1, 24, 8, 4, 8
)
assert phy2log1.shape == (1, 24)
assert logcnt1.shape == (1, 16)
assert torch.sum(logcnt1) == 24
# Test case 2: Different weight distributions
weight2 = torch.tensor(
[
[200, 150, 100, 50, 25, 12], # Decreasing weights
[12, 25, 50, 100, 150, 200], # Increasing weights
]
)
phy2log2, log2phy2, logcnt2 = DefaultEplbPolicy.rebalance_experts(
weight2, 10, 3, 1, 2
)
assert phy2log2.shape == (2, 10)
assert logcnt2.shape == (2, 6)
# Verify high-weight experts have more replicas
for layer in range(2):
max_weight_idx = torch.argmax(weight2[layer])
assert logcnt2[layer, max_weight_idx] >= 2
if __name__ == "__main__":
weight = torch.tensor(
[
[90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86],
[20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27],
]
)
num_replicas = 16
num_groups = 4
num_nodes = 2
num_gpus = 8
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
print(phy2log)
test_basic_rebalance()

View File

@@ -0,0 +1,607 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import random
import pytest
import torch
import torch.distributed
from vllm.distributed.eplb.rebalance_execute import (
move_from_buffer,
rearrange_expert_weights_inplace,
transfer_layer,
)
from vllm.distributed.parallel_state import (
ensure_model_parallel_initialized,
get_tp_group,
)
from .eplb_utils import distributed_run, set_env_vars_and_device
def create_expert_indices_with_redundancy(
num_layers: int,
num_logical_experts: int,
total_physical_experts: int,
redundancy_config: list[int], # redundancy for each logical expert
) -> torch.Tensor:
"""
Create expert indices with redundancy.
Args:
num_layers: number of layers
num_logical_experts: number of logical experts
total_physical_experts: total number of physical experts
redundancy_config: redundancy for each logical expert
Returns:
indices: Shape (num_layers, total_physical_experts)
"""
assert sum(redundancy_config) == total_physical_experts
assert len(redundancy_config) == num_logical_experts
indices = torch.zeros(num_layers, total_physical_experts, dtype=torch.long)
for layer in range(num_layers):
physical_pos = 0
for logical_expert_id, redundancy in enumerate(redundancy_config):
for _ in range(redundancy):
indices[layer, physical_pos] = logical_expert_id
physical_pos += 1
# Shuffle the indices at dim 1
for layer in range(num_layers):
indices[layer] = indices[layer][torch.randperm(indices.shape[1])]
return indices
def create_expert_weights(
num_layers: int,
num_local_experts: int,
hidden_sizes: list[int],
rank: int,
device: torch.device,
physical_to_logical_mapping: torch.Tensor,
) -> list[list[torch.Tensor]]:
"""
Create fake expert weights tensor for testing.
Use `arange` to generate predictable weights values, based on logical
expert ID.
All replicas of the same logical expert should have the same weights.
Args:
physical_to_logical_mapping: Shape (num_layers, num_local_experts)
mapping[layer, physical_pos] = logical_expert_id
"""
expert_weights = []
for layer in range(num_layers):
layer_weights = []
for weight_idx, hidden_size in enumerate(hidden_sizes):
weight_tensor = torch.zeros(
num_local_experts, hidden_size, device=device, dtype=torch.float32
)
for local_expert in range(num_local_experts):
# Get the logical expert ID for this physical expert
global_pos = rank * num_local_experts + local_expert
logical_expert_id = physical_to_logical_mapping[
layer, global_pos
].item()
# Generate weights based on logical expert ID
# (so that all replicas of the same logical expert have the
# same weights)
base_value = logical_expert_id * 1000 + layer * 100 + weight_idx * 10
weight_tensor[local_expert] = torch.arange(
base_value,
base_value + hidden_size,
device=device,
dtype=torch.float32,
)
layer_weights.append(weight_tensor)
expert_weights.append(layer_weights)
return expert_weights
def create_redundancy_config(
num_logical_experts: int,
num_physical_experts: int,
) -> list[int]:
"""Create a redundancy configuration."""
redundancy_config = [1] * num_logical_experts
remaining = num_physical_experts - num_logical_experts
# Randomly assign the remaining physical experts to the logical experts
for _ in range(remaining):
redundancy_config[random.choice(range(num_logical_experts))] += 1
return redundancy_config
def verify_expert_weights_after_shuffle(
expert_weights: list[list[torch.Tensor]],
new_indices: torch.Tensor,
hidden_sizes: list[int],
ep_rank: int,
num_local_experts: int,
):
"""Verify the weights after shuffling are correct."""
num_layers = len(expert_weights)
for layer in range(num_layers):
for weight_idx, hidden_size in enumerate(hidden_sizes):
weight_tensor = expert_weights[layer][weight_idx]
for local_expert in range(num_local_experts):
# Calculate the global expert ID for this local expert
global_pos = ep_rank * num_local_experts + local_expert
expected_logical_expert = new_indices[layer, global_pos].item()
# Check if the weights are correct
actual_weights = weight_tensor[local_expert]
expected_base = (
expected_logical_expert * 1000 + layer * 100 + weight_idx * 10
)
expected_weights = torch.arange(
expected_base,
expected_base + hidden_size,
device=actual_weights.device,
dtype=actual_weights.dtype,
)
torch.testing.assert_close(
actual_weights,
expected_weights,
msg=f"Layer {layer}, weight {weight_idx},"
f"local expert {local_expert}: "
f"weights do not match. "
f"Expected logical expert {expected_logical_expert}",
)
def verify_redundant_experts_have_same_weights(
expert_weights: list[list[torch.Tensor]],
indices: torch.Tensor,
hidden_sizes: list[int],
world_size: int,
num_local_experts: int,
):
"""
Verify that all replicas of the same logical expert have the same weights.
"""
num_layers = len(expert_weights)
total_physical_experts = world_size * num_local_experts
for layer in range(num_layers):
# Collect weights for all physical experts for each weight matrix
all_weights: list[torch.Tensor] = []
for weight_idx, hidden_size in enumerate(hidden_sizes):
# Create tensor to store all expert weights
# Shape: [total_physical_experts, hidden_size]
gathered_weights = torch.zeros(
total_physical_experts,
hidden_size,
device=expert_weights[layer][weight_idx].device,
dtype=expert_weights[layer][weight_idx].dtype,
)
# Use all_gather to collect expert weights from current node
# expert_weights[layer][weight_idx] shape:
# [num_local_experts, hidden_size]
local_weights = expert_weights[layer][
weight_idx
] # [num_local_experts, hidden_size]
# Split tensor along dim 0 into a list for all_gather
gathered_weights_list = torch.chunk(gathered_weights, world_size, dim=0)
torch.distributed.all_gather(
# Output list: each element corresponds to one rank's weights
list(gathered_weights_list),
local_weights, # Input: current rank's local weights
)
all_weights.append(gathered_weights)
# Verify that all replicas of the same logical expert have the same
# weights
logical_expert_weights: dict[int, dict[int, torch.Tensor]] = {}
for physical_pos in range(total_physical_experts):
logical_expert_id = int(indices[layer, physical_pos].item())
if logical_expert_id not in logical_expert_weights:
# First time encountering this logical expert, save its weights
logical_expert_weights[logical_expert_id] = {
weight_idx: all_weights[weight_idx][physical_pos]
for weight_idx in range(len(hidden_sizes))
}
else:
# Verify that current physical expert's weights match the
# previously saved logical expert weights
for weight_idx in range(len(hidden_sizes)):
torch.testing.assert_close(
all_weights[weight_idx][physical_pos],
logical_expert_weights[logical_expert_id][weight_idx],
msg=f"Layer {layer}, weight {weight_idx},"
f"logical expert {logical_expert_id}: "
f"Physical expert {physical_pos} has different weights"
f"than expected",
)
def _test_async_transfer_layer_without_mtp_worker(
env,
world_size: int,
num_layers: int,
num_local_experts: int,
num_logical_experts: int,
) -> None:
set_env_vars_and_device(env)
ensure_model_parallel_initialized(
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
)
tp_group = get_tp_group()
ep_group = tp_group.device_group
ep_rank = torch.distributed.get_rank()
device = torch.device(f"cuda:{ep_rank}")
total_physical_experts = world_size * num_local_experts
hidden_sizes = [16, 32]
redundancy_config = create_redundancy_config(
num_logical_experts,
total_physical_experts,
)
old_indices = create_expert_indices_with_redundancy(
num_layers,
num_logical_experts,
total_physical_experts,
redundancy_config,
)
new_redundancy_config = create_redundancy_config(
num_logical_experts,
total_physical_experts,
)
new_indices = create_expert_indices_with_redundancy(
num_layers,
num_logical_experts,
total_physical_experts,
new_redundancy_config,
)
expert_weights = create_expert_weights(
num_layers,
num_local_experts,
hidden_sizes,
ep_rank,
device,
old_indices,
)
expert_buffer = [torch.empty_like(w) for w in expert_weights[0]]
cuda_stream = torch.cuda.Stream(device=device)
for layer_idx in range(num_layers):
is_unchanged, is_received_locally, experts_recv_loc = asyncio.run(
transfer_layer(
old_global_expert_indices=old_indices,
new_global_expert_indices=new_indices,
expert_weights=expert_weights,
expert_weights_buffer=expert_buffer,
ep_group=ep_group,
layer=layer_idx,
cuda_stream=cuda_stream,
)
)
cuda_stream.synchronize()
move_from_buffer(
expert_weights=expert_weights[layer_idx],
expert_weights_buffer=expert_buffer,
is_unchanged=is_unchanged,
is_received_locally=is_received_locally,
experts_recv_loc=experts_recv_loc,
new_indices=new_indices[layer_idx].tolist(),
ep_group=ep_group,
)
verify_expert_weights_after_shuffle(
expert_weights,
new_indices,
hidden_sizes,
ep_rank,
num_local_experts,
)
verify_redundant_experts_have_same_weights(
expert_weights,
new_indices,
hidden_sizes,
world_size,
num_local_experts,
)
def _test_rearrange_expert_weights_with_redundancy(
env, world_size, num_layers, num_local_experts, num_logical_experts
) -> None:
# Initialize model parallel (using tensor parallel as an entrypoint
# to expert parallel)
set_env_vars_and_device(env)
ensure_model_parallel_initialized(
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
)
ep_group = get_tp_group().cpu_group
ep_rank = torch.distributed.get_rank()
device = torch.device(f"cuda:{ep_rank}")
# Test parameters
total_physical_experts = world_size * num_local_experts
hidden_sizes = [32, 64] # Two different weight matrices
# Create old expert indices (with redundancy)
redundancy_config = create_redundancy_config(
num_logical_experts, total_physical_experts
)
old_indices = create_expert_indices_with_redundancy(
num_layers,
num_logical_experts,
total_physical_experts,
redundancy_config,
)
# Create new expert indices (with redundancy)
new_redundancy_config = create_redundancy_config(
num_logical_experts, total_physical_experts
)
new_indices = create_expert_indices_with_redundancy(
num_layers,
num_logical_experts,
total_physical_experts,
new_redundancy_config,
)
# Create expert weights
expert_weights = create_expert_weights(
num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
)
# Execute weight rearrangement
rearrange_expert_weights_inplace(
old_indices,
new_indices,
expert_weights,
ep_group,
is_profile=False,
)
# Verify the rearrangement result
verify_expert_weights_after_shuffle(
expert_weights,
new_indices,
hidden_sizes,
ep_rank,
num_local_experts,
)
verify_redundant_experts_have_same_weights(
expert_weights,
new_indices,
hidden_sizes,
world_size,
num_local_experts,
)
@pytest.mark.parametrize(
"world_size,num_layers,num_local_experts,num_logical_experts",
[
# 2 GPU, 2 experts per GPU
# 3 logical experts, 4 physical experts, 1 redundant experts
(2, 1, 2, 3),
# 2 GPU, 3 experts per GPU
# 4 logical experts, 6 physical experts, 2 redundant experts
(2, 2, 3, 4),
# 2 GPU, 8 experts per GPU
# 16 logical experts, 16 physical experts, 0 redundant experts
(2, 4, 8, 16),
# 4 GPU, 2 experts per GPU
# 6 logical experts, 8 physical experts, 2 redundant experts
(4, 1, 2, 6),
# 4 GPU, 2 experts per GPU
# 5 logical experts, 8 physical experts, 3 redundant experts
(4, 2, 2, 5),
# 4 GPU, 8 experts per GPU
# 16 logical experts, 32 physical experts, 16 redundant experts
(4, 8, 8, 16),
],
)
def test_rearrange_expert_weights_with_redundancy(
world_size, num_layers, num_local_experts, num_logical_experts
):
"""Test the functionality of rearranging expert weights with redundancy."""
if torch.cuda.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test")
distributed_run(
_test_rearrange_expert_weights_with_redundancy,
world_size,
num_layers,
num_local_experts,
num_logical_experts,
)
def _test_rearrange_expert_weights_no_change(env, world_size) -> None:
set_env_vars_and_device(env)
ensure_model_parallel_initialized(
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
)
ep_group = get_tp_group().cpu_group
ep_rank = torch.distributed.get_rank()
device = torch.device(f"cuda:{ep_rank}")
num_layers = 2
num_local_experts = 2
total_physical_experts = world_size * num_local_experts
num_logical_experts = total_physical_experts // 2 # Some redundancy
hidden_sizes = [32, 64]
# Create redundancy configuration
redundancy_config = [2] * num_logical_experts
# Same indices - no change
indices = create_expert_indices_with_redundancy(
num_layers, num_logical_experts, total_physical_experts, redundancy_config
)
expert_weights = create_expert_weights(
num_layers, num_local_experts, hidden_sizes, ep_rank, device, indices
)
# Save original weights
original_weights = []
for layer_weights in expert_weights:
layer_copy = []
for weight in layer_weights:
layer_copy.append(weight.clone())
original_weights.append(layer_copy)
# Execute rearrangement (should be no change)
rearrange_expert_weights_inplace(
indices,
indices, # Same indices
expert_weights,
ep_group,
is_profile=False,
)
# Verify that the weights have not changed
for layer in range(num_layers):
for weight_idx in range(len(hidden_sizes)):
torch.testing.assert_close(
expert_weights[layer][weight_idx],
original_weights[layer][weight_idx],
msg=f"""Layer {layer}, weight {weight_idx}
should remain unchanged""",
)
@pytest.mark.parametrize(
"world_size,num_layers,num_local_experts,num_logical_experts",
[
(2, 2, 2, 3),
],
)
def test_async_transfer_layer_without_mtp(
world_size: int,
num_layers: int,
num_local_experts: int,
num_logical_experts: int,
):
"""Exercise async EPLB transfer path without MTP/spec decode."""
if torch.cuda.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test")
distributed_run(
_test_async_transfer_layer_without_mtp_worker,
world_size,
num_layers,
num_local_experts,
num_logical_experts,
)
@pytest.mark.parametrize("world_size", [2, 4])
def test_rearrange_expert_weights_no_change(world_size):
"""
Test that when the indices do not change, the weights should remain
unchanged.
"""
if torch.cuda.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test")
distributed_run(_test_rearrange_expert_weights_no_change, world_size)
def _test_rearrange_expert_weights_profile_mode(env, world_size) -> None:
set_env_vars_and_device(env)
ensure_model_parallel_initialized(
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
)
ep_group = get_tp_group().cpu_group
ep_rank = torch.distributed.get_rank()
device = torch.device(f"cuda:{ep_rank}")
num_layers = 1
num_local_experts = 2
total_physical_experts = world_size * num_local_experts
num_logical_experts = total_physical_experts // 2
hidden_sizes = [32]
# Create different index distributions
old_redundancy = create_redundancy_config(
num_logical_experts, total_physical_experts
)
new_redundancy = create_redundancy_config(
num_logical_experts, total_physical_experts
)
old_indices = create_expert_indices_with_redundancy(
num_layers, num_logical_experts, total_physical_experts, old_redundancy
)
new_indices = create_expert_indices_with_redundancy(
num_layers, num_logical_experts, total_physical_experts, new_redundancy
)
expert_weights = create_expert_weights(
num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
)
# Save original weights
original_weights = []
for layer_weights in expert_weights:
layer_copy = []
for weight in layer_weights:
layer_copy.append(weight.clone())
original_weights.append(layer_copy)
# Execute profile mode rearrangement
rearrange_expert_weights_inplace(
old_indices,
new_indices,
expert_weights,
ep_group,
is_profile=True, # Profile mode
)
# In profile mode, the weights should remain unchanged
for layer in range(num_layers):
for weight_idx in range(len(hidden_sizes)):
torch.testing.assert_close(
expert_weights[layer][weight_idx],
original_weights[layer][weight_idx],
msg="In profile mode, the weights should remain unchanged",
)
@pytest.mark.parametrize("world_size", [2, 4])
def test_rearrange_expert_weights_profile_mode(world_size):
"""Test profile mode (should not copy actual weights)"""
if torch.cuda.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test")
distributed_run(_test_rearrange_expert_weights_profile_mode, world_size)

View File

@@ -0,0 +1,285 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Test that the interaction between EPLB and FusedMoE Layer is okay
from dataclasses import dataclass
import pytest
import torch
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace
from vllm.distributed.parallel_state import (
ensure_model_parallel_initialized,
get_tp_group,
)
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
from .eplb_utils import distributed_run, set_env_vars_and_device
@dataclass
class TestConfig:
num_layers: int
num_experts: int
num_local_experts: int
num_topk: int
hidden_size: int
intermediate_size: int
weight_dtype: torch.dtype
weight_scale_dtype: torch.dtype | None
column_major_scales: bool
def make_expert_weights(
layer_idx: int,
global_expert_idx: int,
global_num_experts: int,
tensor_shape: tuple[int, ...],
tensor_dtype: torch.dtype,
tensor_device: torch.device,
is_column_major: bool,
) -> torch.Tensor:
assert len(tensor_shape) == 2
if is_column_major:
tensor_shape = (tensor_shape[1], tensor_shape[0])
x = torch.empty(tensor_shape, dtype=tensor_dtype, device=tensor_device)
value_offset = (layer_idx * global_num_experts + global_expert_idx) * x.numel()
x.view(-1).copy_(
torch.arange(
value_offset,
value_offset + x.numel(),
dtype=tensor_dtype,
device=tensor_device,
)
)
if is_column_major:
x = torch.transpose(x, 1, 0)
assert not x.is_contiguous()
return x
def make_fused_moe_layer(
rank: int,
layer_idx: int,
test_config: TestConfig,
) -> FusedMoE:
fml = FusedMoE(
num_experts=test_config.num_experts,
top_k=test_config.num_topk,
hidden_size=test_config.hidden_size,
intermediate_size=test_config.intermediate_size,
prefix=f"dummy_layer_{layer_idx}",
activation="silu",
is_act_and_mul=True,
params_dtype=test_config.weight_dtype,
)
device = torch.device(f"cuda:{rank}")
from functools import partial
_make_expert_weights = partial(
make_expert_weights,
layer_idx=layer_idx,
global_num_experts=test_config.num_experts,
tensor_device=device,
)
assert isinstance(fml.w13_weight.data, torch.Tensor)
assert isinstance(fml.w2_weight.data, torch.Tensor)
fml.w13_weight.data = fml.w13_weight.data.to(device=device)
fml.w2_weight.data = fml.w2_weight.data.to(device=device)
w13_weight = fml.w13_weight.data
w2_weight = fml.w2_weight.data
assert w13_weight.size(0) == test_config.num_local_experts
for i in range(test_config.num_local_experts):
g_i = rank * test_config.num_local_experts + i
w13_weight_e = w13_weight[i]
w2_weight_e = w2_weight[i]
w13_weight_e.copy_(
_make_expert_weights(
global_expert_idx=g_i,
tensor_shape=w13_weight_e.shape,
tensor_dtype=w13_weight_e.dtype,
is_column_major=False,
)
)
w2_weight_e.copy_(
_make_expert_weights(
global_expert_idx=g_i,
tensor_shape=w2_weight_e.shape,
tensor_dtype=w2_weight_e.dtype,
is_column_major=False,
)
)
block_size = 16
def block_quant_scales_shape(
shape: tuple[int, ...], is_column_major: bool
) -> tuple[int, ...]:
assert len(shape) == 3
if not is_column_major:
return (shape[0], shape[1] // block_size, shape[2] // block_size)
else:
return (shape[0], shape[2] // block_size, shape[1] // block_size)
is_column_major = test_config.column_major_scales
w13_weight_scale_inv = torch.empty(
block_quant_scales_shape(w13_weight.shape, is_column_major),
dtype=test_config.weight_dtype,
device=device,
)
w2_weight_scale_inv = torch.empty(
block_quant_scales_shape(w2_weight.shape, is_column_major),
dtype=test_config.weight_dtype,
device=device,
)
for i in range(test_config.num_local_experts):
g_i = rank * test_config.num_local_experts + i
w13_s_e = w13_weight_scale_inv[i]
w2_s_e = w2_weight_scale_inv[i]
w13_s_e.copy_(
_make_expert_weights(
global_expert_idx=g_i,
tensor_shape=w13_s_e.shape,
tensor_dtype=w13_s_e.dtype,
# Fill data in row-major and then
# transpose if test_config requires col-major.
is_column_major=False,
)
)
w2_s_e.copy_(
_make_expert_weights(
global_expert_idx=g_i,
tensor_shape=w2_s_e.shape,
tensor_dtype=w2_s_e.dtype,
is_column_major=False,
)
)
if is_column_major:
w13_weight_scale_inv = torch.transpose(w13_weight_scale_inv, 1, 2)
w2_weight_scale_inv = torch.transpose(w2_weight_scale_inv, 1, 2)
assert not w13_weight_scale_inv.is_contiguous()
assert not w2_weight_scale_inv.is_contiguous()
# Add scales to the parameter list
fml.w13_weight_scale_inv = torch.nn.Parameter(
w13_weight_scale_inv, requires_grad=False
)
fml.w2_weight_scale_inv = torch.nn.Parameter(
w2_weight_scale_inv, requires_grad=False
)
return fml
def _test_eplb_fml(env, world_size: int, test_config: TestConfig):
# Initialize model parallel (using tensor parallel as an entrypoint
# to expert parallel)
set_env_vars_and_device(env)
vllm_config = VllmConfig()
vllm_config.parallel_config.tensor_parallel_size = world_size
vllm_config.parallel_config.enable_expert_parallel = True
with set_current_vllm_config(vllm_config):
ensure_model_parallel_initialized(
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
)
ep_group = get_tp_group().cpu_group
ep_rank = torch.distributed.get_rank()
fml_layers = [
make_fused_moe_layer(ep_rank, layer_idx, test_config)
for layer_idx in range(test_config.num_layers)
]
rank_expert_weights = [fml.get_expert_weights() for fml in fml_layers]
indices = torch.zeros(
test_config.num_layers, test_config.num_experts, dtype=torch.long
)
for lidx in range(test_config.num_layers):
indices[lidx] = torch.Tensor(range(test_config.num_experts))
shuffled_indices = torch.zeros_like(indices)
for lidx in range(test_config.num_layers):
shuffled_indices[lidx] = torch.randperm(test_config.num_experts)
rearrange_expert_weights_inplace(
indices,
shuffled_indices,
rank_expert_weights,
ep_group,
is_profile=False,
)
num_local_experts = test_config.num_local_experts
num_global_experts = test_config.num_experts
for lidx, fml in enumerate(fml_layers):
for name, w in fml.named_parameters():
for e in range(num_local_experts):
g_e = shuffled_indices[lidx][ep_rank * num_local_experts + e]
ref = make_expert_weights(
layer_idx=lidx,
global_expert_idx=int(g_e.item()),
global_num_experts=num_global_experts,
tensor_shape=w[e].shape,
tensor_dtype=w[e].dtype,
tensor_device=w[e].device,
is_column_major=not w[e].is_contiguous(),
)
assert w[e].shape == ref.shape and w[e].stride() == ref.stride(), (
f"w[{e}] {w[e].size()} {w[e].stride()} vs "
f"ref {ref.size()} {ref.stride()}"
)
torch.testing.assert_close(w[e], ref)
@pytest.mark.parametrize("world_size", [2])
@pytest.mark.parametrize("num_layers", [4])
@pytest.mark.parametrize("num_experts", [16])
@pytest.mark.parametrize("hidden_size", [256])
@pytest.mark.parametrize("intermediate_size", [256])
@pytest.mark.parametrize("column_major_scales", [True, False])
def test_eplb_fml(
world_size: int,
num_layers: int,
num_experts: int,
hidden_size: int,
intermediate_size: int,
column_major_scales: bool,
):
if torch.cuda.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test")
num_local_experts = num_experts // world_size
num_topk = 4
# The dtypes are fine as we are essentially just checking data-copies
weight_dtype = torch.bfloat16
weight_scale_dtype = torch.bfloat16
test_config = TestConfig(
num_layers=num_layers,
num_experts=num_experts,
num_local_experts=num_local_experts,
num_topk=num_topk,
hidden_size=hidden_size,
intermediate_size=intermediate_size,
weight_dtype=weight_dtype,
weight_scale_dtype=weight_scale_dtype,
column_major_scales=column_major_scales,
)
distributed_run(
_test_eplb_fml,
world_size,
test_config,
)

View File

@@ -0,0 +1,276 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Test that the interaction between EPLB and FusedMoE Layer is okay for DP w/ NVFP4
from dataclasses import dataclass
import pytest
import torch
from tests.kernels.moe.utils import make_test_quant_config
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace
from vllm.distributed.parallel_state import (
ensure_model_parallel_initialized,
get_dp_group,
)
from vllm.forward_context import set_forward_context
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
from vllm.model_executor.layers.quantization.modelopt import (
ModelOptNvFp4Config,
ModelOptNvFp4FusedMoE,
)
from .eplb_utils import distributed_run, set_env_vars_and_device
@dataclass
class TestConfig:
num_layers: int
num_experts: int
num_local_experts: int
num_topk: int
hidden_size: int
intermediate_size: int
num_tokens: int
def make_fused_moe_layer(
rank: int,
layer_idx: int,
test_config: TestConfig,
) -> FusedMoE:
quant_config = None
device = torch.device(f"cuda:{rank}")
quant_config = ModelOptNvFp4Config(
is_checkpoint_nvfp4_serialized=True,
kv_cache_quant_algo=None,
exclude_modules=[],
)
fml = FusedMoE(
num_experts=test_config.num_experts,
top_k=test_config.num_topk,
hidden_size=test_config.hidden_size,
intermediate_size=test_config.intermediate_size,
prefix=f"dummy_layer_{layer_idx}",
activation="silu",
is_act_and_mul=True,
params_dtype=torch.bfloat16,
quant_config=quant_config,
)
nvfp4_fused_moe = ModelOptNvFp4FusedMoE(quant_config, fml)
nvfp4_fused_moe.create_weights(
fml,
test_config.num_local_experts,
test_config.hidden_size,
test_config.intermediate_size,
params_dtype=torch.uint8,
global_num_experts=test_config.num_experts,
)
fml = fml.to(device)
w1_q, w2_q, quant_config = make_test_quant_config(
test_config.num_local_experts,
test_config.intermediate_size,
test_config.hidden_size,
in_dtype=torch.bfloat16,
quant_dtype="nvfp4",
block_shape=None,
per_act_token_quant=False,
)
fml.w13_weight.data = w1_q
fml.w2_weight.data = w2_q
fml.w2_input_scale.data = torch.randn_like(fml.w2_input_scale.data) / 5
fml.w13_input_scale.data = torch.randn_like(fml.w13_input_scale.data) / 5
fml.w2_weight_scale_2.data = torch.randn_like(fml.w2_weight_scale_2.data) / 5
fml.w13_weight_scale_2.data = torch.randn_like(fml.w13_weight_scale_2.data) / 5
fml.w2_weight_scale.data = (
torch.randn(fml.w2_weight_scale.data.shape, device=device) / 5
).to(fml.w2_weight_scale.data.dtype)
fml.w13_weight_scale.data = (
torch.randn(fml.w13_weight_scale.data.shape, device=device) / 5
).to(fml.w13_weight_scale.data.dtype)
nvfp4_fused_moe.process_weights_after_loading(fml)
fml.maybe_init_modular_kernel()
return fml
def _test_eplb_fml(env, world_size: int, test_config: TestConfig):
set_env_vars_and_device(env)
vllm_config = VllmConfig()
vllm_config.parallel_config.data_parallel_size = world_size
vllm_config.parallel_config.enable_expert_parallel = True
with set_current_vllm_config(vllm_config):
ensure_model_parallel_initialized(
tensor_model_parallel_size=1, pipeline_model_parallel_size=1
)
ep_group = get_dp_group().cpu_group
ep_rank = torch.distributed.get_rank()
device = torch.device(f"cuda:{ep_rank}")
fml_layers = [
make_fused_moe_layer(ep_rank, layer_idx, test_config).to(device)
for layer_idx in range(test_config.num_layers)
]
rank_expert_weights = [fml.get_expert_weights() for fml in fml_layers]
hidden_states = []
router_logits = []
for layer_idx in range(test_config.num_layers):
hidden_states.append(
torch.randn(
(test_config.num_tokens, test_config.hidden_size),
dtype=torch.bfloat16,
device=device,
)
)
router_logits.append(
torch.randn(
(test_config.num_tokens, test_config.num_experts),
dtype=torch.bfloat16,
device=device,
)
)
out_before_shuffle = []
with set_forward_context(
{},
num_tokens=test_config.num_tokens,
num_tokens_across_dp=torch.tensor(
[test_config.num_tokens] * world_size, device="cpu", dtype=torch.int
),
vllm_config=vllm_config,
):
for lidx, fml in enumerate(fml_layers):
out_before_shuffle.append(
fml(hidden_states[lidx].clone(), router_logits[lidx].clone())
)
indices = torch.zeros(
test_config.num_layers, test_config.num_experts, dtype=torch.long
)
for lidx in range(test_config.num_layers):
indices[lidx] = torch.Tensor(range(test_config.num_experts))
shuffled_indices = torch.zeros_like(indices)
for lidx in range(test_config.num_layers):
shuffled_indices[lidx] = torch.randperm(test_config.num_experts)
rearrange_expert_weights_inplace(
indices,
shuffled_indices,
rank_expert_weights,
ep_group,
is_profile=False,
)
num_global_experts = test_config.num_experts
logical_to_physical_map_list = []
for lidx, fml in enumerate(fml_layers):
physical_to_logical_map = shuffled_indices[lidx].to(device)
logical_to_physical_map = torch.empty(
(num_global_experts,), dtype=torch.int32, device=device
)
logical_to_physical_map[physical_to_logical_map] = torch.arange(
0, num_global_experts, dtype=torch.int32, device=device
)
logical_to_physical_map_list.append(
logical_to_physical_map.reshape(num_global_experts, 1)
)
logical_to_physical_map = torch.stack(logical_to_physical_map_list)
for lidx, fml in enumerate(fml_layers):
logical_replica_count = torch.ones(
(test_config.num_layers, num_global_experts),
dtype=torch.int32,
device=device,
)
fml.enable_eplb = True
fml.set_eplb_state(
lidx,
torch.zeros(
(test_config.num_layers, num_global_experts),
dtype=torch.int32,
device=device,
),
logical_to_physical_map,
logical_replica_count,
)
out_after_shuffle = []
with set_forward_context(
{},
num_tokens=test_config.num_tokens,
num_tokens_across_dp=torch.tensor(
[test_config.num_tokens] * world_size, device="cpu", dtype=torch.int
),
vllm_config=vllm_config,
):
for lidx, fml in enumerate(fml_layers):
out_after_shuffle.append(
fml(hidden_states[lidx].clone(), router_logits[lidx].clone())
)
for lidx in range(test_config.num_layers):
torch.testing.assert_close(
out_before_shuffle[lidx], out_after_shuffle[lidx], atol=1e-1, rtol=1e-1
)
@pytest.mark.parametrize("world_size", [2, 4])
@pytest.mark.parametrize("num_layers", [8])
@pytest.mark.parametrize("num_experts", [32])
@pytest.mark.parametrize("hidden_size", [256])
@pytest.mark.parametrize("intermediate_size", [256])
@pytest.mark.parametrize("num_tokens", [256])
@pytest.mark.parametrize("backend", ["latency", "throughput"])
def test_eplb_fml(
world_size: int,
num_layers: int,
num_experts: int,
hidden_size: int,
intermediate_size: int,
num_tokens: int,
backend: str,
monkeypatch,
):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", backend)
if torch.cuda.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test")
num_local_experts = num_experts // world_size
num_topk = 4
test_config = TestConfig(
num_layers=num_layers,
num_experts=num_experts,
num_local_experts=num_local_experts,
num_topk=num_topk,
hidden_size=hidden_size,
intermediate_size=intermediate_size,
num_tokens=num_tokens,
)
distributed_run(
_test_eplb_fml,
world_size,
test_config,
)

View File

@@ -0,0 +1,142 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import lm_eval
import pytest
from tests.utils import large_gpu_mark
from vllm.platforms import current_platform
def get_model_args(
model_name: str,
spec_model_name: str | None,
spec_method: str,
tp_size: int,
model_max_len: int,
use_async: bool = False,
) -> dict:
speculative_config = {
"method": spec_method,
"model": spec_model_name,
"num_speculative_tokens": 1,
"max_model_len": model_max_len,
}
eplb_config = {
"num_redundant_experts": tp_size,
"window_size": 128,
"step_interval": 1024,
"log_balancedness": False,
}
if use_async:
eplb_config["use_async"] = True
model_args = {
"pretrained": model_name,
"dtype": "auto",
"add_bos_token": True,
"tensor_parallel_size": tp_size,
"gpu_memory_utilization": 0.7,
"speculative_config": speculative_config,
"enable_expert_parallel": True,
"eplb_config": eplb_config,
"enable_eplb": True,
"max_model_len": model_max_len,
}
return model_args
pytestmark = pytest.mark.skipif(
current_platform.is_rocm(),
reason="EPLB with Spec Decode is a work in progress on ROCm.",
)
@pytest.mark.parametrize(
"model_setup",
[
pytest.param(
("mtp", "Qwen/Qwen3-Next-80B-A3B-Instruct", None, 4, 0.86),
marks=large_gpu_mark(min_gb=80),
),
pytest.param(
(
"eagle",
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
4,
0.92,
),
marks=pytest.mark.skip(reason="Skipping due to CI OOM issues"),
),
],
ids=["qwen3_next_mtp", "llama4_eagle"],
)
def test_eplb_spec_decode(
monkeypatch: pytest.MonkeyPatch,
model_setup: tuple[str, str, str, int, float],
):
"""
Test the correctness of EPLB speculative decoding with GSM8K dataset.
Applicable to MoE models with mtp or eagle spec decode.
"""
method, model_name, spec_model_name, tp_size, expected_gsm8k_value = model_setup
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
RTOL = 0.03
model_args = get_model_args(
model_name=model_name,
spec_model_name=spec_model_name,
spec_method=method,
tp_size=tp_size,
model_max_len=4096,
)
results = lm_eval.simple_evaluate(
model="vllm",
model_args=model_args,
tasks=TASK,
batch_size=64,
num_fewshot=8,
)
measured_value = results["results"][TASK][FILTER]
assert (
measured_value - RTOL < expected_gsm8k_value
and measured_value + RTOL > expected_gsm8k_value
), f"Expected: {expected_gsm8k_value} | Measured: {measured_value}"
@large_gpu_mark(min_gb=80)
def test_eplb_spec_decode_qwen3_next_mtp_async() -> None:
"""
Ensure async EPLB works with MTP speculative decoding for Qwen3-Next.
"""
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
RTOL = 0.03
expected_gsm8k_value = 0.86
model_args = get_model_args(
model_name="Qwen/Qwen3-Next-80B-A3B-Instruct",
spec_model_name=None,
spec_method="mtp",
tp_size=4,
model_max_len=4096,
use_async=True,
)
results = lm_eval.simple_evaluate(
model="vllm",
model_args=model_args,
tasks=TASK,
batch_size=64,
num_fewshot=8,
)
measured_value = results["results"][TASK][FILTER]
assert (
measured_value - RTOL < expected_gsm8k_value
and measured_value + RTOL > expected_gsm8k_value
), f"Expected: {expected_gsm8k_value} | Measured: {measured_value}"

View File

@@ -0,0 +1,314 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import threading
import time
import msgspec
import pytest
from vllm.distributed.kv_events import (
EventBatch,
EventPublisherFactory,
NullEventPublisher,
)
DP_RANK = 0
class EventSample(
msgspec.Struct,
tag=True, # type: ignore
array_like=True, # type: ignore
):
"""Test event for publisher testing"""
id: int
value: str
class SampleBatch(EventBatch):
"""Test event batch for publisher testing"""
events: list[EventSample]
def create_test_events(count: int) -> SampleBatch:
"""Create a batch of test events"""
events = [EventSample(id=i, value=f"test-{i}") for i in range(count)]
return SampleBatch(ts=time.time(), events=events)
def test_basic_publishing(publisher, subscriber):
"""Test basic event publishing works"""
test_batch = create_test_events(5)
publisher.publish(test_batch)
result = subscriber.receive_one(timeout=1000)
assert result is not None, "No message received"
seq, received = result
assert seq == 0, "Sequence number mismatch"
assert received.ts == pytest.approx(test_batch.ts, abs=0.1), "Timestamp mismatch"
assert len(received.events) == len(test_batch.events), "Number of events mismatch"
for i, event in enumerate(received.events):
assert event.id == i, "Event id mismatch"
assert event.value == f"test-{i}", "Event value mismatch"
def test_multiple_events(publisher, subscriber):
"""Test publishing and receiving multiple event batches"""
for _ in range(10):
batch = create_test_events(2)
publisher.publish(batch)
received = []
for _ in range(10):
data = subscriber.receive_one(timeout=100)
if data:
received.append(data)
assert len(received) == 10, "Number of messages mismatch"
seqs = [seq for seq, _ in received]
assert seqs == list(range(10)), "Sequence numbers mismatch"
def test_replay_mechanism(publisher, subscriber):
"""Test the replay mechanism works correctly"""
for _ in range(19):
batch = create_test_events(1)
publisher.publish(batch)
time.sleep(0.5) # Need publisher to process above requests
subscriber.request_replay(10)
batch = create_test_events(1)
publisher.publish(batch) # 20th message
replayed = subscriber.receive_replay()
assert len(replayed) > 0, "No replayed messages received"
seqs = [seq for seq, _ in replayed]
assert all(seq >= 10 for seq in seqs), "Replayed messages not in order"
assert seqs == list(range(min(seqs), max(seqs) + 1)), (
"Replayed messages not consecutive"
)
def test_buffer_limit(publisher, subscriber, publisher_config):
"""Test buffer limit behavior"""
buffer_size = publisher_config.buffer_steps
# Publish more events than the buffer can hold
for i in range(buffer_size + 10):
batch = create_test_events(1)
publisher.publish(batch)
time.sleep(0.5) # Need publisher to process above requests
subscriber.request_replay(0)
batch = create_test_events(1)
publisher.publish(batch)
replayed = subscriber.receive_replay()
assert len(replayed) <= buffer_size, "Can't replay more than buffer size"
oldest_seq = min(seq for seq, _ in replayed)
assert oldest_seq >= 10, "The oldest sequence should be at least 10"
def test_topic_filtering(publisher_config):
"""
Test that a subscriber only receives messages matching its topic filter
"""
publisher_config.replay_endpoint = None
publisher_config.topic = "foo"
pub = EventPublisherFactory.create(publisher_config, DP_RANK)
from .conftest import MockSubscriber
sub_foo = MockSubscriber(publisher_config.endpoint, None, "foo")
sub_bar = MockSubscriber(publisher_config.endpoint, None, "bar")
try:
time.sleep(0.1)
for _ in range(3):
pub.publish(create_test_events(1))
foo_received = [sub_foo.receive_one(timeout=200) for _ in range(3)]
assert all(msg is not None for msg in foo_received), (
"Subscriber with matching topic should receive messages"
)
bar_received = [sub_bar.receive_one(timeout=200) for _ in range(3)]
assert all(msg is None for msg in bar_received), (
"Subscriber with non-matching topic should receive no messages"
)
finally:
pub.shutdown()
sub_foo.close()
sub_bar.close()
def test_high_volume(publisher, subscriber):
"""Test publishing and receiving a high volume of events"""
num_batches = 10_000
events_per_batch = 100
# Publish events in a separate thread to not block
def publish_events():
for i in range(num_batches):
batch = create_test_events(events_per_batch)
publisher.publish(batch)
# Small delay to avoid overwhelming
if i % 100 == 0:
time.sleep(0.01)
received: list[tuple[int, SampleBatch]] = []
publisher_thread = threading.Thread(target=publish_events)
publisher_thread.start()
start_time = time.time()
while len(received) < num_batches:
if time.time() - start_time > 10: # Timeout after 10 seconds
break
result = subscriber.receive_one(timeout=100)
if result:
received.append(result)
publisher_thread.join()
assert len(received) >= num_batches * 0.9, "We should have received most messages"
seqs = [seq for seq, _ in received]
assert sorted(seqs) == seqs, "Sequence numbers should be in order"
def test_null_publisher():
"""Test that NullEventPublisher can be used without errors"""
publisher = NullEventPublisher(DP_RANK)
# This should not raise any errors
batch = create_test_events(5)
publisher.publish(batch)
publisher.shutdown()
def test_data_parallel_rank_tagging(publisher_config):
"""Test that events are properly tagged with their data parallel rank"""
publisher_config.topic = "foo"
pub_0 = EventPublisherFactory.create(publisher_config, DP_RANK)
pub_1 = EventPublisherFactory.create(publisher_config, DP_RANK + 1)
# Hardcode the expected endpoints based on port offsetting behavior
# Both ranks get offsets according to _offset_endpoint_port function
base_endpoint = publisher_config.endpoint
if "tcp://" in base_endpoint:
# For TCP endpoints: tcp://localhost:5557 -> tcp://localhost:5557, tcp://localhost:5558
expected_endpoint_0 = base_endpoint # rank 0 gets port + 0 = same port
expected_endpoint_1 = base_endpoint.replace(
":5557", ":5558"
) # rank 1 gets port + 1
else:
# For inproc endpoints: inproc://test -> inproc://test_dp0, inproc://test_dp1
expected_endpoint_0 = base_endpoint # rank 0 gets base
expected_endpoint_1 = base_endpoint + "_dp1" # rank 1 gets _dp1
from .conftest import MockSubscriber
sub_0 = MockSubscriber(expected_endpoint_0, None, publisher_config.topic)
sub_1 = MockSubscriber(expected_endpoint_1, None, publisher_config.topic)
try:
time.sleep(0.1) # Let publishers start up
# Publish events from different ranks
batch_0 = create_test_events(2)
batch_1 = create_test_events(3)
pub_0.publish(batch_0)
pub_1.publish(batch_1)
# Receive events from rank 0
result_0 = sub_0.receive_one(timeout=200)
assert result_0 is not None, "No message received from rank 0"
seq_0, received_0 = result_0
# Receive events from rank 1
result_1 = sub_1.receive_one(timeout=200)
assert result_1 is not None, "No message received from rank 1"
seq_1, received_1 = result_1
# Verify DP rank tagging
assert received_0.data_parallel_rank == 0, (
f"Expected DP rank 0, got {received_0.data_parallel_rank}"
)
assert received_1.data_parallel_rank == 1, (
f"Expected DP rank 1, got {received_1.data_parallel_rank}"
)
# Verify event content is correct
assert len(received_0.events) == 2, "Wrong number of events from rank 0"
assert len(received_1.events) == 3, "Wrong number of events from rank 1"
finally:
pub_0.shutdown()
pub_1.shutdown()
sub_0.close()
sub_1.close()
def test_event_publisher_factory():
"""Test event publisher factory creation behavior under different configurations"""
from vllm.config.kv_events import KVEventsConfig
from vllm.distributed.kv_events import ZmqEventPublisher
# test config is None
publisher = EventPublisherFactory.create(None, DP_RANK)
assert isinstance(publisher, NullEventPublisher)
publisher.shutdown()
# test disable kv cache events
config = KVEventsConfig(
enable_kv_cache_events=False,
publisher="zmq", # Even if zmq is specified, should return NullEventPublisher
endpoint="tcp://localhost:5557",
)
publisher = EventPublisherFactory.create(config, DP_RANK)
assert isinstance(publisher, NullEventPublisher)
publisher.shutdown()
# test zmq publisher
config = KVEventsConfig(
enable_kv_cache_events=True,
publisher="zmq",
endpoint="inproc://test-factory-true",
)
publisher = EventPublisherFactory.create(config, DP_RANK)
assert isinstance(publisher, ZmqEventPublisher)
publisher.shutdown()
# test unknown publisher
with pytest.raises(ValueError, match="Input should be"):
KVEventsConfig(
enable_kv_cache_events=True,
publisher="unknown_publisher",
endpoint="tcp://localhost:5557",
)
# test publisher not specified
config = KVEventsConfig(
enable_kv_cache_events=True,
# publisher not specified, should default to "zmq"
endpoint="tcp://localhost:5557",
)
publisher = EventPublisherFactory.create(config, DP_RANK)
assert isinstance(publisher, ZmqEventPublisher)
publisher.shutdown()

View File

@@ -0,0 +1,231 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from typing import Literal, NamedTuple
import pytest
from vllm.config.model import RunnerOption
from vllm.logger import init_logger
from ..utils import compare_two_settings, create_new_process_for_each_test
logger = init_logger("test_expert_parallel")
class ParallelSetup(NamedTuple):
tp_size: int
eager_mode: bool
chunked_prefill: bool
class EPTestOptions(NamedTuple):
trust_remote_code: bool
tokenizer_mode: str | None
load_format: str | None = None
hf_overrides: str | None = None
@dataclass
class EPTestSettings:
parallel_setups: list[ParallelSetup]
distributed_backends: list[str]
runner: RunnerOption
test_options: EPTestOptions
@staticmethod
def detailed(
*,
tp_base: int = 2,
runner: RunnerOption = "auto",
trust_remote_code: bool = False,
tokenizer_mode: str | None = None,
load_format: str | None = None,
hf_overrides: str | None = None,
):
return EPTestSettings(
parallel_setups=[
ParallelSetup(tp_size=tp_base, eager_mode=False, chunked_prefill=False),
ParallelSetup(tp_size=tp_base, eager_mode=False, chunked_prefill=True),
ParallelSetup(tp_size=tp_base, eager_mode=True, chunked_prefill=False),
ParallelSetup(
tp_size=2 * tp_base, eager_mode=False, chunked_prefill=True
),
ParallelSetup(
tp_size=2 * tp_base, eager_mode=True, chunked_prefill=False
),
],
distributed_backends=["mp", "ray"],
runner=runner,
test_options=EPTestOptions(
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
load_format=load_format,
hf_overrides=hf_overrides,
),
)
@staticmethod
def fast(
*,
tp_base: int = 2,
runner: RunnerOption = "auto",
trust_remote_code: bool = False,
tokenizer_mode: str | None = None,
load_format: str | None = None,
hf_overrides: str | None = None,
):
return EPTestSettings(
parallel_setups=[
ParallelSetup(tp_size=tp_base, eager_mode=True, chunked_prefill=False),
],
distributed_backends=["mp"],
runner=runner,
test_options=EPTestOptions(
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
load_format=load_format,
hf_overrides=hf_overrides,
),
)
def iter_params(self, model_name: str):
opts = self.test_options
for parallel_setup in self.parallel_setups:
for distributed_backend in self.distributed_backends:
yield (
model_name,
parallel_setup,
distributed_backend,
self.runner,
opts,
)
# NOTE: You can adjust tp_base locally to fit the model in GPU
# The values displayed here are only a rough indicator of the size of the model
TEST_MODELS = {
"deepseek-ai/DeepSeek-V2-Lite-Chat": EPTestSettings.fast(trust_remote_code=True),
"mistralai/Mixtral-8x7B-Instruct-v0.1": EPTestSettings.fast(tp_base=4),
}
def _compare_tp(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
runner: RunnerOption,
test_options: EPTestOptions,
num_gpus_available: int,
*,
method: Literal["generate"],
):
(
tp_size,
eager_mode,
chunked_prefill,
) = parallel_setup
(
trust_remote_code,
tokenizer_mode,
load_format,
hf_overrides,
) = test_options
if num_gpus_available < tp_size:
pytest.skip(f"Need at least {tp_size} GPUs")
common_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"float16",
"--max-model-len",
"2048",
"--max-num-seqs",
"8",
"--load-format",
"auto",
]
if chunked_prefill:
common_args.append("--enable-chunked-prefill")
if eager_mode:
common_args.append("--enforce-eager")
if runner != "auto":
common_args.extend(["--runner", runner])
if trust_remote_code:
common_args.append("--trust-remote-code")
if tokenizer_mode:
common_args.extend(["--tokenizer-mode", tokenizer_mode])
if load_format:
common_args.extend(["--load-format", load_format])
if hf_overrides:
common_args.extend(["--hf-overrides", hf_overrides])
ep_env = {
"VLLM_TEST_ENABLE_EP": "1",
}
ep_args = [
*common_args,
"--tensor-parallel-size",
str(tp_size),
"--distributed-executor-backend",
distributed_backend,
]
# compare without expert parallelism
tp_env = {
"VLLM_TEST_ENABLE_EP": "0",
}
tp_args = [
*common_args,
"--tensor-parallel-size",
str(tp_size),
"--distributed-executor-backend",
"mp",
]
try:
compare_two_settings(
model_name,
ep_args,
tp_args,
ep_env,
tp_env,
method=method,
max_wait_seconds=360,
)
except Exception:
raise
@pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend", "runner", "test_options"),
[
params
for model_name, settings in TEST_MODELS.items()
for params in settings.iter_params(model_name)
],
)
@create_new_process_for_each_test()
def test_ep(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
runner: RunnerOption,
test_options: EPTestOptions,
num_gpus_available,
):
_compare_tp(
model_name,
parallel_setup,
distributed_backend,
runner,
test_options,
num_gpus_available,
method="generate",
)

View File

@@ -0,0 +1,244 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.model_executor.layers.fused_moe.layer import determine_expert_map
def verify_round_robin_pattern(expert_map, ep_rank, ep_size, global_num_experts):
"""Verify that the expert map follows the round_robin pattern."""
# Calculate expected local experts (supporting non-divisible cases)
base_experts = global_num_experts // ep_size
remainder = global_num_experts % ep_size
local_num_experts = base_experts + 1 if ep_rank < remainder else base_experts
# Expected expert IDs for this rank in round_robin pattern
# For non-divisible cases, ranks with extra experts start earlier
expected_expert_ids = []
for expert_idx in range(local_num_experts):
global_expert_id = ep_rank + expert_idx * ep_size
expected_expert_ids.append(global_expert_id)
# Check that only expected experts are mapped to this rank
for global_expert_id in range(global_num_experts):
if global_expert_id in expected_expert_ids:
local_expert_id = expert_map[global_expert_id]
expected_local_id = expected_expert_ids.index(global_expert_id)
assert local_expert_id == expected_local_id, (
f"Global expert {global_expert_id} should map to local expert "
f"{expected_local_id}, got {local_expert_id}"
)
else:
assert expert_map[global_expert_id] == -1, (
f"Global expert {global_expert_id} should not be mapped to this rank"
)
# Verify that all local expert IDs are consecutive starting from 0
local_expert_ids = [expert_map[global_id] for global_id in expected_expert_ids]
expected_local_ids = list(range(local_num_experts))
assert local_expert_ids == expected_local_ids, (
f"Expected local expert IDs {expected_local_ids}, got {local_expert_ids}"
)
@pytest.mark.parametrize("expert_placement_strategy", ["round_robin"])
@pytest.mark.parametrize("world_size", [2, 4])
def test_expert_placement_various_sizes(expert_placement_strategy, world_size):
"""Test round_robin expert placement with various expert counts."""
# Test with different global_num_experts values
# Include both divisible and non-divisible cases
if world_size == 2:
test_cases = [
(4, 2), # 4 experts (divisible)
(8, 2), # 8 experts (divisible)
(9, 2), # 9 experts (non-divisible)
(16, 2), # 16 experts (divisible)
(17, 2), # 17 experts (non-divisible)
]
elif world_size == 4:
test_cases = [
(8, 4), # 8 experts (divisible)
(16, 4), # 16 experts (divisible)
(18, 4), # 18 experts (non-divisible)
(32, 4), # 32 experts (divisible)
(33, 4), # 33 experts (non-divisible)
]
else:
test_cases = []
for test_global_experts, test_ep_size in test_cases:
# Ensure ep_size matches world_size
assert test_ep_size == world_size, (
f"ep_size {test_ep_size} must equal world_size {world_size}"
)
# Test each rank
for ep_rank in range(world_size):
# Calculate expected local experts
base_experts = test_global_experts // test_ep_size
remainder = test_global_experts % test_ep_size
if ep_rank < remainder:
expected_test_local = base_experts + 1
else:
expected_test_local = base_experts
test_local_experts, test_expert_map, _ = determine_expert_map(
ep_size=test_ep_size,
ep_rank=ep_rank,
global_num_experts=test_global_experts,
expert_placement_strategy=expert_placement_strategy,
)
assert test_local_experts == expected_test_local, (
f"For {test_global_experts} experts on {test_ep_size} ranks, "
f"rank {ep_rank}: expected {expected_test_local} local"
f"experts, got {test_local_experts}"
)
if test_expert_map is not None:
assert test_expert_map.shape == (test_global_experts,), (
f"Expected expert map shape ({test_global_experts},), "
f"got {test_expert_map.shape}"
)
# Verify round_robin pattern for this test case
verify_round_robin_pattern(
test_expert_map, ep_rank, test_ep_size, test_global_experts
)
@pytest.mark.parametrize("expert_placement_strategy", ["round_robin"])
@pytest.mark.parametrize("world_size", [2, 4])
def test_expert_placement_edge_cases(expert_placement_strategy, world_size):
"""Test edge cases for round_robin expert placement."""
# Test case 1: ep_size = 1 (should return None for expert_map)
local_num_experts, expert_map, _ = determine_expert_map(
ep_size=1,
ep_rank=0,
global_num_experts=8,
expert_placement_strategy=expert_placement_strategy,
)
assert local_num_experts == 8, "For ep_size=1, should get all experts"
assert expert_map is None, "For ep_size=1, expert_map should be None"
# Test case 2: ep_size = 0 (should raise assertion)
with pytest.raises(AssertionError):
determine_expert_map(
ep_size=0,
ep_rank=0,
global_num_experts=8,
expert_placement_strategy=expert_placement_strategy,
)
def test_determine_expert_map_comprehensive():
"""Test of determine_expert_map function with various configurations."""
# Test cases: (ep_size, ep_rank, global_num_experts,
# expert_placement_strategy, expected_local, expected_map_pattern)
test_cases = [
# Round robin placement tests
(
2,
0,
8,
"round_robin",
4,
[0, -1, 1, -1, 2, -1, 3, -1],
), # rank 0 gets even experts
(
2,
1,
8,
"round_robin",
4,
[-1, 0, -1, 1, -1, 2, -1, 3],
), # rank 1 gets odd experts
(
2,
0,
9,
"round_robin",
5,
[0, -1, 1, -1, 2, -1, 3, -1, 4],
), # rank 0 gets 5 experts (even + last)
(
2,
1,
9,
"round_robin",
4,
[-1, 0, -1, 1, -1, 2, -1, 3, -1],
), # rank 1 gets 4 experts (odd)
# 4-rank tests
(
4,
0,
8,
"round_robin",
2,
[0, -1, -1, -1, 1, -1, -1, -1],
), # rank 0 gets experts 0, 4
(
4,
1,
8,
"round_robin",
2,
[-1, 0, -1, -1, -1, 1, -1, -1],
), # rank 1 gets experts 1, 5
(
4,
2,
8,
"round_robin",
2,
[-1, -1, 0, -1, -1, -1, 1, -1],
), # rank 2 gets experts 2, 6
(
4,
3,
8,
"round_robin",
2,
[-1, -1, -1, 0, -1, -1, -1, 1],
), # rank 3 gets experts 3, 7
]
for (
ep_size,
ep_rank,
global_num_experts,
expert_placement_strategy,
expected_local,
expected_map_pattern,
) in test_cases:
local_num_experts, expert_map, _ = determine_expert_map(
ep_size=ep_size,
ep_rank=ep_rank,
global_num_experts=global_num_experts,
expert_placement_strategy=expert_placement_strategy,
)
assert local_num_experts == expected_local, (
f"ep_size={ep_size}, ep_rank={ep_rank}, "
f"global_num_experts={global_num_experts}, "
f"expert_placement_strategy={expert_placement_strategy}: "
f"expected {expected_local} local experts, got {local_num_experts}"
)
if expected_map_pattern is None:
assert expert_map is None, "Expected expert_map to be None"
else:
assert expert_map is not None, "Expected expert_map to not be None"
actual_map = expert_map.tolist()
assert actual_map == expected_map_pattern, (
f"ep_size={ep_size}, ep_rank={ep_rank}, "
f"global_num_experts={global_num_experts}, "
f"expert_placement_strategy={expert_placement_strategy}: "
f"expected map {expected_map_pattern}, got {actual_map}"
)

View File

@@ -0,0 +1,78 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.config import (
DeviceConfig,
KVTransferConfig,
ModelConfig,
VllmConfig,
set_current_vllm_config,
)
from vllm.distributed.kv_transfer.kv_connector.utils import (
get_kv_connector_cache_layout,
)
from vllm.logger import init_logger
logger = init_logger("test_expert_parallel")
def test_get_kv_connector_cache_layout_without_kv_connector():
vllm_config = VllmConfig(device_config=DeviceConfig("cpu"))
with set_current_vllm_config(vllm_config):
# Test with default settings
layout = get_kv_connector_cache_layout()
assert layout == "NHD"
def test_get_kv_connector_cache_layout_with_lmcache_connector():
kv_transfer_config = KVTransferConfig(
kv_connector="LMCacheConnectorV1",
kv_role="kv_both",
)
vllm_config = VllmConfig(
device_config=DeviceConfig("cpu"), kv_transfer_config=kv_transfer_config
)
with set_current_vllm_config(vllm_config):
# Test with default settings
layout = get_kv_connector_cache_layout()
assert layout == "NHD"
def test_get_kv_connector_cache_layout_with_nixl_connector():
kv_transfer_config = KVTransferConfig(
kv_connector="NixlConnector",
kv_role="kv_both",
)
model_config = ModelConfig()
vllm_config = VllmConfig(
device_config=DeviceConfig("cpu"),
model_config=model_config,
kv_transfer_config=kv_transfer_config,
)
with set_current_vllm_config(vllm_config):
# Test with default settings
layout = get_kv_connector_cache_layout()
assert layout == "HND"
def test_get_kv_connector_cache_layout_with_multi_connector():
kv_transfer_config = KVTransferConfig(
kv_connector="MultiConnector",
kv_role="kv_both",
kv_connector_extra_config={
"connectors": [
{"kv_connector": "ExampleConnector", "kv_role": "kv_both"},
{"kv_connector": "NixlConnector", "kv_role": "kv_both"},
]
},
)
model_config = ModelConfig()
vllm_config = VllmConfig(
device_config=DeviceConfig("cpu"),
model_config=model_config,
kv_transfer_config=kv_transfer_config,
)
with set_current_vllm_config(vllm_config):
# Test with default settings
layout = get_kv_connector_cache_layout()
assert layout == "HND"

View File

@@ -0,0 +1,64 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Make sure ray assigns GPU workers to the correct node.
Run:
```sh
cd $VLLM_PATH/tests
pytest distributed/test_multi_node_assignment.py
```
"""
import os
import pytest
import ray
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
from vllm import initialize_ray_cluster
from vllm.config import ParallelConfig
from vllm.utils.network_utils import get_ip
from vllm.v1.executor.ray_utils import _wait_until_pg_removed
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
@pytest.mark.skipif(
not VLLM_MULTI_NODE, reason="Need at least 2 nodes to run the test."
)
def test_multi_node_assignment() -> None:
# NOTE: important to keep this class definition here
# to let ray use cloudpickle to serialize it.
class Actor:
def get_ip(self):
return get_ip()
for _ in range(10):
config = ParallelConfig(1, 2)
initialize_ray_cluster(config)
current_ip = get_ip()
workers = []
for bundle_id, bundle in enumerate(config.placement_group.bundle_specs):
if not bundle.get("GPU", 0):
continue
scheduling_strategy = PlacementGroupSchedulingStrategy(
placement_group=config.placement_group,
placement_group_capture_child_tasks=True,
placement_group_bundle_index=bundle_id,
)
worker = ray.remote(
num_cpus=0,
num_gpus=1,
scheduling_strategy=scheduling_strategy,
)(Actor).remote()
worker_ip = ray.get(worker.get_ip.remote())
assert worker_ip == current_ip
workers.append(worker)
for worker in workers:
ray.kill(worker)
_wait_until_pg_removed(config.placement_group)

View File

@@ -0,0 +1,437 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Integration tests for MultiprocExecutor at the executor level.
This test directly tests the executor without going through the LLM interface,
focusing on executor initialization, RPC calls, and distributed execution.
"""
import multiprocessing
import os
from tests.utils import multi_gpu_test
from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import get_open_port
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.executor.multiproc_executor import MultiprocExecutor
MODEL = "facebook/opt-125m"
def create_vllm_config(
tensor_parallel_size: int = 1,
pipeline_parallel_size: int = 1,
max_model_len: int = 256,
gpu_memory_utilization: float = 0.3,
distributed_executor_backend: str = "mp",
nnodes: int = 1,
node_rank: int = 0,
master_port: int = 0,
) -> VllmConfig:
"""Create a VllmConfig for testing using EngineArgs."""
engine_args = EngineArgs(
model=MODEL,
tensor_parallel_size=tensor_parallel_size,
pipeline_parallel_size=pipeline_parallel_size,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
)
vllm_config = engine_args.create_engine_config()
# Override distributed node settings if needed
if nnodes > 1 or node_rank > 0:
vllm_config.parallel_config.nnodes = nnodes
vllm_config.parallel_config.node_rank = node_rank
vllm_config.parallel_config.master_port = master_port
if nnodes > 1:
vllm_config.parallel_config.disable_custom_all_reduce = True
return vllm_config
def create_test_scheduler_output(num_requests: int = 1) -> SchedulerOutput:
"""Create a minimal SchedulerOutput for testing."""
# This is a simplified version - in practice you'd need proper
# SchedulerOutput construction based on the actual vLLM v1 API
return SchedulerOutput(
scheduled_new_reqs=[],
scheduled_resumed_reqs=[],
scheduled_running_reqs=[],
num_scheduled_tokens={},
total_num_scheduled_tokens=0,
)
def test_multiproc_executor_initialization():
"""Test that MultiprocExecutor can be initialized with proper config."""
vllm_config = create_vllm_config(
tensor_parallel_size=1,
pipeline_parallel_size=1,
)
# Create executor - this should initialize workers
executor = MultiprocExecutor(vllm_config=vllm_config)
# Verify executor properties
assert executor.world_size == 1, "World size should be 1 for single GPU"
assert executor.local_world_size == 1, "Local world size should be 1"
assert hasattr(executor, "workers"), "Executor should have workers"
assert len(executor.workers) == 1, "Should have 1 worker for single GPU"
# Clean up
executor.shutdown()
@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_initialization_tensor_parallel():
"""Test MultiprocExecutor initialization with tensor parallelism."""
vllm_config = create_vllm_config(
tensor_parallel_size=2,
pipeline_parallel_size=1,
)
# Create executor
executor = MultiprocExecutor(vllm_config=vllm_config)
# Verify executor properties
assert executor.world_size == 2, "World size should be 2 for TP=2"
assert executor.local_world_size == 2, "Local world size should be 2"
assert len(executor.workers) == 2, "Should have 2 workers for TP=2"
# Verify output rank calculation
output_rank = executor._get_output_rank()
assert output_rank == 0, "Output rank should be 0 for TP=2, PP=1"
# Clean up
executor.shutdown()
@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_collective_rpc():
"""Test collective RPC calls to all workers."""
vllm_config = create_vllm_config(
tensor_parallel_size=2,
pipeline_parallel_size=1,
)
# Create executor
executor = MultiprocExecutor(vllm_config=vllm_config)
try:
# Test check_health RPC - should work without errors
executor.check_health()
# Test that RPC works correctly
# Note: We're just testing that the RPC mechanism works,
# not testing actual model execution here
assert not executor.is_failed, "Executor should not be in failed state"
finally:
# Clean up
executor.shutdown()
def test_multiproc_executor_failure_callback():
"""Test failure callback registration and invocation."""
vllm_config = create_vllm_config(
tensor_parallel_size=1,
pipeline_parallel_size=1,
)
executor = MultiprocExecutor(vllm_config=vllm_config)
try:
# Test callback registration
callback_invoked = []
def test_callback():
callback_invoked.append(True)
# Register callback
executor.register_failure_callback(test_callback)
# Callback should not be invoked yet
assert len(callback_invoked) == 0, "Callback should not be invoked immediately"
# Simulate failure
executor.is_failed = True
# Register another callback - should be invoked immediately
executor.register_failure_callback(test_callback)
assert len(callback_invoked) == 1, (
"Callback should be invoked when executor is failed"
)
finally:
# Clean up
executor.shutdown()
@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_worker_monitor():
"""Test that worker monitor is set up correctly."""
vllm_config = create_vllm_config(
tensor_parallel_size=2,
pipeline_parallel_size=1,
)
executor = MultiprocExecutor(vllm_config=vllm_config)
try:
# Verify all worker processes are alive
for worker in executor.workers:
assert worker.proc.is_alive(), f"Worker rank {worker.rank} should be alive"
# Verify executor is not in failed state
assert not executor.is_failed, "Executor should not be in failed state"
finally:
# Clean up
executor.shutdown()
# After shutdown, workers should be terminated
import time
time.sleep(0.5) # Give processes time to terminate
for worker in executor.workers:
assert not worker.proc.is_alive(), (
f"Worker rank {worker.rank} should terminate after shutdown"
)
@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_get_response_message_queues():
"""Test message queue retrieval for different ranks."""
vllm_config = create_vllm_config(
tensor_parallel_size=2,
pipeline_parallel_size=1,
)
executor = MultiprocExecutor(vllm_config=vllm_config)
try:
# Get all message queues
all_queues = executor.get_response_mqs()
assert len(all_queues) == 2, "Should have 2 message queues for 2 workers"
# Get message queue for specific rank
rank0_queue = executor.get_response_mqs(unique_reply_rank=0)
assert len(rank0_queue) == 1, "Should have 1 message queue for rank 0"
rank1_queue = executor.get_response_mqs(unique_reply_rank=1)
assert len(rank1_queue) == 1, "Should have 1 message queue for rank 1"
finally:
# Clean up
executor.shutdown()
def test_multiproc_executor_shutdown_cleanup():
"""Test that shutdown properly cleans up resources."""
vllm_config = create_vllm_config(
tensor_parallel_size=1,
pipeline_parallel_size=1,
)
executor = MultiprocExecutor(vllm_config=vllm_config)
# Verify executor is set up
assert hasattr(executor, "workers"), "Executor should have workers"
assert len(executor.workers) > 0, "Should have at least one worker"
# Shutdown
executor.shutdown()
# Verify cleanup
import time
time.sleep(0.5) # Give processes time to terminate
for worker in executor.workers:
assert not worker.proc.is_alive(), "Worker processes should be terminated"
# Verify shutdown event is set
assert executor.shutdown_event.is_set(), "Shutdown event should be set"
# Multiple shutdowns should be safe (idempotent)
executor.shutdown()
executor.shutdown()
@multi_gpu_test(num_gpus=4)
def test_multiproc_executor_pipeline_parallel():
"""Test MultiprocExecutor with pipeline parallelism."""
vllm_config = create_vllm_config(
tensor_parallel_size=2,
pipeline_parallel_size=2,
)
executor = MultiprocExecutor(vllm_config=vllm_config)
try:
# Verify executor properties
assert executor.world_size == 4, "World size should be 4 for TP=2, PP=2"
assert len(executor.workers) == 4, "Should have 4 workers"
# Verify output rank calculation
# For TP=2, PP=2: output should be from the last PP stage (ranks 2-3)
# Specifically rank 2 (first rank of last PP stage)
output_rank = executor._get_output_rank()
assert output_rank == 2, "Output rank should be 2 (first rank of last PP stage)"
# Verify max_concurrent_batches for pipeline parallel
assert executor.max_concurrent_batches == 2, (
"Max concurrent batches should equal PP size"
)
finally:
# Clean up
executor.shutdown()
def test_multiproc_executor_properties():
"""Test various executor properties and configurations."""
vllm_config = create_vllm_config(
tensor_parallel_size=1,
pipeline_parallel_size=1,
)
executor = MultiprocExecutor(vllm_config=vllm_config)
try:
# Test supports_pp property
assert MultiprocExecutor.supports_pp is True, (
"MultiprocExecutor should support pipeline parallelism"
)
# Test world_size calculation
assert executor.world_size == (
executor.parallel_config.tensor_parallel_size
* executor.parallel_config.pipeline_parallel_size
), "World size should equal TP * PP"
# Test local_world_size calculation
assert executor.local_world_size == (
executor.parallel_config.world_size // executor.parallel_config.nnodes
), "Local world size should be world_size / nnodes"
finally:
# Clean up
executor.shutdown()
@multi_gpu_test(num_gpus=4)
def test_multiproc_executor_multi_node():
"""
Test MultiprocExecutor with multi-node configuration.
This simulates 2 nodes with TP=4:
- Node 0 (rank 0): Uses GPUs 0,1 (CUDA_VISIBLE_DEVICES=0,1) with TP=2
- Node 1 (rank 1): Uses GPUs 2,3 (CUDA_VISIBLE_DEVICES=2,3) with TP=2
Total world_size = 4, nnodes = 2
"""
port = get_open_port()
# symm_mem does not work for simulating multi instance in single node
os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
def run_node(node_rank: int, result_queue: multiprocessing.Queue, port: int):
"""Run a single node's executor."""
executor = None
try:
# Set CUDA_VISIBLE_DEVICES for this node
if node_rank == 0:
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
else:
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
# Create config for this node
vllm_config = create_vllm_config(
tensor_parallel_size=4, # Total TP across all nodes
pipeline_parallel_size=1,
nnodes=2, # 2 nodes
node_rank=node_rank,
master_port=port, # same port
)
# Create executor for this node
executor = MultiprocExecutor(vllm_config=vllm_config)
# Verify node-specific properties
assert executor.world_size == 4, (
f"World size should be 4 on node {node_rank}"
)
assert executor.local_world_size == 2, (
f"Local world size should be 2 on node {node_rank}"
)
assert len(executor.workers) == 2, (
f"Should have 2 local workers on node {node_rank}"
)
# Verify worker ranks are correct for this node
expected_ranks = [node_rank * 2, node_rank * 2 + 1]
actual_ranks = sorted([w.rank for w in executor.workers])
assert actual_ranks == expected_ranks, (
f"Node {node_rank} should have workers "
f"with ranks {expected_ranks}, got {actual_ranks}"
)
# Verify all workers are alive
for worker in executor.workers:
assert worker.proc.is_alive(), (
f"Worker rank {worker.rank} should be alive on node {node_rank}"
)
# executor.gen
# Put success result in queue BEFORE shutdown to avoid hanging
result_queue.put({"node": node_rank, "success": True})
import time
time.sleep(2)
executor.shutdown()
except Exception as e:
# Put failure result in queue
result_queue.put({"node": node_rank, "success": False, "error": str(e)})
raise e
finally:
if executor is not None:
executor.shutdown()
# Create a queue to collect results from both processes
result_queue: multiprocessing.Queue[dict[str, int | bool]] = multiprocessing.Queue()
# Start both node processes
processes = []
for node_rank in range(2):
p = multiprocessing.Process(
target=run_node,
args=(node_rank, result_queue, port),
name=f"Node{node_rank}",
)
p.start()
processes.append(p)
# Wait for both processes to complete
all_completed = True
for p in processes:
p.join(timeout=60)
if p.is_alive():
p.terminate()
p.join(timeout=20)
if p.is_alive():
p.kill()
p.join()
all_completed = False
# Check results from both nodes
results: list[dict[str, int | bool]] = []
while len(results) < 2:
try:
result = result_queue.get(timeout=1)
results.append(result)
except Exception:
pass
assert all_completed, "Not all processes completed successfully"
assert len(results) == 2, f"Expected 2 results, got {len(results)}"
assert results[0]["success"], f"Node 0 failed: {results[0]}"
assert results[1]["success"], f"Node 1 failed: {results[1]}"

View File

@@ -0,0 +1,94 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import typing
import pytest
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import vllm.envs as envs
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.distributed.device_communicators.cuda_communicator import CudaCommunicator
from vllm.distributed.device_communicators.pynccl import register_nccl_symmetric_ops
from vllm.distributed.device_communicators.pynccl_allocator import (
get_nccl_mem_pool,
is_symmetric_memory_enabled,
)
from vllm.distributed.parallel_state import (
get_tp_group,
init_distributed_environment,
initialize_model_parallel,
)
from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables
torch.manual_seed(42)
random.seed(44)
test_size_elements = 4 * 1024 * 1024
def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int):
monkeypatch = pytest.MonkeyPatch()
with monkeypatch.context() as m:
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
dtype = torch.bfloat16
device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
torch.set_default_device(device)
torch.set_default_dtype(dtype)
update_environment_variables(
{
"RANK": str(local_rank),
"LOCAL_RANK": str(local_rank),
"WORLD_SIZE": str(world_size),
"MASTER_ADDR": "localhost",
"MASTER_PORT": "12345",
}
)
init_distributed_environment()
initialize_model_parallel(tensor_model_parallel_size=world_size)
cuda_communicator = typing.cast(
CudaCommunicator, get_tp_group().device_communicator
)
pynccl_comm = cuda_communicator.pynccl_comm
if get_nccl_mem_pool() is None:
pytest.skip(
"NCCL allocator compilation failed (probably missing NCCL headers)."
)
if not is_symmetric_memory_enabled():
pytest.skip("NCCL symmetric memory allreduce is disabled.")
register_nccl_symmetric_ops(pynccl_comm)
input = torch.randint(1, 23, (test_size_elements,), dtype=dtype, device=device)
input_clone = input.clone()
output = torch.ops.vllm.all_reduce_symmetric_with_copy(input)
assert output is not None
group = get_tp_group().device_group
dist.all_reduce(input_clone, group=group)
torch.testing.assert_close(output, input_clone, atol=2.5, rtol=0.1)
@pytest.mark.skipif(
not current_platform.is_cuda(),
reason="NCCLSymmMemAllreduce is only available for CUDA platforms.",
)
@pytest.mark.parametrize("world_size", [2])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
def test_nccl_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, world_size):
if world_size > torch.cuda.device_count():
pytest.skip("Not enough GPUs to run the test.")
# Enable SymmMemCommunicator
monkeypatch.setenv("VLLM_USE_NCCL_SYMM_MEM", "1")
monkeypatch.setenv("NCCL_NVLS_ENABLE", "1")
monkeypatch.setenv("NCCL_CUMEM_ENABLE", "1")
mp.spawn(nccl_symm_mem_allreduce_worker, args=(world_size,), nprocs=world_size)
cleanup_dist_env_and_memory()

View File

@@ -0,0 +1,46 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import torch.distributed as dist
from vllm.distributed.parallel_state import _node_count
from vllm.distributed.utils import StatelessProcessGroup
from vllm.utils.network_utils import get_ip, get_open_port
if __name__ == "__main__":
dist.init_process_group(backend="gloo")
rank = dist.get_rank()
world_size = dist.get_world_size()
if rank == 0:
port = get_open_port()
ip = get_ip()
dist.broadcast_object_list([ip, port], src=0)
else:
recv = [None, None]
dist.broadcast_object_list(recv, src=0)
ip, port = recv
stateless_pg = StatelessProcessGroup.create(ip, port, rank, world_size)
for pg in [dist.group.WORLD, stateless_pg]:
test_result = _node_count(pg)
# Expected node count based on environment variable)
expected = int(os.environ.get("NUM_NODES", "1"))
assert test_result == expected, f"Expected {expected} nodes, got {test_result}"
if pg == dist.group.WORLD:
print(
f"Node count test passed! Got {test_result} nodes "
f"when using torch distributed!"
)
else:
print(
f"Node count test passed! Got {test_result} nodes "
f"when using StatelessProcessGroup!"
)

View File

@@ -0,0 +1,442 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
(2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
important to set the distributed backend to "mp" to avoid Ray scheduling
all workers in a node other than the head node, which can cause the test
to fail.
"""
import json
import os
from dataclasses import dataclass
from typing import Literal, NamedTuple
import pytest
from vllm.config.model import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption
from vllm.logger import init_logger
from vllm.transformers_utils.config import get_config
from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import compare_two_settings, create_new_process_for_each_test
logger = init_logger("test_pipeline_parallel")
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
class ParallelSetup(NamedTuple):
tp_size: int
pp_size: int
eager_mode: bool
class PPTestOptions(NamedTuple):
multi_node_only: bool
load_format: str | None = None
@dataclass
class PPTestSettings:
parallel_setups: list[ParallelSetup]
distributed_backends: list[str]
runner: RunnerOption
test_options: PPTestOptions
@staticmethod
def detailed(
*,
tp_base: int = 1,
pp_base: int = 2,
multi_node_only: bool = False,
runner: RunnerOption = "auto",
load_format: str | None = None,
):
return PPTestSettings(
parallel_setups=[
ParallelSetup(tp_size=tp_base, pp_size=pp_base, eager_mode=False),
ParallelSetup(tp_size=tp_base, pp_size=2 * pp_base, eager_mode=False),
ParallelSetup(tp_size=tp_base, pp_size=2 * pp_base, eager_mode=True),
ParallelSetup(tp_size=2 * tp_base, pp_size=pp_base, eager_mode=False),
ParallelSetup(tp_size=2 * tp_base, pp_size=pp_base, eager_mode=True),
],
distributed_backends=["mp", "ray"],
runner=runner,
test_options=PPTestOptions(
multi_node_only=multi_node_only, load_format=load_format
),
)
@staticmethod
def fast(
*,
tp_base: int = 1,
pp_base: int = 2,
runner: RunnerOption = "auto",
multi_node_only: bool = False,
load_format: str | None = None,
):
return PPTestSettings(
parallel_setups=[
ParallelSetup(tp_size=tp_base, pp_size=pp_base, eager_mode=True),
],
distributed_backends=["mp"],
runner=runner,
test_options=PPTestOptions(
multi_node_only=multi_node_only, load_format=load_format
),
)
def iter_params(self, model_id: str):
opts = self.test_options
for parallel_setup in self.parallel_setups:
for backend in self.distributed_backends:
yield (model_id, parallel_setup, backend, self.runner, opts)
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
# The values displayed here are only a rough indicator of the size of the model
TEXT_GENERATION_MODELS = {
# [Decoder-only]
# Uses Llama
# "BAAI/AquilaChat-7B": PPTestSettings.fast(),
"Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"),
"baichuan-inc/Baichuan-7B": PPTestSettings.fast(),
"baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
"bigscience/bloomz-1b1": PPTestSettings.fast(),
"zai-org/chatglm3-6b": PPTestSettings.fast(),
"CohereLabs/c4ai-command-r-v01": PPTestSettings.fast(load_format="dummy"),
"databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
"Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
"deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
"deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(tp_base=2),
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
"tiiuae/falcon-7b": PPTestSettings.fast(),
"google/gemma-1.1-2b-it": PPTestSettings.fast(),
"google/gemma-2-9b": PPTestSettings.fast(),
"gpt2": PPTestSettings.fast(),
"bigcode/starcoder": PPTestSettings.fast(),
"EleutherAI/gpt-j-6b": PPTestSettings.fast(),
"EleutherAI/pythia-1.4b": PPTestSettings.fast(),
"ibm/PowerLM-3b": PPTestSettings.fast(),
"ibm/PowerMoE-3b": PPTestSettings.fast(),
# Uses Llama
# "internlm/internlm-chat-7b": PPTestSettings.fast(),
"internlm/internlm2-chat-7b": PPTestSettings.fast(),
"inceptionai/jais-13b-chat": PPTestSettings.fast(),
"ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
"pfnet/plamo-2-1b": PPTestSettings.fast(),
"pfnet/plamo-3-nict-2b-base": PPTestSettings.fast(),
"meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(),
# Tests TransformersForCausalLM
"hmellor/Ilama-3.2-1B": PPTestSettings.fast(),
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
"openbmb/MiniCPM3-4B": PPTestSettings.fast(),
# Uses Llama
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
"state-spaces/mamba-130m-hf": PPTestSettings.fast(),
"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"),
"mosaicml/mpt-7b": PPTestSettings.fast(),
"nvidia/Minitron-8B-Base": PPTestSettings.fast(),
"allenai/OLMo-1B-hf": PPTestSettings.fast(),
"allenai/OLMo-2-0425-1B": PPTestSettings.fast(),
"allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
"facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
"OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(),
"adept/persimmon-8b-chat": PPTestSettings.fast(),
"microsoft/phi-2": PPTestSettings.fast(),
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(
multi_node_only=True, load_format="dummy"
),
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
"Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
"stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
"bigcode/starcoder2-3b": PPTestSettings.fast(),
"upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"),
# FIXME: Cannot load tokenizer in latest transformers version.
# Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
# [Encoder-only]
# TODO: Implement PP
# "facebook/bart-base": PPTestSettings.fast(),
}
EMBEDDING_MODELS = { # type: ignore[var-annotated]
# [Text-only]
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(runner="pooling"),
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"),
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
load_format="dummy", runner="pooling"
),
}
MULTIMODAL_MODELS = {
# [Decoder-only]
"Salesforce/blip2-opt-6.7b": PPTestSettings.fast(),
"facebook/chameleon-7b": PPTestSettings.fast(),
"adept/fuyu-8b": PPTestSettings.fast(),
"zai-org/glm-4v-9b": PPTestSettings.fast(),
"OpenGVLab/InternVL2-1B": PPTestSettings.fast(),
"llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
"llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
"llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
"openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(),
"allenai/Molmo-7B-D-0924": PPTestSettings.fast(),
"AIDC-AI/Ovis2-1B": PPTestSettings.fast(),
"AIDC-AI/Ovis2.5-2B": PPTestSettings.fast(),
"microsoft/Phi-3.5-vision-instruct": PPTestSettings.fast(),
"mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"),
"Qwen/Qwen-VL-Chat": PPTestSettings.fast(),
"Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
"Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
"fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
}
# NOTE: You can update this on your local machine to run specific tests
TEST_MODELS = [
# [LANGUAGE GENERATION]
"microsoft/Phi-3.5-MoE-instruct",
"meta-llama/Llama-3.2-1B-Instruct",
"hmellor/Ilama-3.2-1B",
"ibm/PowerLM-3b",
"deepseek-ai/DeepSeek-V2-Lite-Chat",
# [LANGUAGE EMBEDDING]
"intfloat/e5-mistral-7b-instruct",
"BAAI/bge-multilingual-gemma2",
# [MULTIMODAL GENERATION]
"OpenGVLab/InternVL2-1B",
"microsoft/Phi-3.5-vision-instruct",
"fixie-ai/ultravox-v0_5-llama-3_2-1b",
# [LANGUAGE GENERATION - HYBRID ARCH]
"ai21labs/Jamba-tiny-dev",
]
def _compare_tp(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
runner: RunnerOption,
test_options: PPTestOptions,
num_gpus_available: int,
*,
method: Literal["generate", "encode"],
is_multimodal: bool,
):
(
tp_size,
pp_size,
eager_mode,
) = parallel_setup
multi_node_only, load_format = test_options
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_transformers_version(on_fail="skip")
trust_remote_code = model_info.trust_remote_code
tokenizer_mode = model_info.tokenizer_mode
hf_overrides = model_info.hf_overrides
hf_config = get_config(model_id, trust_remote_code)
require_embed_inputs = model_info.require_embed_inputs
max_num_seqs = model_info.max_num_seqs
dtype = "float16"
if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
dtype = "bfloat16"
if load_format == "dummy":
# Avoid OOM
text_overrides = {
"num_hidden_layers": 4,
"hidden_size": 512,
"intermediate_size": 800,
"num_attention_heads": 4,
"num_key_value_heads": 1,
}
if is_multimodal:
hf_overrides.update({"text_config": text_overrides})
else:
hf_overrides.update(text_overrides)
else:
model_info.check_available_online(on_fail="skip")
if num_gpus_available < tp_size * pp_size:
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
if VLLM_MULTI_NODE and distributed_backend == "mp":
pytest.skip(
"Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend"
)
if multi_node_only and not VLLM_MULTI_NODE:
pytest.skip("Not in multi-node setting")
common_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
dtype,
"--max-model-len",
"2048",
"--max-num-seqs",
"8",
]
if eager_mode:
common_args.append("--enforce-eager")
if runner != "auto":
common_args.extend(["--runner", runner])
if trust_remote_code:
common_args.append("--trust-remote-code")
if tokenizer_mode:
common_args.extend(["--tokenizer-mode", tokenizer_mode])
if load_format:
common_args.extend(["--load-format", load_format])
if hf_overrides:
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
if require_embed_inputs:
common_args.extend(
[
"--skip-tokenizer-init",
"--enable-prompt-embeds",
"--enable-mm-embeds",
]
)
if max_num_seqs:
common_args.extend(["--max-num-seqs", f"{max_num_seqs}"])
if distributed_backend == "ray":
# Test Ray Compiled Graph for all the tests
pp_env = {
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
}
# Temporary. Currently when zeromq + SPMD is used, it does not properly
# terminate because of a Ray Compiled Graph issue.
common_args.append("--disable-frontend-multiprocessing")
elif distributed_backend == "mp":
pp_env = None
else:
pp_env = None
tp_env = None
pp_args = [
*common_args,
"--pipeline-parallel-size",
str(pp_size),
"--tensor-parallel-size",
str(tp_size),
"--distributed-executor-backend",
distributed_backend,
]
# compare without pipeline parallelism
# NOTE: use mp backend for TP
# PP tests might involve multiple nodes, and ray might
# schedule all workers in a node other than the head node,
# which can cause the test to fail.
tp_args = [
*common_args,
"--tensor-parallel-size",
str(tp_size),
"--distributed-executor-backend",
"mp",
]
compare_two_settings(model_id, pp_args, tp_args, pp_env, tp_env, method=method)
@pytest.mark.parametrize(
("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
[
params
for model_id, settings in TEXT_GENERATION_MODELS.items()
for params in settings.iter_params(model_id)
if model_id in TEST_MODELS
],
)
@create_new_process_for_each_test()
def test_tp_language_generation(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
runner: RunnerOption,
test_options: PPTestOptions,
num_gpus_available,
):
_compare_tp(
model_id,
parallel_setup,
distributed_backend,
runner,
test_options,
num_gpus_available,
method="generate",
is_multimodal=False,
)
@pytest.mark.parametrize(
("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
[
params
for model_id, settings in EMBEDDING_MODELS.items()
for params in settings.iter_params(model_id)
if model_id in TEST_MODELS
],
)
@create_new_process_for_each_test()
def test_tp_language_embedding(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
runner: RunnerOption,
test_options: PPTestOptions,
num_gpus_available,
):
_compare_tp(
model_id,
parallel_setup,
distributed_backend,
runner,
test_options,
num_gpus_available,
method="encode",
is_multimodal=False,
)
@pytest.mark.parametrize(
("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
[
params
for model_id, settings in MULTIMODAL_MODELS.items()
for params in settings.iter_params(model_id)
if model_id in TEST_MODELS
],
)
@create_new_process_for_each_test()
def test_tp_multimodal_generation(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
runner: RunnerOption,
test_options: PPTestOptions,
num_gpus_available,
):
_compare_tp(
model_id,
parallel_setup,
distributed_backend,
runner,
test_options,
num_gpus_available,
method="generate",
is_multimodal=True,
)

View File

@@ -0,0 +1,67 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
from vllm.distributed.utils import get_pp_indices
def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
def _verify(partition_str, num_layers, pp_size, goldens):
bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
m.setenv("VLLM_PP_LAYER_PARTITION", partition_str)
for pp_rank, golden in enumerate(goldens):
assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
if bak is not None:
m.setenv("VLLM_PP_LAYER_PARTITION", bak)
# Even partition
_verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
# Balanced partition
_verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
# Put reminder somewhere
_verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
# Invalid partition strings
with pytest.raises(ValueError):
_verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
with pytest.raises(ValueError):
_verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
# Wrong number of partitions
with pytest.raises(ValueError):
_verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
# Wrong number of layers
with pytest.raises(ValueError):
_verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
@pytest.mark.parametrize(
"num_hidden_layers,pp_size,pp_rank,indices",
[
# pp_size 2
(2, 2, 0, (0, 1)),
(2, 2, 1, (1, 2)),
(3, 2, 0, (0, 2)),
(3, 2, 1, (2, 3)),
# pp_size 3
(3, 3, 0, (0, 1)),
(3, 3, 1, (1, 2)),
(3, 3, 2, (2, 3)),
(4, 3, 0, (0, 1)),
(4, 3, 1, (1, 3)),
(4, 3, 2, (3, 4)),
(5, 3, 0, (0, 2)),
(5, 3, 1, (2, 4)),
(5, 3, 2, (4, 5)),
],
)
def test_uneven_auto_partition(
num_hidden_layers: int,
pp_size: int,
pp_rank: int,
indices: tuple[int, int],
):
assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size)

View File

@@ -0,0 +1,42 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from typing_extensions import LiteralString
from ..utils import compare_two_settings, create_new_process_for_each_test
@pytest.mark.parametrize(
"PP_SIZE, MODEL_NAME",
[
(2, "JackFram/llama-160m"),
],
)
@pytest.mark.parametrize(
"ATTN_BACKEND",
[
"FLASH_ATTN",
],
)
@create_new_process_for_each_test()
def test_pp_cudagraph(
monkeypatch: pytest.MonkeyPatch,
PP_SIZE: int,
MODEL_NAME: str,
ATTN_BACKEND: LiteralString,
):
with monkeypatch.context() as m:
cudagraph_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"float16",
"--pipeline-parallel-size",
str(PP_SIZE),
"--distributed-executor-backend",
"mp",
]
m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND)
eager_args = cudagraph_args + ["--enforce-eager"]
compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)

View File

@@ -1,30 +1,40 @@
import multiprocessing
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import multiprocess as mp
import numpy as np
import pytest
import torch
import torch.distributed
import vllm.distributed.device_communicators.pymccl_utils as pymccl_utils
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
from vllm.distributed.device_communicators.pynccl import (NCCLCommunicator,
ncclGetUniqueId)
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce # noqa
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary
from vllm.distributed.parallel_state import (
ensure_model_parallel_initialized, get_tensor_model_parallel_cpu_group,
init_distributed_environment, with_pynccl_for_all_reduce)
from vllm.utils import update_environment_variables
ensure_model_parallel_initialized,
get_world_group,
graph_capture,
init_distributed_environment,
)
from vllm.utils.system_utils import update_environment_variables
mp.set_start_method("spawn", force=True)
def distributed_run(fn, world_size):
number_of_processes = world_size
processes = []
processes: list[mp.Process] = []
for i in range(number_of_processes):
env = {}
env['RANK'] = str(i)
env['LOCAL_RANK'] = str(i)
env['WORLD_SIZE'] = str(number_of_processes)
env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
env['MASTER_ADDR'] = 'localhost'
env['MASTER_PORT'] = '12345'
p = multiprocessing.Process(target=fn, args=(env, ))
env: dict[str, str] = {}
env["RANK"] = str(i)
env["LOCAL_RANK"] = str(i)
env["WORLD_SIZE"] = str(number_of_processes)
env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
env["MASTER_ADDR"] = "localhost"
env["MASTER_PORT"] = "12345"
p = mp.Process(target=fn, args=(env,))
processes.append(p)
p.start()
@@ -41,6 +51,9 @@ def worker_fn_wrapper(fn):
# and update the environment variables in the function
def wrapped_fn(env):
update_environment_variables(env)
local_rank = os.environ["LOCAL_RANK"]
device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
init_distributed_environment()
fn()
@@ -49,105 +62,345 @@ def worker_fn_wrapper(fn):
@worker_fn_wrapper
def worker_fn():
comm = NCCLCommunicator()
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank)
comm.all_reduce(tensor)
result = tensor.mean().cpu().item()
assert result == comm.world_size
pynccl_comm = PyNcclCommunicator(
get_world_group().cpu_group, device=get_world_group().device
)
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank)
tensor = pynccl_comm.all_reduce(tensor)
torch.cuda.synchronize()
assert torch.all(tensor == pynccl_comm.world_size).cpu().item()
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.skipif(
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
)
def test_pynccl():
distributed_run(worker_fn, 2)
@worker_fn_wrapper
def multiple_tp_worker_fn():
def multiple_allreduce_worker_fn():
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
groups = [
torch.distributed.new_group(ranks=[0, 1], backend="gloo"),
torch.distributed.new_group(ranks=[2, 3], backend="gloo")
torch.distributed.new_group(ranks=[2, 3], backend="gloo"),
]
group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1]
comm = NCCLCommunicator(group=group, device=device)
pynccl_comm = PyNcclCommunicator(group=group, device=device)
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
# two groups can communicate independently
if torch.distributed.get_rank() in [0, 1]:
comm.all_reduce(tensor)
comm.all_reduce(tensor)
result = tensor.mean().cpu().item()
assert result == 4
tensor = pynccl_comm.all_reduce(tensor)
tensor = pynccl_comm.all_reduce(tensor)
torch.cuda.synchronize()
assert torch.all(tensor == 4).cpu().item()
else:
comm.all_reduce(tensor)
result = tensor.mean().cpu().item()
assert result == 2
tensor = pynccl_comm.all_reduce(tensor)
torch.cuda.synchronize()
assert torch.all(tensor == 2).cpu().item()
@pytest.mark.skipif(torch.cuda.device_count() < 4,
reason="Need at least 4 GPUs to run the test.")
def test_pynccl_multiple_tp():
@pytest.mark.skipif(
torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
)
def test_pynccl_multiple_allreduce():
# this tests pynccl for multiple tp groups, in a standalone way
# i.e. call `comm.all_reduce` directly
distributed_run(multiple_tp_worker_fn, 4)
# i.e. call `pynccl_comm.all_reduce` directly
distributed_run(multiple_allreduce_worker_fn, 4)
@worker_fn_wrapper
def multiple_tp_with_vllm_worker_fn():
def multiple_allreduce_with_vllm_worker_fn():
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
torch.cuda.set_device(torch.distributed.get_rank())
ensure_model_parallel_initialized(2, 2)
pymccl_utils.init_process_group(
group=get_tensor_model_parallel_cpu_group())
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
with with_pynccl_for_all_reduce():
with graph_capture(device=device):
# two tp groups can communicate independently
if torch.distributed.get_rank() in [0, 1]:
tensor = tensor_model_parallel_all_reduce(tensor)
tensor = tensor_model_parallel_all_reduce(tensor)
result = tensor.mean().cpu().item()
assert result == 4
torch.cuda.synchronize()
assert torch.all(tensor == 4).cpu().item()
else:
tensor = tensor_model_parallel_all_reduce(tensor)
result = tensor.mean().cpu().item()
assert result == 2
torch.cuda.synchronize()
assert torch.all(tensor == 2).cpu().item()
@pytest.mark.skipif(torch.cuda.device_count() < 4,
reason="Need at least 4 GPUs to run the test.")
def test_pynccl_multiple_tp_with_vllm():
@pytest.mark.skipif(
torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
)
def test_pynccl_multiple_allreduce_with_vllm():
# this tests pynccl for multiple tp groups, together with vllm
# i.e. call `tensor_model_parallel_all_reduce`
distributed_run(multiple_tp_with_vllm_worker_fn, 4)
distributed_run(multiple_allreduce_with_vllm_worker_fn, 4)
@worker_fn_wrapper
def worker_fn_with_cudagraph():
with torch.no_grad():
graph = torch.cuda.CUDAGraph()
comm = NCCLCommunicator()
pynccl_comm = PyNcclCommunicator(
get_world_group().cpu_group, device=get_world_group().device
)
# run something in the default stream to initialize torch engine
a = torch.ones((4, 4), device=f'cuda:{comm.rank}')
a = torch.ones((4, 4), device=f"cuda:{pynccl_comm.rank}")
torch.cuda.synchronize()
with torch.cuda.graph(graph):
a_out = pynccl_comm.all_reduce(a)
torch.cuda.synchronize()
with torch.cuda.graph(graph, stream=comm.stream):
# operation during the graph capture is recorded but not executed
# see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creating-a-graph-using-stream-capture # noqa
comm.all_reduce(a)
comm.stream.synchronize()
assert a.mean().cpu().item() == comm.world_size**0
graph.replay()
comm.stream.synchronize()
assert a.mean().cpu().item() == comm.world_size**1
torch.cuda.synchronize()
assert torch.all(a_out == pynccl_comm.world_size).cpu().item()
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@worker_fn_wrapper
def all_gather_worker_fn():
pynccl_comm = PyNcclCommunicator(
get_world_group().cpu_group, device=get_world_group().device
)
rank = pynccl_comm.rank
world_size = pynccl_comm.world_size
device = f"cuda:{pynccl_comm.rank}"
num_elems = 1000
tensor = (
torch.arange(num_elems, dtype=torch.float32, device=device) + rank * num_elems
)
result = torch.zeros(num_elems * world_size, dtype=torch.float32, device=device)
expected = torch.cat(
[
torch.arange(num_elems, dtype=torch.float32) + r * num_elems
for r in range(world_size)
]
).to(device)
pynccl_comm.all_gather(result, tensor)
torch.cuda.synchronize()
torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
@pytest.mark.skipif(
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
)
def test_pynccl_all_gather():
distributed_run(all_gather_worker_fn, 2)
@worker_fn_wrapper
def all_gatherv_worker_fn():
pynccl_comm = PyNcclCommunicator(
get_world_group().cpu_group, device=get_world_group().device
)
rank = pynccl_comm.rank
world_size = pynccl_comm.world_size
device = f"cuda:{pynccl_comm.rank}"
assert world_size <= 8
sizes = [81, 20, 57, 52, 81, 5, 49, 49][:world_size]
num_elems = sizes[rank]
tensor = torch.arange(num_elems, dtype=torch.float32, device=device) + rank * 100
result = torch.zeros(sum(sizes), dtype=torch.float32, device=device)
expected = torch.cat(
[
torch.arange(sizes[r], dtype=torch.float32) + r * 100
for r in range(world_size)
]
).to(device)
pynccl_comm.all_gatherv(result, tensor, sizes=sizes)
torch.cuda.synchronize()
torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
@pytest.mark.skipif(
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
)
def test_pynccl_all_gatherv():
distributed_run(all_gatherv_worker_fn, 2)
@worker_fn_wrapper
def reduce_scatter_worker_fn():
pynccl_comm = PyNcclCommunicator(
get_world_group().cpu_group, device=get_world_group().device
)
rank = pynccl_comm.rank
world_size = pynccl_comm.world_size
device = f"cuda:{pynccl_comm.rank}"
num_elems = 1000
tensor = (
torch.arange(num_elems, dtype=torch.float32, device=device) + rank * num_elems
)
assert num_elems % world_size == 0
result = torch.zeros(num_elems // world_size, dtype=torch.float32, device=device)
# Calculate expected result for this rank's chunk
scattered_size = num_elems // world_size
all_tensors = [
torch.arange(num_elems, dtype=torch.float32) + r * num_elems
for r in range(world_size)
]
expected = sum(
tensor[rank * scattered_size : (rank + 1) * scattered_size]
for tensor in all_tensors
).to(device)
pynccl_comm.reduce_scatter(result, tensor)
torch.cuda.synchronize()
torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
@pytest.mark.skipif(
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
)
def test_pynccl_reduce_scatter():
distributed_run(reduce_scatter_worker_fn, 2)
@worker_fn_wrapper
def reduce_scatterv_worker_fn():
pynccl_comm = PyNcclCommunicator(
get_world_group().cpu_group, device=get_world_group().device
)
rank = pynccl_comm.rank
world_size = pynccl_comm.world_size
device = f"cuda:{pynccl_comm.rank}"
assert world_size <= 8
sizes = [81, 20, 57, 52, 81, 5, 49, 49][:world_size]
num_elems = sum(sizes)
tensor = torch.arange(num_elems, dtype=torch.float32, device=device) + rank * 100
result = torch.zeros(sizes[rank], dtype=torch.float32, device=device)
# Calculate expected result for this rank's chunk
all_tensors = [
torch.arange(num_elems, dtype=torch.float32) + r * 100
for r in range(world_size)
]
sizes_cumsum = np.cumsum(sizes)
start = 0 if rank == 0 else sizes_cumsum[rank - 1]
end = sizes_cumsum[rank]
expected = sum(tensor[start:end] for tensor in all_tensors).to(device)
pynccl_comm.reduce_scatterv(result, tensor, sizes=sizes)
torch.cuda.synchronize()
torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
@pytest.mark.skipif(
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
)
def test_pynccl_reduce_scatterv():
distributed_run(reduce_scatterv_worker_fn, 2)
@pytest.mark.skipif(
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
)
def test_pynccl_with_cudagraph():
distributed_run(worker_fn_with_cudagraph, 2)
@worker_fn_wrapper
def send_recv_worker_fn():
pynccl_comm = PyNcclCommunicator(
get_world_group().cpu_group, device=get_world_group().device
)
if pynccl_comm.rank == 0:
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank)
else:
tensor = torch.empty(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank)
if pynccl_comm.rank == 0:
pynccl_comm.send(tensor, dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
else:
pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
torch.cuda.synchronize()
assert torch.all(tensor == 1).cpu().item()
@pytest.mark.skipif(
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
)
def test_pynccl_send_recv():
distributed_run(send_recv_worker_fn, 2)
@worker_fn_wrapper
def multiple_send_recv_worker_fn():
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
groups = [
torch.distributed.new_group(ranks=[0, 2], backend="gloo"),
torch.distributed.new_group(ranks=[1, 3], backend="gloo"),
]
group = groups[0] if torch.distributed.get_rank() in [0, 2] else groups[1]
pynccl_comm = PyNcclCommunicator(group=group, device=device)
if torch.distributed.get_rank() == 0:
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
elif torch.distributed.get_rank() == 1:
tensor = 2 * torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
else:
tensor = torch.empty(16, 1024, 1024, dtype=torch.float32, device=device)
if torch.distributed.get_rank() in [0, 1]:
pynccl_comm.send(tensor, dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
else:
pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
torch.cuda.synchronize()
if torch.distributed.get_rank() in [0, 2]:
assert torch.all(tensor == 1).cpu().item()
else:
assert torch.all(tensor == 2).cpu().item()
@pytest.mark.skipif(
torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
)
def test_pynccl_multiple_send_recv():
distributed_run(multiple_send_recv_worker_fn, 4)
@pytest.mark.skipif(
torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
)
def test_pynccl_broadcast():
distributed_run(broadcast_worker_fn, 4)
@worker_fn_wrapper
def broadcast_worker_fn():
# Test broadcast for every root rank.
# Essentially this is an all-gather operation.
pynccl_comm = PyNcclCommunicator(
get_world_group().cpu_group, device=get_world_group().device
)
recv_tensors = [
torch.empty(16, 1024, 1024, dtype=torch.float32, device=pynccl_comm.device)
for i in range(pynccl_comm.world_size)
]
recv_tensors[pynccl_comm.rank] = (
torch.ones(16, 1024, 1024, dtype=torch.float32, device=pynccl_comm.device)
* pynccl_comm.rank
)
for i in range(pynccl_comm.world_size):
pynccl_comm.broadcast(recv_tensors[i], src=i)
# the broadcast op might be launched in a different stream
# need to synchronize to make sure the tensor is ready
torch.cuda.synchronize()
assert torch.all(recv_tensors[i] == i).cpu().item()
def test_ncclGetUniqueId():
unique_id = ncclGetUniqueId()
lib = NCCLLibrary()
unique_id = lib.ncclGetUniqueId()
# `list(unique_id.internal)` is something like this:
# [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0,
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

View File

@@ -1,43 +0,0 @@
import multiprocessing
import tempfile
def target_fn(env, filepath):
from vllm.utils import update_environment_variables
update_environment_variables(env)
from vllm.utils import nccl_integrity_check
nccl_integrity_check(filepath)
def test_library_file():
# note: don't import vllm.distributed.device_communicators.pynccl
# before running this test, otherwise the library file will be loaded
# and it might interfere with the test
from vllm.utils import find_nccl_library
so_file = find_nccl_library()
with open(so_file, 'rb') as f:
content = f.read()
try:
# corrupt the library file, should raise an exception
with open(so_file, 'wb') as f:
f.write(content[:len(content) // 2])
p = multiprocessing.Process(target=target_fn, args=({}, so_file))
p.start()
p.join()
assert p.exitcode != 0
# move the library file to a tmp path
# test VLLM_NCCL_SO_PATH
fd, path = tempfile.mkstemp()
with open(path, 'wb') as f:
f.write(content)
p = multiprocessing.Process(target=target_fn,
args=({
"VLLM_NCCL_SO_PATH": path
}, path))
p.start()
p.join()
assert p.exitcode == 0
finally:
with open(so_file, 'wb') as f:
f.write(content)

View File

@@ -0,0 +1,223 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import multiprocessing
import random
import pytest
import ray
import torch
import torch.distributed as dist
from vllm import _custom_ops as ops
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce # noqa
from vllm.distributed.parallel_state import get_tp_group, graph_capture
from vllm.platforms import current_platform
from ..utils import (
ensure_model_parallel_initialized,
init_test_distributed_environment,
multi_process_parallel,
)
torch.manual_seed(42)
random.seed(44)
# Size over 8MB is sufficient for custom quick allreduce.
test_sizes = [random.randint(8 * 1024 * 1024, 10 * 1024 * 1024) for _ in range(8)]
for i, v in enumerate(test_sizes):
test_sizes[i] -= v % 8
@ray.remote(num_gpus=1, max_calls=1)
def graph_quickreduce(
monkeypatch: pytest.MonkeyPatch,
tp_size,
pp_size,
rank,
distributed_init_port,
):
with monkeypatch.context() as m:
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
ensure_model_parallel_initialized(tp_size, pp_size)
group = get_tp_group().device_group
# A small all_reduce for warmup.
# this is needed because device communicators might be created lazily
# (e.g. NCCL). This will ensure that the communicator is initialized
# before any communication happens, so that this group can be used for
# graph capture immediately.
data = torch.zeros(1)
data = data.to(device=device)
torch.distributed.all_reduce(data, group=group)
torch.cuda.synchronize()
del data
# we use the first group to communicate once
# and the second group to communicate twice
# and so on
# this is used to demonstrate that each group can
# communicate independently
num_communication = rank // tp_size + 1
for sz in test_sizes:
for dtype in [torch.float16, torch.bfloat16]:
with graph_capture(device=device) as graph_capture_context:
inp1 = torch.randint(
1, 23, (sz,), dtype=dtype, device=torch.cuda.current_device()
)
inp2 = torch.randint(
-23, 1, (sz,), dtype=dtype, device=torch.cuda.current_device()
)
torch.cuda.synchronize()
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph, stream=graph_capture_context.stream):
for _ in range(num_communication):
out1 = tensor_model_parallel_all_reduce(inp1)
dist.all_reduce(inp1, group=group)
out2 = tensor_model_parallel_all_reduce(inp2)
dist.all_reduce(inp2, group=group)
graph.replay()
torch.testing.assert_close(out1, inp1, atol=2.5, rtol=0.1)
torch.testing.assert_close(out2, inp2, atol=2.5, rtol=0.1)
@ray.remote(num_gpus=1, max_calls=1)
def eager_quickreduce(
monkeypatch: pytest.MonkeyPatch,
tp_size,
pp_size,
rank,
distributed_init_port,
):
with monkeypatch.context() as m:
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
# Size over 8MB is sufficient for custom quick allreduce.
sz = 16 * 1024 * 1024
fa = get_tp_group().device_communicator.qr_comm
inp = torch.tensor(
[1.0 * ((i) % 23) for i in range(sz)], dtype=torch.float16, device=device
)
out = fa.quick_all_reduce(inp)
torch.testing.assert_close(out, inp * tp_size, atol=2.5, rtol=0.1)
inp = torch.tensor(
[1.0 * ((i) % 23) for i in range(sz)], dtype=torch.bfloat16, device=device
)
out = fa.quick_all_reduce(inp)
torch.testing.assert_close(out, inp * tp_size, atol=2.5, rtol=0.1)
@pytest.mark.skipif(
not current_platform.is_rocm(), reason="only test quick allreduce for rocm"
)
@pytest.mark.parametrize("quant_mode", ["FP", "INT8", "INT6", "INT4"])
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
@pytest.mark.parametrize("test_target", [graph_quickreduce, eager_quickreduce])
def test_custom_quick_allreduce(
monkeypatch: pytest.MonkeyPatch,
tp_size,
pipeline_parallel_size,
test_target,
quant_mode,
):
world_size = tp_size * pipeline_parallel_size
if world_size > torch.cuda.device_count():
pytest.skip("Not enough GPUs to run the test.")
monkeypatch.setenv("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", quant_mode)
multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target)
def qr_variable_input(rank, world_size):
"""
When the tensor parallelism is set to 4 or 8, frequent changes
in the input shape can cause QuickReduce to hang (this issue
has been observed with the gpt_oss model).
"""
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
qr_max_size = None # MB
_ptr = ops.init_custom_qr(rank, world_size, qr_max_size)
ranks = []
for i in range(world_size):
ranks.append(i)
dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:29500",
rank=rank,
world_size=world_size,
)
cpu_group = torch.distributed.new_group(ranks, backend="nccl")
handle = ops.qr_get_handle(_ptr)
world_size = dist.get_world_size(group=cpu_group)
handles = [None] * world_size
dist.all_gather_object(handles, handle, group=cpu_group)
ops.qr_open_handles(_ptr, handles)
num = 1
s1 = 1024
while num < 50000: # 50000 is sufficient to identify issues.
dtype = torch.float16
if num % 2 == 0:
s2 = 1024
inp1 = torch.zeros(
(s1, s2), dtype=dtype, device=torch.cuda.current_device()
)
else:
s2 = 2048
inp1 = torch.ones((s1, s2), dtype=dtype, device=torch.cuda.current_device())
result = torch.empty_like(inp1)
# FP = 0 INT8 = 1 INT6 = 2 INT4 = 3 NONE = 4
ops.qr_all_reduce(_ptr, inp1, result, 3, cast_bf2half=True)
try:
if inp1[0, 0] == 0:
assert torch.all(result == 0)
else:
assert torch.all(result == world_size)
except AssertionError:
print("Assertion failed! Allreduce results are incorrect.")
raise
num += 1
@pytest.mark.skipif(
not current_platform.is_rocm(), reason="only test quick allreduce for rocm"
)
@pytest.mark.parametrize("tp_size", [4, 8])
@pytest.mark.parametrize("pipeline_parallel_size", [1])
def test_custom_quick_allreduce_variable_input(tp_size, pipeline_parallel_size):
world_size = tp_size * pipeline_parallel_size
if world_size > torch.cuda.device_count():
pytest.skip("Not enough GPUs to run the test.")
multiprocessing.set_start_method("spawn", force=True)
# 60s is enough
timeout = 60
processes = []
for rank in range(tp_size):
p = multiprocessing.Process(target=qr_variable_input, args=(rank, tp_size))
p.start()
processes.append((rank, p))
for rank, p in processes:
p.join(timeout=timeout)
if p.is_alive():
for r, proc in processes:
if proc.is_alive():
proc.terminate()
proc.join()
raise RuntimeError(f"QuickReduce hang detected after {timeout} seconds!")
if __name__ == "__main__":
test_custom_quick_allreduce_variable_input(tp_size=4, pipeline_parallel_size=1)

View File

@@ -0,0 +1,49 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import torch
import torch.distributed as dist
from vllm.distributed.parallel_state import in_the_same_node_as
from vllm.distributed.utils import StatelessProcessGroup
from vllm.utils.network_utils import get_ip, get_open_port
def _run_test(pg):
test_result = all(in_the_same_node_as(pg, source_rank=0))
expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
assert test_result == expected, f"Expected {expected}, got {test_result}"
if pg == dist.group.WORLD:
print("Same node test passed! when using torch distributed!")
else:
print("Same node test passed! when using StatelessProcessGroup!")
if __name__ == "__main__":
dist.init_process_group(backend="gloo")
rank = dist.get_rank()
if rank == 0:
port = get_open_port()
ip = get_ip()
dist.broadcast_object_list([ip, port], src=0)
else:
recv = [None, None]
dist.broadcast_object_list(recv, src=0)
ip, port = recv
stateless_pg = StatelessProcessGroup.create(ip, port, rank, dist.get_world_size())
for pg in [dist.group.WORLD, stateless_pg]:
if os.environ.get("VLLM_TEST_WITH_DEFAULT_DEVICE_SET", "0") == "1":
default_devices = ["cpu"]
if torch.cuda.is_available():
default_devices.append("cuda")
for device in default_devices:
torch.set_default_device(device)
_run_test(pg)
else:
_run_test(pg)

View File

@@ -0,0 +1,352 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
(2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
important to set the distributed backend to "mp" to avoid Ray scheduling
all workers in a node other than the head node, which can cause the test
to fail.
"""
import json
import os
from dataclasses import dataclass
from typing import Literal, NamedTuple
import pytest
from vllm.config.compilation import CompilationMode
from vllm.config.model import RunnerOption
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils.torch_utils import is_torch_equal_or_newer
from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import compare_two_settings, create_new_process_for_each_test
logger = init_logger("test_sequence_parallel")
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
class ParallelSetup(NamedTuple):
tp_size: int
pp_size: int
fuse_norm_quant: bool
fuse_act_quant: bool
eager_mode: bool
chunked_prefill: bool
class SPTestOptions(NamedTuple):
multi_node_only: bool
load_format: str | None = None
@dataclass
class SPTestSettings:
parallel_setups: list[ParallelSetup]
distributed_backends: list[str]
runner: RunnerOption
test_options: SPTestOptions
@staticmethod
def detailed(
*,
tp_base: int = 2,
pp_base: int = 1,
multi_node_only: bool = False,
runner: RunnerOption = "auto",
load_format: str | None = None,
):
parallel_setups = []
for eager_mode_val in [False, True]:
for pp_multiplier in [1, 2]:
for chunked_prefill_val in [False, True]:
parallel_setups.append(
ParallelSetup(
tp_size=tp_base,
pp_size=pp_multiplier * pp_base,
fuse_norm_quant=False,
fuse_act_quant=False,
eager_mode=eager_mode_val,
chunked_prefill=chunked_prefill_val,
)
)
return SPTestSettings(
parallel_setups=parallel_setups,
distributed_backends=["mp", "ray"],
runner=runner,
test_options=SPTestOptions(
multi_node_only=multi_node_only, load_format=load_format
),
)
@staticmethod
def fast(
*,
tp_base: int = 2,
pp_base: int = 1,
runner: RunnerOption = "auto",
multi_node_only: bool = False,
load_format: str | None = None,
):
parallel_setups = []
for eager_mode_val in [False, True]:
for pp_multiplier in [1, 2]:
for chunked_prefill_val in [False, True]:
parallel_setups.append(
ParallelSetup(
tp_size=tp_base,
pp_size=pp_multiplier * pp_base,
fuse_norm_quant=False,
fuse_act_quant=False,
eager_mode=eager_mode_val,
chunked_prefill=chunked_prefill_val,
)
)
return SPTestSettings(
parallel_setups=parallel_setups,
distributed_backends=["mp", "ray"],
runner=runner,
test_options=SPTestOptions(
multi_node_only=multi_node_only, load_format=load_format
),
)
@staticmethod
def fp8_quant(
*,
tp_base: int = 2,
pp_base: int = 1,
runner: RunnerOption = "auto",
multi_node_only: bool = False,
load_format: str | None = None,
):
parallel_setups = []
for fusion_val in [False, True]:
parallel_setups.append(
ParallelSetup(
tp_size=tp_base,
pp_size=pp_base,
fuse_norm_quant=fusion_val,
fuse_act_quant=fusion_val,
eager_mode=True,
chunked_prefill=False,
)
)
return SPTestSettings(
parallel_setups=parallel_setups,
distributed_backends=["mp", "ray"],
runner=runner,
test_options=SPTestOptions(
multi_node_only=multi_node_only, load_format=load_format
),
)
def iter_params(self, model_id: str):
opts = self.test_options
for parallel_setup in self.parallel_setups:
for backend in self.distributed_backends:
yield (
model_id,
parallel_setup,
backend,
self.runner,
opts,
)
def _compare_sp(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
runner: RunnerOption,
test_options: SPTestOptions,
num_gpus_available: int,
use_inductor_graph_partition: bool,
fuse_gemm_comms: bool,
*,
method: Literal["generate", "encode"],
is_multimodal: bool,
):
(
tp_size,
pp_size,
fuse_norm_quant,
fuse_act_quant,
eager_mode,
chunked_prefill,
) = parallel_setup
multi_node_only, load_format = test_options
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_transformers_version(on_fail="skip")
trust_remote_code = model_info.trust_remote_code
tokenizer_mode = model_info.tokenizer_mode
hf_overrides = model_info.hf_overrides
require_embed_inputs = model_info.require_embed_inputs
if load_format == "dummy":
# Avoid OOM
text_overrides = {
"num_hidden_layers": 4,
"hidden_size": 512,
"intermediate_size": 800,
"num_attention_heads": 4,
"num_key_value_heads": 1,
}
if is_multimodal:
hf_overrides.update({"text_config": text_overrides})
else:
hf_overrides.update(text_overrides)
else:
model_info.check_available_online(on_fail="skip")
if num_gpus_available < tp_size * pp_size:
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
if VLLM_MULTI_NODE and distributed_backend == "mp":
pytest.skip(
"Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend"
)
if multi_node_only and not VLLM_MULTI_NODE:
pytest.skip("Not in multi-node setting")
common_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"float16",
"--max-model-len",
"2048",
"--max-num-seqs",
"8",
]
if chunked_prefill:
common_args.append("--enable-chunked-prefill")
if eager_mode:
common_args.append("--enforce-eager")
if runner != "auto":
common_args.extend(["--runner", runner])
if trust_remote_code:
common_args.append("--trust-remote-code")
if tokenizer_mode:
common_args.extend(["--tokenizer-mode", tokenizer_mode])
if load_format:
common_args.extend(["--load-format", load_format])
if hf_overrides:
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
if require_embed_inputs:
common_args.extend(
[
"--skip-tokenizer-init",
"--enable-prompt-embeds",
"--enable-mm-embeds",
]
)
compilation_config = {
"mode": CompilationMode.VLLM_COMPILE,
"compile_sizes": [4, 8],
"pass_config": {
"enable_sp": True,
"fuse_gemm_comms": fuse_gemm_comms,
"fuse_norm_quant": fuse_norm_quant,
"fuse_act_quant": fuse_act_quant,
"eliminate_noops": True,
},
"use_inductor_graph_partition": use_inductor_graph_partition,
}
tp_sp_args = [
*common_args,
"--tensor-parallel-size",
str(tp_size),
"--pipeline-parallel-size",
str(pp_size),
"--distributed-executor-backend",
distributed_backend,
"--compilation_config",
json.dumps(compilation_config),
]
tp_args = [
*common_args,
"--tensor-parallel-size",
str(tp_size),
"--distributed-executor-backend",
"mp",
]
compare_two_settings(model_id, tp_sp_args, tp_args, method=method)
SP_TEXT_GENERATION_MODELS = {
# [Decoder-only]
"hmellor/tiny-random-LlamaForCausalLM": SPTestSettings.fast(),
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8": SPTestSettings.fp8_quant(),
}
SP_TEST_MODELS = [
# TODO support other models
# [LANGUAGE GENERATION]
"hmellor/tiny-random-LlamaForCausalLM",
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
]
@pytest.mark.parametrize(
(
"model_id",
"parallel_setup",
"distributed_backend",
"runner",
"test_options",
),
[
params
for model_id, settings in SP_TEXT_GENERATION_MODELS.items()
for params in settings.iter_params(model_id)
if model_id in SP_TEST_MODELS
],
)
@pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
@pytest.mark.parametrize("fuse_gemm_comms", [False]) # TODO: enable async TP
@create_new_process_for_each_test()
def test_tp_sp_generation(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
runner: RunnerOption,
test_options: SPTestOptions,
num_gpus_available,
use_inductor_graph_partition: bool,
fuse_gemm_comms: bool,
):
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
# Skip FP8 SP-only test on sm89 (compute capability 8.9)
if (
"fp8" in model_id.lower()
and current_platform.get_device_capability() < (9, 0)
and (not fuse_gemm_comms)
):
pytest.skip("FP8 reduction support begins with sm90 capable devices.")
_compare_sp(
model_id,
parallel_setup,
distributed_backend,
runner,
test_options,
num_gpus_available,
use_inductor_graph_partition,
fuse_gemm_comms=fuse_gemm_comms,
method="generate",
is_multimodal=False,
)

View File

@@ -0,0 +1,117 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import multiprocessing
import random
import time
import numpy as np
import torch.distributed as dist
from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
from vllm.distributed.utils import StatelessProcessGroup
from vllm.utils.network_utils import get_open_port
from vllm.utils.system_utils import update_environment_variables
def get_arrays(n: int, seed: int = 0) -> list[np.ndarray]:
np.random.seed(seed)
sizes = np.random.randint(1, 10_000, n)
# on average, each array will have 5k elements
# with int64, each array will have 40kb
return [np.random.randint(1, 100, i) for i in sizes]
def distributed_run(fn, world_size):
number_of_processes = world_size
processes = []
for i in range(number_of_processes):
env = {}
env["RANK"] = str(i)
env["LOCAL_RANK"] = str(i)
env["WORLD_SIZE"] = str(number_of_processes)
env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
env["MASTER_ADDR"] = "localhost"
env["MASTER_PORT"] = "12345"
p = multiprocessing.Process(target=fn, args=(env,))
processes.append(p)
p.start()
for p in processes:
p.join()
for p in processes:
assert p.exitcode == 0
def worker_fn_wrapper(fn):
# `multiprocessing.Process` cannot accept environment variables directly
# so we need to pass the environment variables as arguments
# and update the environment variables in the function
def wrapped_fn(env):
update_environment_variables(env)
dist.init_process_group(backend="gloo")
fn()
return wrapped_fn
@worker_fn_wrapper
def worker_fn():
rank = dist.get_rank()
if rank == 0:
port = get_open_port()
ip = "127.0.0.1"
dist.broadcast_object_list([ip, port], src=0)
else:
recv = [None, None]
dist.broadcast_object_list(recv, src=0)
ip, port = recv # type: ignore
stateless_pg = StatelessProcessGroup.create(ip, port, rank, dist.get_world_size())
for pg in [dist.group.WORLD, stateless_pg]:
writer_rank = 2
broadcaster = MessageQueue.create_from_process_group(
pg, 40 * 1024, 2, writer_rank
)
if rank == writer_rank:
seed = random.randint(0, 1000)
dist.broadcast_object_list([seed], writer_rank)
else:
recv = [None]
dist.broadcast_object_list(recv, writer_rank)
seed = recv[0] # type: ignore
if pg == dist.group.WORLD:
dist.barrier()
else:
pg.barrier()
# in case we find a race condition
# print the seed so that we can reproduce the error
print(f"Rank {rank} got seed {seed}")
# test broadcasting with about 400MB of data
N = 10_000
if rank == writer_rank:
arrs = get_arrays(N, seed)
for x in arrs:
broadcaster.broadcast_object(x)
time.sleep(random.random() / 1000)
else:
arrs = get_arrays(N, seed)
for x in arrs:
y = broadcaster.broadcast_object(None)
assert np.array_equal(x, y)
time.sleep(random.random() / 1000)
if pg == dist.group.WORLD:
dist.barrier()
print(f"torch distributed passed the test! Rank {rank}")
else:
pg.barrier()
print(f"StatelessProcessGroup passed the test! Rank {rank}")
def test_shm_broadcast():
distributed_run(worker_fn, 4)

View File

@@ -0,0 +1,243 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import traceback
import unittest
import numpy as np
from vllm.distributed.device_communicators.shm_object_storage import (
SingleWriterShmRingBuffer,
)
class TestSingleWriterShmRingBuffer(unittest.TestCase):
"""Test suite for the ring buffer implementation"""
def setUp(self):
"""Set up test fixtures"""
self.buffer_size = 4096
self.ring_buffer = None
def tearDown(self):
"""Clean up after tests"""
if self.ring_buffer:
del self.ring_buffer
def test_buffer_opening(self):
"""Test opening an existing buffer"""
# First create a buffer
self.ring_buffer = SingleWriterShmRingBuffer(
data_buffer_size=self.buffer_size, create=True
)
# Then open it with another instance
reader_buffer = SingleWriterShmRingBuffer(*self.ring_buffer.handle())
self.assertFalse(reader_buffer.is_writer)
self.assertEqual(
reader_buffer.shared_memory.name, self.ring_buffer.shared_memory.name
)
def test_buffer_access(self):
"""Test accessing allocated buffers"""
self.ring_buffer = SingleWriterShmRingBuffer(
data_buffer_size=self.buffer_size, create=True
)
size = 100
address, monotonic_id = self.ring_buffer.allocate_buf(size)
# Write some test data
test_data = b"Hello, World!" * 7 # 91 bytes
with self.ring_buffer.access_buf(address) as (data_buf, metadata):
data_buf[0 : len(test_data)] = test_data
# Read it back
with self.ring_buffer.access_buf(address) as (data_buf2, metadata2):
read_data = bytes(data_buf2[0 : len(test_data)])
read_id = metadata2[0]
self.assertEqual(read_data, test_data)
self.assertEqual(read_id, monotonic_id)
def test_memory_error_on_full_buffer(self):
"""Test that MemoryError is raised when buffer is full"""
small_buffer_size = 200
self.ring_buffer = SingleWriterShmRingBuffer(
data_buffer_size=small_buffer_size, create=True
)
# Fill up the buffer
self.ring_buffer.allocate_buf(100)
self.ring_buffer.allocate_buf(80) # Total: 196 bytes used
# This should fail
with self.assertRaises(MemoryError):
self.ring_buffer.allocate_buf(1) # Would exceed buffer capacity
def test_allocation_and_free(self):
"""Test allocation and freeing of buffers"""
small_buffer_size = 200
self.ring_buffer = SingleWriterShmRingBuffer(
data_buffer_size=small_buffer_size, create=True
)
size = 80
# Write some data
test_data = b"Repeated test data"
for i in range(5):
address, monotonic_id = self.ring_buffer.allocate_buf(size)
with self.ring_buffer.access_buf(address) as (data_buf, metadata):
data_buf[0:4] = (0).to_bytes(4, "little") # 0 for not in-use
data_buf[4 : len(test_data) + 4] = test_data
print(self.ring_buffer.metadata)
freed_ids = self.ring_buffer.free_buf(lambda *args: True)
print(f" Freed IDs: {freed_ids}")
self.assertEqual(freed_ids[0], i)
def test_clear_buffer(self):
"""Test clearing the buffer"""
self.ring_buffer = SingleWriterShmRingBuffer(
data_buffer_size=self.buffer_size, create=True
)
# Allocate some buffers
for _ in range(3):
self.ring_buffer.allocate_buf(100)
# Clear the buffer
self.ring_buffer.clear()
# Check that metadata is empty and IDs reset
self.assertEqual(len(self.ring_buffer.metadata), 0)
self.assertEqual(self.ring_buffer.monotonic_id_start, 0)
self.assertEqual(self.ring_buffer.monotonic_id_end, 0)
self.assertEqual(self.ring_buffer.data_buffer_start, 0)
self.assertEqual(self.ring_buffer.data_buffer_end, 0)
def test_allocation_cycles(self):
buffer_size = 100
ring = SingleWriterShmRingBuffer(data_buffer_size=buffer_size, create=True)
# tracking allocations for assertions
allocated_bitmap = np.zeros(
(buffer_size,), dtype=np.bool_
) # addr -> is_allocated
allocation_map = dict() # monotonic_id -> (addr, size)
def count_allocated(bitmap) -> int:
return np.sum(bitmap).item()
def is_free_fn(a, b) -> bool:
return True
def mark_allocated_with_assertion(id, addr, size):
addr = addr % buffer_size
self.assertEqual(count_allocated(allocated_bitmap[addr : addr + size]), 0)
allocated_bitmap[addr : addr + size] = True
allocation_map[id] = (addr, size)
def mark_freed_with_assertion(id):
self.assertTrue(id in allocation_map)
addr, size = allocation_map.pop(id)
addr = addr % buffer_size
self.assertEqual(
count_allocated(allocated_bitmap[addr : addr + size]), size
)
allocated_bitmap[addr : addr + size] = False
def ring_free(free_size=None):
freed_ids = ring.free_buf(is_free_fn, free_size)
for freed_id in freed_ids:
mark_freed_with_assertion(freed_id)
def ring_allocate(allocate_size):
allocate_size_with_md = allocate_size + ring.MD_SIZE
try:
addr, monotonic_id = ring.allocate_buf(allocate_size)
mark_allocated_with_assertion(monotonic_id, addr, allocate_size_with_md)
except MemoryError:
# free 2x size for enough space if wrapping happened
ring_free(allocate_size_with_md * 2)
# retry allocating
addr, monotonic_id = ring.allocate_buf(allocate_size)
mark_allocated_with_assertion(monotonic_id, addr, allocate_size_with_md)
# 1. allocation & free cycles
for _ in range(33):
# will consume 2 + 8 = 10 bytes per allocation
ring_allocate(2)
# 2. free all allocations
ring_free()
# 3. try allocate the largest possible buffer
ring_allocate(buffer_size - ring.MD_SIZE)
def main():
"""Main function demonstrating usage and running tests"""
print("=== SingleWriterShmRingBuffer Test Suite ===\n")
# Run unit tests
print("Running unit tests...")
unittest.main(argv=[""], exit=False, verbosity=2)
print("\n" + "=" * 50)
print("=== Manual Demo ===\n")
# Manual demonstration
try:
print("Creating ring buffer...")
writer_buffer = SingleWriterShmRingBuffer(data_buffer_size=2048, create=True)
reader_buffer = SingleWriterShmRingBuffer(*writer_buffer.handle())
print(f"Buffer created with name: {writer_buffer.shared_memory.name}")
# Allocate some buffers
print("\nAllocating buffers...")
address_array = []
for i in range(3):
size = 100 + i * 50
try:
writer_buffer.free_buf(lambda *args: True)
address, monotonic_id = writer_buffer.allocate_buf(size)
address_array.append((address, size, monotonic_id))
# Write some test data
with writer_buffer.access_buf(address) as (data_buf, metadata):
test_message = f"Test message {i}".encode()
data_buf[0 : len(test_message)] = test_message
except MemoryError as e:
print(f" Failed to allocate {size} bytes: {e}")
print("\nBuffer state:")
print(f" Data buffer start: {writer_buffer.data_buffer_start}")
print(f" Data buffer end: {writer_buffer.data_buffer_end}")
print(f" Monotonic ID start: {writer_buffer.monotonic_id_start}")
print(f" Monotonic ID end: {writer_buffer.monotonic_id_end}")
print(f" Metadata entries: {len(writer_buffer.metadata)}")
# Try to read back the data
print("\nReading back data...")
for address, size, monotonic_id in address_array:
with reader_buffer.access_buf(address) as (data_buf, metadata):
# Find null terminator or read first 50 chars
data_bytes = bytes(data_buf[0:size])
message = data_bytes.decode()
print(f" ID {monotonic_id}: '{message}'")
except Exception as e:
print(f"Demo error: {e}")
traceback.print_exc()
print("\n=== Demo Complete ===")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,327 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import multiprocessing
import random
import time
import traceback
import unittest
from multiprocessing import Lock
import torch
# Assuming these are imported from your module
from vllm.distributed.device_communicators.shm_object_storage import (
MsgpackSerde,
SingleWriterShmObjectStorage,
SingleWriterShmRingBuffer,
)
from vllm.multimodal.inputs import (
MultiModalFieldElem,
MultiModalKwargsItem,
MultiModalSharedField,
)
def _dummy_elem(modality: str, key: str, size: int):
return MultiModalFieldElem(
modality=modality,
key=key,
data=torch.empty((size,), dtype=torch.int8),
field=MultiModalSharedField(batch_size=1),
)
def _dummy_item(modality: str, size_by_key: dict[str, int]):
return MultiModalKwargsItem.from_elems(
[_dummy_elem(modality, key, size) for key, size in size_by_key.items()]
)
class TestSingleWriterShmObjectStorage(unittest.TestCase):
def setUp(self):
"""Set up test fixtures before each test method."""
ring_buffer = SingleWriterShmRingBuffer(
data_buffer_size=1024 * 100,
create=True, # 10 MB buffer
)
self.storage = SingleWriterShmObjectStorage(
max_object_size=1024 * 10, # 10KB max object
n_readers=2,
ring_buffer=ring_buffer,
serde_class=MsgpackSerde,
reader_lock=Lock(),
)
def tearDown(self):
"""Clean up after each test."""
if self.storage:
del self.storage
def test_minimal_put_get_cycle(self):
"""Test basic put and get operations."""
key = "test_key"
value = _dummy_item("text", {"field1": 10, "field2": 20})
# Put operation
address, monotonic_id = self.storage.put(key, value)
# Verify key is in index
self.assertIn(key, self.storage.key_index)
self.assertEqual(self.storage.key_index[key], (address, monotonic_id))
self.assertEqual(self.storage.id_index[monotonic_id], key)
# Get operation
result = self.storage.get(address, monotonic_id)
# Verify result
self.assertEqual(result, value)
def test_put_same_key_twice(self):
"""Test behavior when putting the same key multiple times."""
key = "duplicate_key"
value1 = "first value"
value2 = "second value"
# First put
address1, id1 = self.storage.put(key, value1)
retrieved1 = self.storage.get(address1, id1)
self.assertEqual(retrieved1, value1)
# should raise an error on second put
with self.assertRaises(ValueError) as context:
self.storage.put(key, value2)
self.assertIn("already exists in the storage", str(context.exception))
def test_large_object_rejection(self):
"""Test that objects exceeding max_object_size are rejected."""
# Create an object larger than max_object_size
large_data = "x" * (self.storage.max_object_size + 100)
with self.assertRaises(ValueError) as context:
self.storage.put("large_key", large_data)
self.assertIn("exceeds max object size", str(context.exception))
def test_buffer_overflow_and_cleanup(self):
"""Test behavior when buffer fills up and needs cleanup."""
# Fill up the buffer with many small objects
stored_items = []
try:
for i in range(1000): # Try to store many items
key = f"item_{i}"
value = f"data_{i}" * 100 # Make it reasonably sized
address, monotonic_id = self.storage.put(key, value)
stored_items.append((key, value, address, monotonic_id))
except MemoryError:
print(f"Buffer filled after {len(stored_items)} items")
# Verify that some items are still accessible
accessible_count = 0
for key, original_value, address, monotonic_id in stored_items:
for i in range(self.storage.n_readers):
retrieved = self.storage.get(address, monotonic_id)
if retrieved == original_value:
accessible_count += 1
self.assertEqual(accessible_count, len(stored_items))
try:
for i in range(len(stored_items), 1000): # Try to store many items
key = f"item_{i}"
value = f"data_{i}" * 100 # Make it reasonably sized
address, monotonic_id = self.storage.put(key, value)
stored_items.append((key, value, address, monotonic_id))
except MemoryError:
print(f"Buffer filled after {len(stored_items)} items")
# Verify that some items are still accessibles
for key, original_value, address, monotonic_id in stored_items:
try:
for i in range(self.storage.n_readers):
retrieved = self.storage.get(address, monotonic_id)
if retrieved == original_value:
accessible_count += 1
except ValueError as e:
print(f"Error retrieving {key}: {e}")
# some items from the first batch may still be accessible
self.assertGreaterEqual(accessible_count, len(stored_items))
def test_blocking_unread_object(self):
"""Test behavior when buffer fills up and needs cleanup."""
# Fill up the buffer with many small objects
stored_items = []
try:
for i in range(1000): # Try to store many items
key = f"item_{i}"
value = f"data_{i}" * 100 # Make it reasonably sized
address, monotonic_id = self.storage.put(key, value)
stored_items.append((key, value, address, monotonic_id))
except MemoryError:
print(f"Buffer filled after {len(stored_items)} items")
# read all items except the first one
# to simulate a blocking situation
accessible_count = 0
for key, original_value, address, monotonic_id in stored_items[1:]:
for i in range(self.storage.n_readers):
retrieved = self.storage.get(address, monotonic_id)
if retrieved == original_value:
accessible_count += 1
self.assertEqual(accessible_count, len(stored_items) - 1)
try:
key = f"item_{len(stored_items)}"
value = f"data_{len(stored_items)}" * 100
address, monotonic_id = self.storage.put(key, value)
except MemoryError:
print(f"Buffer filled after {len(stored_items)} items")
# read the first item
for i in range(self.storage.n_readers):
key, original_value, address, monotonic_id = stored_items[0]
retrieved = self.storage.get(address, monotonic_id)
self.assertEqual(retrieved, original_value)
try:
for i in range(len(stored_items), 1000): # Try to store many items
key = f"item_{i}"
value = f"data_{i}" * 100 # Make it reasonably sized
address, monotonic_id = self.storage.put(key, value)
stored_items.append((key, value, address, monotonic_id))
except MemoryError:
print(f"Buffer filled after {len(stored_items)} items")
# some items from the first batch may still be accessible
self.assertGreaterEqual(len(stored_items), accessible_count + 10)
def test_invalid_get_operations(self):
"""Test various invalid get operations."""
# Test with non-existent address
with self.assertRaises(ValueError): # Could be various exceptions
self.storage.get(99999, 1)
# Store something first
address, monotonic_id = self.storage.put("test", "value")
# Test with wrong monotonic_id
with self.assertRaises(ValueError) as context:
self.storage.get(address, monotonic_id + 100)
self.assertIn("has been modified or is invalid", str(context.exception))
def test_clear_storage(self):
"""Test clearing the storage."""
# Store some items
for i in range(5):
self.storage.put(f"item_{i}", f"value_{i}")
# Clear the storage
self.storage.clear()
# Verify that all indices are empty
self.assertEqual(len(self.storage.key_index), 0)
self.assertEqual(len(self.storage.id_index), 0)
self.assertEqual(len(self.storage.ring_buffer.metadata), 0)
# Verify that new items can be added after clearing
address, monotonic_id = self.storage.put("new_item", "new_value")
self.assertIn("new_item", self.storage.key_index)
self.assertEqual((address, monotonic_id), (0, 0))
# Reader process function
def reader_process(process_id, storage_handle, items_to_read):
"""Reader process that connects to existing shared memory and reads data."""
reader_storage = SingleWriterShmObjectStorage.create_from_handle(storage_handle)
print(f"Reader {process_id} started")
errors = []
for key, original_value, address, monotonic_id in items_to_read:
time.sleep(random.random() / 100)
try:
# Read data from shared memory
retrieved_value = reader_storage.get(address, monotonic_id)
# Verify data integrity
assert retrieved_value == original_value
print(f"Reader {process_id} retrieved {key}: {retrieved_value}")
except Exception as e:
errors.append((key, str(e), type(e).__name__))
def run_multiprocess_example():
"""Run a minimal working example with real shared memory."""
print("=== Minimal Object Storage Example ===")
try:
# Create storage instance
ring_buffer = SingleWriterShmRingBuffer(
data_buffer_size=1024 * 100,
create=True, # 10 MB buffer
)
storage = SingleWriterShmObjectStorage(
max_object_size=1024,
n_readers=3,
ring_buffer=ring_buffer,
serde_class=MsgpackSerde,
reader_lock=Lock(),
)
print(f"Created storage (writer: {storage.is_writer})")
# Test basic data types
test_data = [
("user_data", {"name": "Alice", "age": 30, "scores": [95, 87, 92]}),
("simple_string", "Hello, World!"),
("number", 42),
("list_data", [1, 2, 3, "four", 5.0]),
]
stored_items = []
# Store all data
for key, value in test_data:
print(f"Storing {key}: {value}")
address, monotonic_id = storage.put(key, value)
stored_items.append((key, value, address, monotonic_id))
print(f" -> Stored at address {address}, ID {monotonic_id}")
print("\n--- Retrieving Data ---")
processes = []
handle = storage.handle()
# initialize lock for reader processes
handle.reader_lock = Lock()
for i in range(storage.n_readers):
p = multiprocessing.Process(
target=reader_process, args=(i, handle, stored_items)
)
processes.append(p)
p.start()
for p in processes:
p.join(timeout=10)
if p.is_alive():
p.terminate()
p.join()
except Exception as e:
print(f"Error in minimal example: {e}")
traceback.print_exc()
if __name__ == "__main__":
# Run the minimal example first
run_multiprocess_example()
print("\n" + "=" * 50 + "\n")
# Run the test suite
print("Running comprehensive test suite...")
unittest.main(verbosity=2, exit=False)

View File

@@ -0,0 +1,140 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import queue
import random
import typing
import pytest
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import vllm.envs as envs
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
from vllm.distributed.device_communicators.cuda_communicator import CudaCommunicator
from vllm.distributed.parallel_state import (
get_tp_group,
init_distributed_environment,
initialize_model_parallel,
)
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables
torch.manual_seed(42)
random.seed(44)
test_size_elements = 1024 * 1024
def symm_mem_allreduce_worker(local_rank: int, world_size: int, q: mp.Queue):
monkeypatch = pytest.MonkeyPatch()
config = VllmConfig(parallel_config=ParallelConfig(tensor_parallel_size=world_size))
with monkeypatch.context() as m, set_current_vllm_config(config):
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
dtype = torch.bfloat16
device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
torch.set_default_device(device)
torch.set_default_dtype(dtype)
update_environment_variables(
{
"RANK": str(local_rank),
"LOCAL_RANK": str(local_rank),
"WORLD_SIZE": str(world_size),
"MASTER_ADDR": "localhost",
"MASTER_PORT": "12345",
}
)
init_distributed_environment()
initialize_model_parallel(tensor_model_parallel_size=world_size)
cuda_communicator = typing.cast(
CudaCommunicator, get_tp_group().device_communicator
)
symm_mem_comm = cuda_communicator.symm_mem_comm
if symm_mem_comm is None or symm_mem_comm.disabled:
# can't use skip under multiprocessing
q.put("SymmMemCommunicator is not available or disabled.")
return
inp_direct_symm_mem = torch.randint(
1, 23, (test_size_elements,), dtype=dtype, device=device
)
if not symm_mem_comm.should_use_symm_mem(inp_direct_symm_mem):
# can't use skip under multiprocessing
q.put("SymmMemCommunicator isn't used for this world and input size.")
return
original_inp_direct_symm_mem = inp_direct_symm_mem.clone()
out_direct_symm_mem = symm_mem_comm.all_reduce(inp_direct_symm_mem)
assert out_direct_symm_mem is not None
group = get_tp_group().device_group
dist.all_reduce(original_inp_direct_symm_mem, group=group)
torch.testing.assert_close(
out_direct_symm_mem, original_inp_direct_symm_mem, atol=2.5, rtol=0.1
)
# Test tensor_model_parallel_all_reduce which should use symm_mem
inp_tensor_parallel = torch.randint(
-23, 1, (test_size_elements,), dtype=dtype, device=device
)
original_inp_tensor_parallel = inp_tensor_parallel.clone()
out_tensor_parallel = tensor_model_parallel_all_reduce(inp_tensor_parallel)
dist.all_reduce(original_inp_tensor_parallel, group=group)
torch.testing.assert_close(
out_tensor_parallel, original_inp_tensor_parallel, atol=2.5, rtol=0.1
)
@pytest.mark.skipif(
not current_platform.is_cuda(),
reason="SymmMemAllreduce is only available for CUDA platforms.",
)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pipeline_parallel_size", [1])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
def test_symm_mem_allreduce(
monkeypatch: pytest.MonkeyPatch, tp_size, pipeline_parallel_size
):
world_size = tp_size * pipeline_parallel_size
if world_size > torch.cuda.device_count():
pytest.skip("Not enough GPUs to run the test.")
q = mp.get_context("spawn").Queue()
mp.spawn(symm_mem_allreduce_worker, args=(world_size, q), nprocs=world_size)
try:
val = q.get(timeout=1)
except queue.Empty:
val = None
finally:
cleanup_dist_env_and_memory()
if val is not None:
pytest.skip(val)
@pytest.mark.skipif(
not current_platform.is_cuda(),
reason="SymmMemAllreduce is only available for CUDA platforms.",
)
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
def test_dp_with_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch):
world_size = 4
if world_size > torch.cuda.device_count():
pytest.skip("Not enough GPUs to run the test.")
# Verify that the DataParallel runs without error
engine_args = EngineArgs(
model="distilbert/distilgpt2",
enforce_eager=True,
enable_prefix_caching=True,
data_parallel_size=2,
tensor_parallel_size=2,
data_parallel_backend="mp",
)
LLMEngine.from_engine_args(engine_args)

View File

@@ -0,0 +1,69 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# unit test for `examples/offline_inference/torchrun_example.py`
import os
import random
import torch.distributed as dist
from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import get_world_group
dist.init_process_group(backend="gloo")
# Create prompts
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# set different `gpu_memory_utilization` and `swap_space` for different ranks,
# to test if all ranks agree on the same kv cache configuration.
llm = LLM(
model="facebook/opt-125m",
tensor_parallel_size=2,
pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)),
distributed_executor_backend="external_launcher",
gpu_memory_utilization=random.uniform(0.7, 0.9),
swap_space=random.randint(1, 4),
seed=0,
)
outputs = llm.generate(prompts, sampling_params)
cpu_group = get_world_group().cpu_group
torch_rank = dist.get_rank(group=cpu_group)
def test_consistent_across_ranks(obj):
if torch_rank == 0:
dist.broadcast_object_list([obj], src=0, group=cpu_group)
else:
container = [None]
dist.broadcast_object_list(container, src=0, group=cpu_group)
assert container[0] == obj
test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)
# make sure we can access the model parameters from the calling process
# of the `LLM` instance.
params = list(
llm.llm_engine.model_executor.driver_worker.worker.model_runner.model.parameters()
)
test_consistent_across_ranks(len(params))
# all ranks should have the same outputs
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
test_consistent_across_ranks(prompt)
test_consistent_across_ranks(generated_text)
print(f"Rank {torch_rank}, Prompt: {prompt!r}, Generated text: {generated_text!r}")

View File

@@ -0,0 +1,76 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# unit test for `examples/offline_inference/torchrun_example.py`
import os
import random
import torch.distributed as dist
from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import get_tp_group, get_world_group
dist.init_process_group(backend="gloo")
# Create prompts
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
] * 10
dp_size = int(os.getenv("DP_SIZE", "1"))
dp_rank = int(os.getenv("DP_RANK", "0"))
if dp_size > 1:
# distribute the prompts across the data parallel ranks
prompts = [prompt for idx, prompt in enumerate(prompts) if idx % dp_size == dp_rank]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# set different `gpu_memory_utilization` and `swap_space` for different ranks,
# to test if all ranks agree on the same kv cache configuration.
llm = LLM(
model="microsoft/Phi-mini-MoE-instruct",
tensor_parallel_size=int(os.getenv("TP_SIZE", "1")),
pipeline_parallel_size=int(os.getenv("PP_SIZE", "1")),
enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
distributed_executor_backend="external_launcher",
gpu_memory_utilization=random.uniform(0.7, 0.9),
swap_space=random.randint(1, 4),
seed=0,
)
outputs = llm.generate(prompts, sampling_params)
group = get_world_group() if dp_size == 1 else get_tp_group()
cpu_group = group.cpu_group
group_rank = dist.get_rank(group=cpu_group)
def test_consistent_across_ranks(obj):
if group_rank == 0:
dist.broadcast_object_list([obj], src=group.ranks[0], group=cpu_group)
else:
container = [None]
dist.broadcast_object_list(container, src=group.ranks[0], group=cpu_group)
assert container[0] == obj
test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)
# make sure we can access the model parameters from the calling process
# of the `LLM` instance.
params = list(
llm.llm_engine.model_executor.driver_worker.worker.model_runner.model.parameters()
)
test_consistent_across_ranks(len(params))
# all ranks should have the same outputs
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
test_consistent_across_ranks(prompt)
test_consistent_across_ranks(generated_text)
print(f"Rank {group_rank}, Prompt: {prompt!r}, Generated text: {generated_text!r}")

View File

@@ -0,0 +1,139 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import socket
import pytest
import ray
import torch
import vllm.envs as envs
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.distributed.utils import StatelessProcessGroup
from vllm.utils.network_utils import get_open_port
from vllm.utils.system_utils import update_environment_variables
from vllm.utils.torch_utils import cuda_device_count_stateless
from ..utils import multi_gpu_test
@ray.remote
class _CUDADeviceCountStatelessTestActor:
def get_count(self):
return cuda_device_count_stateless()
def set_cuda_visible_devices(self, cuda_visible_devices: str):
update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
def get_cuda_visible_devices(self):
return envs.CUDA_VISIBLE_DEVICES
def test_cuda_device_count_stateless():
"""Test that cuda_device_count_stateless changes return value if
CUDA_VISIBLE_DEVICES is changed."""
actor = _CUDADeviceCountStatelessTestActor.options( # type: ignore
num_gpus=2
).remote()
assert len(sorted(ray.get(actor.get_cuda_visible_devices.remote()).split(","))) == 2
assert ray.get(actor.get_count.remote()) == 2
ray.get(actor.set_cuda_visible_devices.remote("0"))
assert ray.get(actor.get_count.remote()) == 1
ray.get(actor.set_cuda_visible_devices.remote(""))
assert ray.get(actor.get_count.remote()) == 0
def cpu_worker(rank, WORLD_SIZE, port1, port2):
pg1 = StatelessProcessGroup.create(
host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
)
if rank <= 2:
pg2 = StatelessProcessGroup.create(
host="127.0.0.1", port=port2, rank=rank, world_size=3
)
data = torch.tensor([rank])
data = pg1.broadcast_obj(data, src=2)
assert data.item() == 2
if rank <= 2:
data = torch.tensor([rank + 1])
data = pg2.broadcast_obj(data, src=2)
assert data.item() == 3
pg2.barrier()
pg1.barrier()
def gpu_worker(rank, WORLD_SIZE, port1, port2):
torch.cuda.set_device(rank)
pg1 = StatelessProcessGroup.create(
host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
)
pynccl1 = PyNcclCommunicator(pg1, device=rank)
if rank <= 2:
pg2 = StatelessProcessGroup.create(
host="127.0.0.1", port=port2, rank=rank, world_size=3
)
pynccl2 = PyNcclCommunicator(pg2, device=rank)
data = torch.tensor([rank]).cuda()
pynccl1.all_reduce(data)
pg1.barrier()
torch.cuda.synchronize()
if rank <= 2:
pynccl2.all_reduce(data)
pg2.barrier()
torch.cuda.synchronize()
item = data[0].item()
print(f"rank: {rank}, item: {item}")
if rank == 3:
assert item == 6
else:
assert item == 18
def broadcast_worker(rank, WORLD_SIZE, port1, port2):
pg1 = StatelessProcessGroup.create(
host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
)
if rank == 2:
pg1.broadcast_obj("secret", src=2)
else:
obj = pg1.broadcast_obj(None, src=2)
assert obj == "secret"
pg1.barrier()
def allgather_worker(rank, WORLD_SIZE, port1, port2):
pg1 = StatelessProcessGroup.create(
host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
)
data = pg1.all_gather_obj(rank)
assert data == list(range(WORLD_SIZE))
pg1.barrier()
@pytest.mark.skip(reason="This test is flaky and prone to hang.")
@multi_gpu_test(num_gpus=4)
@pytest.mark.parametrize(
"worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker]
)
def test_stateless_process_group(worker):
port1 = get_open_port()
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("", port1))
port2 = get_open_port()
WORLD_SIZE = 4
from multiprocessing import get_context
ctx = get_context("fork")
processes = []
for i in range(WORLD_SIZE):
rank = i
processes.append(
ctx.Process(target=worker, args=(rank, WORLD_SIZE, port1, port2))
)
for p in processes:
p.start()
for p in processes:
p.join()
for p in processes:
assert not p.exitcode
print("All processes finished.")