Sync from v0.13
This commit is contained in:
0
tests/distributed/__init__.py
Normal file
0
tests/distributed/__init__.py
Normal file
168
tests/distributed/conftest.py
Normal file
168
tests/distributed/conftest.py
Normal file
@@ -0,0 +1,168 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import random
|
||||
|
||||
import msgspec
|
||||
import msgspec.msgpack
|
||||
import pytest
|
||||
import zmq
|
||||
|
||||
from vllm.config.kv_events import KVEventsConfig
|
||||
from vllm.distributed.kv_events import EventPublisherFactory
|
||||
|
||||
from .test_events import SampleBatch
|
||||
|
||||
DP_RANK = 0
|
||||
|
||||
|
||||
@pytest.fixture
def random_port():
    """Pick a pseudo-random port number for the test endpoints.

    The upper bound stays below 60000 so that the ``+100`` replay-port
    offset used by ``publisher_config`` cannot leave the valid port range.
    """
    low, high = 10000, 59900
    return random.randint(low, high)
|
||||
|
||||
|
||||
@pytest.fixture
def publisher_config(random_port, request):
    """Build a ``KVEventsConfig`` for the ZMQ publisher under test.

    The transport is taken from indirect parametrization
    (``request.param``) and defaults to ``inproc`` when the fixture is
    used unparametrized.
    """
    transport = getattr(request, "param", "inproc")

    if transport == "inproc":
        endpoint = f"inproc://test-{random_port}"
        replay_endpoint = f"{endpoint}-replay"
    else:
        # TCP: the replay channel lives 100 ports above the main endpoint.
        endpoint = f"tcp://*:{random_port}"
        replay_endpoint = f"tcp://*:{random_port + 100}"

    return KVEventsConfig(
        enable_kv_cache_events=True,
        publisher="zmq",
        endpoint=endpoint,
        replay_endpoint=replay_endpoint,
        buffer_steps=100,
        hwm=1000,
        topic="test",
    )
|
||||
|
||||
|
||||
@pytest.fixture
def publisher(publisher_config):
    """Yield a publisher instance and shut it down on teardown."""
    pub = EventPublisherFactory.create(publisher_config, DP_RANK)
    try:
        yield pub
    finally:
        # Release the publisher's sockets/threads after the test body.
        pub.shutdown()
|
||||
|
||||
|
||||
@pytest.fixture
def subscriber(publisher_config):
    """Yield a ``MockSubscriber`` wired to the publisher's endpoints."""
    main_ep = publisher_config.endpoint
    replay_ep = publisher_config.replay_endpoint

    # A wildcard bind address ("tcp://*") is not connectable from the
    # subscriber side; point at loopback instead.
    if main_ep.startswith("tcp://*"):
        main_ep = main_ep.replace("*", "127.0.0.1")
    if replay_ep and replay_ep.startswith("tcp://*"):
        replay_ep = replay_ep.replace("*", "127.0.0.1")

    sub = MockSubscriber(
        [main_ep],
        [replay_ep] if replay_ep else None,
        publisher_config.topic,
    )
    yield sub
    sub.close()
|
||||
|
||||
|
||||
class MockSubscriber:
    """Test helper that consumes and records published KV events.

    A SUB socket is connected to every publisher endpoint, and an optional
    REQ socket per replay endpoint lets tests exercise the replay protocol
    (request messages from a given sequence number and read them back).
    """

    def __init__(
        self,
        pub_endpoints: str | list[str],
        replay_endpoints: str | list[str] | None = None,
        topic: str = "",
        decode_type=SampleBatch,
    ):
        self.ctx = zmq.Context.instance()

        # Convert single endpoint to list for consistency
        if isinstance(pub_endpoints, str):
            pub_endpoints = [pub_endpoints]
        if isinstance(replay_endpoints, str):
            replay_endpoints = [replay_endpoints]

        # Set up subscriber socket - connect to all endpoints
        self.sub = self.ctx.socket(zmq.SUB)
        self.sub.setsockopt(zmq.SUBSCRIBE, topic.encode("utf-8"))
        for endpoint in pub_endpoints:
            self.sub.connect(endpoint)

        # Set up one REQ replay socket per replay endpoint, if provided.
        self.replay_sockets: list[zmq.Socket] = []
        if replay_endpoints:
            for replay_endpoint in replay_endpoints:
                replay = self.ctx.socket(zmq.REQ)
                replay.connect(replay_endpoint)
                self.replay_sockets.append(replay)

        self.topic = topic
        self.topic_bytes = topic.encode("utf-8")
        # All (seq, payload) pairs seen via receive_one, in arrival order.
        self.received_msgs: list[tuple[int, SampleBatch]] = []
        self.last_seq = -1  # sequence number of the most recent message
        self.decoder = msgspec.msgpack.Decoder(type=decode_type)

    def receive_one(self, timeout: int = 1000) -> tuple[int, SampleBatch] | None:
        """Receive a single message with timeout.

        Returns ``(seq, decoded_payload)``, or ``None`` if nothing arrives
        within ``timeout`` milliseconds.
        """
        if not self.sub.poll(timeout):
            return None

        topic_bytes, seq_bytes, payload = self.sub.recv_multipart()
        assert topic_bytes == self.topic_bytes

        # The sequence number travels as a big-endian integer frame.
        seq = int.from_bytes(seq_bytes, "big")
        data = self.decoder.decode(payload)
        self.last_seq = seq
        self.received_msgs.append((seq, data))
        return seq, data

    def request_replay(self, start_seq: int, socket_idx: int = 0) -> None:
        """Request replay of messages starting from start_seq."""
        if not self.replay_sockets:
            raise ValueError("Replay sockets not initialized")
        if socket_idx >= len(self.replay_sockets):
            raise ValueError(f"Invalid socket index {socket_idx}")

        self.replay_sockets[socket_idx].send(start_seq.to_bytes(8, "big"))

    def receive_replay(self, socket_idx: int = 0) -> list[tuple[int, SampleBatch]]:
        """Receive replayed messages from a specific replay socket.

        Drains until the end-of-replay marker (an empty final frame), a
        1-second poll timeout, or a ZMQ error.
        """
        if not self.replay_sockets:
            raise ValueError("Replay sockets not initialized")
        if socket_idx >= len(self.replay_sockets):
            raise ValueError(f"Invalid socket index {socket_idx}")

        replay_socket = self.replay_sockets[socket_idx]
        replayed: list[tuple[int, SampleBatch]] = []
        while True:
            try:
                if not replay_socket.poll(1000):
                    break

                frames = replay_socket.recv_multipart()
                if not frames or not frames[-1]:
                    # End of replay marker
                    break

                seq_bytes, payload = frames
                seq = int.from_bytes(seq_bytes, "big")
                data = self.decoder.decode(payload)
                replayed.append((seq, data))
            except zmq.ZMQError:
                # Treat socket errors as end-of-stream; callers assert on
                # whatever was collected so far.
                break

        return replayed

    def close(self):
        """Close the SUB and replay sockets (the shared context is kept)."""
        self.sub.close()
        for replay in self.replay_sockets:
            replay.close()
|
||||
49
tests/distributed/eplb_utils.py
Normal file
49
tests/distributed/eplb_utils.py
Normal file
@@ -0,0 +1,49 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
import random
|
||||
|
||||
import torch
|
||||
import torch.multiprocessing as mp
|
||||
|
||||
from vllm.distributed.parallel_state import (
|
||||
init_distributed_environment,
|
||||
)
|
||||
from vllm.utils.system_utils import update_environment_variables
|
||||
|
||||
mp.set_start_method("spawn", force=True)
|
||||
|
||||
|
||||
def distributed_run(fn, world_size, *args):
    """Spawn ``world_size`` worker processes running ``fn`` and wait for all.

    Each worker receives an env dict describing its rank plus the extra
    ``args``; every worker must exit with status 0.
    """
    workers: list[mp.Process] = []
    for rank in range(world_size):
        env: dict[str, str] = {
            "RANK": str(rank),
            "LOCAL_RANK": str(rank),
            "WORLD_SIZE": str(world_size),
            "LOCAL_WORLD_SIZE": str(world_size),
            "MASTER_ADDR": "localhost",
            "MASTER_PORT": "12345",
        }
        proc = mp.Process(target=fn, args=(env, world_size, *args))
        workers.append(proc)
        proc.start()

    # Wait for every worker before checking any exit code, so a failure
    # report includes all ranks.
    for proc in workers:
        proc.join()

    for proc in workers:
        assert proc.exitcode == 0
|
||||
|
||||
|
||||
def set_env_vars_and_device(env: dict[str, str]) -> None:
    """Worker-side setup: apply env vars, pin the CUDA device, init dist.

    Intended to run first inside each process spawned by
    ``distributed_run``; ``env`` carries RANK/LOCAL_RANK/MASTER_* values.
    """
    update_environment_variables(env)
    local_rank = os.environ["LOCAL_RANK"]
    # Bind this process to its own GPU before initializing distributed.
    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)
    init_distributed_environment()

    # Ensure each worker process has the same random seed
    random.seed(42)
    torch.manual_seed(42)
|
||||
@@ -1,59 +0,0 @@
|
||||
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
|
||||
vLLM will allocate all the available memory, so we need to run the tests one
|
||||
by one. The solution is to pass arguments (model name) by environment
|
||||
variables.
|
||||
Run:
|
||||
```sh
|
||||
TEST_DIST_MODEL=facebook/opt-125m pytest \
|
||||
test_basic_distributed_correctness.py
|
||||
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
|
||||
test_basic_distributed_correctness.py
|
||||
```
|
||||
"""
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
# The model under test is injected via the TEST_DIST_MODEL environment
# variable so each model runs in a fresh process (see module docstring).
MODELS = [
    os.environ["TEST_DIST_MODEL"],
]
# Name of the env var that selects the attention backend.
VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"


@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Greedy-decode the same prompts with HF and TP=2 vLLM and compare."""
    enforce_eager = False
    backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
    if backend_by_env_var == "FLASHINFER":
        # NOTE(review): FlashInfer appears to require eager mode here —
        # confirm whether this workaround is still needed.
        enforce_eager = True

    # Run the HF reference first and free it before vLLM claims GPU memory.
    hf_model = hf_runner(model, dtype=dtype)
    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
    del hf_model

    vllm_model = vllm_runner(model,
                             dtype=dtype,
                             tensor_parallel_size=2,
                             enforce_eager=enforce_eager)
    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
    del vllm_model

    # Under greedy sampling the outputs must match token-for-token.
    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
        vllm_output_ids, vllm_output_str = vllm_outputs[i]
        assert hf_output_str == vllm_output_str, (
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
|
||||
64
tests/distributed/test_ca_buffer_sharing.py
Normal file
64
tests/distributed/test_ca_buffer_sharing.py
Normal file
@@ -0,0 +1,64 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# can only run on machines with p2p access across GPUs
|
||||
# can only run with torchrun:
|
||||
# torchrun --nproc_per_node=2 tests/distributed/test_ca_buffer_sharing.py
|
||||
|
||||
import ctypes
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
|
||||
from vllm.distributed.device_communicators.custom_all_reduce import ( # noqa
|
||||
CustomAllreduce,
|
||||
)
|
||||
|
||||
# create a cpu process group for communicating metadata (ipc handle)
dist.init_process_group(backend="gloo")
rank = local_rank = dist.get_rank()
world_size = dist.get_world_size()

# every process sets its own device (differently)
lib = CudaRTLibrary()
lib.cudaSetDevice(rank)

buffer_size_in_bytes = 1024
byte_value = 2  # the value we write to the buffer for verification

# Allocate one shared (IPC-mapped) buffer per rank; `pointers` holds the
# device addresses of every rank's buffer as seen by this process.
pointers = CustomAllreduce.create_shared_buffer(buffer_size_in_bytes)

print(f"Rank {rank} has pointers {pointers}")

# Synchronize all ranks before rank 0 writes, so no rank reads early.
dist.barrier()
torch.cuda.synchronize()

if rank == 0:
    # the first rank tries to write to all buffers
    for p in pointers:
        pointer = ctypes.c_void_p(p)
        lib.cudaMemset(pointer, byte_value, buffer_size_in_bytes)

dist.barrier()
torch.cuda.synchronize()

# Host-side staging buffer for reading device memory back.
host_data = (ctypes.c_char * buffer_size_in_bytes)()

# all ranks read from all buffers, and check if the data is correct
for p in pointers:
    pointer = ctypes.c_void_p(p)
    lib.cudaMemcpy(host_data, pointer, buffer_size_in_bytes)
    for i in range(buffer_size_in_bytes):
        assert ord(host_data[i]) == byte_value, (
            f"Rank {rank} failed"
            f" to verify buffer {p}. Expected {byte_value}, "
            f"got {ord(host_data[i])}"
        )

print(f"Rank {rank} verified all buffers")

dist.barrier()
torch.cuda.synchronize()

# Unmap/free the IPC buffers before the process group tears down.
CustomAllreduce.free_shared_buffer(pointers)
|
||||
@@ -1,66 +0,0 @@
|
||||
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
|
||||
vLLM will allocate all the available memory, so we need to run the tests one
|
||||
by one. The solution is to pass arguments (model name) by environment
|
||||
variables.
|
||||
|
||||
Run:
|
||||
```sh
|
||||
TEST_DIST_MODEL=facebook/opt-125m pytest \
|
||||
test_chunked_prefill_distributed.py
|
||||
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
|
||||
test_chunked_prefill_distributed.py
|
||||
```
|
||||
"""
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
# The model under test is injected via the TEST_DIST_MODEL environment
# variable so each model runs in a fresh process (see module docstring).
MODELS = [
    os.environ["TEST_DIST_MODEL"],
]


@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    chunked_prefill_token_size: int,
) -> None:
    """Compare HF vs TP=2 vLLM greedy outputs with chunked prefill on."""
    # Add a chunked prefill config.
    max_num_seqs = min(chunked_prefill_token_size, 256)
    assert chunked_prefill_token_size != -1
    enable_chunked_prefill = True
    max_num_batched_tokens = chunked_prefill_token_size

    # Run the HF reference first and free it before vLLM claims GPU memory.
    hf_model = hf_runner(model, dtype=dtype)
    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
    del hf_model

    vllm_model = vllm_runner(
        model,
        dtype=dtype,
        tensor_parallel_size=2,
        max_num_seqs=max_num_seqs,
        enable_chunked_prefill=enable_chunked_prefill,
        max_num_batched_tokens=max_num_batched_tokens,
    )
    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
    del vllm_model

    # Under greedy sampling the outputs must match token-for-token.
    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
        vllm_output_ids, vllm_output_str = vllm_outputs[i]
        assert hf_output_str == vllm_output_str, (
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
|
||||
@@ -1,53 +1,105 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Test the communication operators.
|
||||
|
||||
Run `pytest tests/distributed/test_comm_ops.py`.
|
||||
"""
|
||||
import os
|
||||
|
||||
from collections.abc import Callable
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
import torch
|
||||
|
||||
from vllm.distributed import (broadcast_tensor_dict,
|
||||
tensor_model_parallel_all_gather,
|
||||
tensor_model_parallel_all_reduce)
|
||||
from vllm.test_utils import (init_test_distributed_environment,
|
||||
multi_process_tensor_parallel)
|
||||
from vllm.distributed import (
|
||||
broadcast_tensor_dict,
|
||||
get_pp_group,
|
||||
tensor_model_parallel_all_gather,
|
||||
tensor_model_parallel_all_reduce,
|
||||
tensor_model_parallel_reduce_scatter,
|
||||
)
|
||||
|
||||
from ..utils import (
|
||||
init_test_distributed_environment,
|
||||
multi_gpu_test,
|
||||
multi_process_parallel,
|
||||
)
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
|
||||
distributed_init_port: str):
|
||||
def all_reduce_test_worker(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tp_size: int,
|
||||
pp_size: int,
|
||||
rank: int,
|
||||
distributed_init_port: str,
|
||||
):
|
||||
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
|
||||
# so that each worker can see all the GPUs
|
||||
# they will be able to set the device to the correct GPU
|
||||
del os.environ["CUDA_VISIBLE_DEVICES"]
|
||||
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(1, tensor_parallel_size, rank,
|
||||
distributed_init_port)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
|
||||
num_elements = 8
|
||||
all_tensors = [
|
||||
torch.arange(num_elements, dtype=torch.float32, device="cuda") *
|
||||
(r + 1) for r in range(tensor_parallel_size)
|
||||
torch.arange(num_elements, dtype=torch.float32, device="cuda") * (r + 1)
|
||||
for r in range(tp_size)
|
||||
]
|
||||
expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
|
||||
t = all_tensors[rank]
|
||||
t = all_tensors[rank % tp_size]
|
||||
t = tensor_model_parallel_all_reduce(t)
|
||||
assert torch.allclose(t, expected)
|
||||
torch.testing.assert_close(t, expected)
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def all_gather_test_worker(tensor_parallel_size: int, rank: int,
|
||||
distributed_init_port: str):
|
||||
def reduce_scatter_test_worker(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tp_size: int,
|
||||
pp_size: int,
|
||||
rank: int,
|
||||
distributed_init_port: str,
|
||||
):
|
||||
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
|
||||
# so that each worker can see all the GPUs
|
||||
# they will be able to set the device to the correct GPU
|
||||
del os.environ["CUDA_VISIBLE_DEVICES"]
|
||||
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(1, tensor_parallel_size, rank,
|
||||
distributed_init_port)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
|
||||
|
||||
num_elements = 8
|
||||
all_tensors = [
|
||||
torch.arange(num_elements, dtype=torch.float32, device="cuda") * (r + 1)
|
||||
for r in range(tp_size)
|
||||
]
|
||||
|
||||
index = rank % tp_size
|
||||
partition_size = num_elements // tp_size
|
||||
all_reduce = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
|
||||
expected = all_reduce[index * partition_size : (index + 1) * partition_size]
|
||||
t = all_tensors[index]
|
||||
t = tensor_model_parallel_reduce_scatter(t, 0)
|
||||
torch.testing.assert_close(t, expected)
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def all_gather_test_worker(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tp_size: int,
|
||||
pp_size: int,
|
||||
rank: int,
|
||||
distributed_init_port: str,
|
||||
):
|
||||
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
|
||||
# so that each worker can see all the GPUs
|
||||
# they will be able to set the device to the correct GPU
|
||||
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
|
||||
num_dimensions = 3
|
||||
tensor_size = list(range(2, num_dimensions + 2))
|
||||
total_size = 1
|
||||
@@ -55,56 +107,169 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
|
||||
total_size *= s
|
||||
for all_gather_dimension in range(num_dimensions):
|
||||
all_tensors = [
|
||||
torch.arange(total_size, dtype=torch.float32,
|
||||
device="cuda").reshape(tensor_size) * (r + 1)
|
||||
for r in range(tensor_parallel_size)
|
||||
torch.arange(total_size, dtype=torch.float32, device="cuda").reshape(
|
||||
tensor_size
|
||||
)
|
||||
* (r + 1)
|
||||
for r in range(tp_size)
|
||||
]
|
||||
expected = torch.cat(all_tensors, dim=all_gather_dimension)
|
||||
t = all_tensors[rank]
|
||||
t = all_tensors[rank % tp_size]
|
||||
t = tensor_model_parallel_all_gather(t, all_gather_dimension)
|
||||
assert torch.allclose(t, expected)
|
||||
torch.testing.assert_close(t, expected)
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def broadcast_tensor_dict_test_worker(tensor_parallel_size: int, rank: int,
|
||||
distributed_init_port: str):
|
||||
def broadcast_tensor_dict_test_worker(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tp_size: int,
|
||||
pp_size: int,
|
||||
rank: int,
|
||||
distributed_init_port: str,
|
||||
):
|
||||
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
|
||||
# so that each worker can see all the GPUs
|
||||
# they will be able to set the device to the correct GPU
|
||||
del os.environ["CUDA_VISIBLE_DEVICES"]
|
||||
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(1, tensor_parallel_size, rank,
|
||||
distributed_init_port)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
|
||||
test_dict = {
|
||||
# device tensor
|
||||
"a": torch.arange(8, dtype=torch.float32, device="cuda"),
|
||||
"b": torch.arange(16, dtype=torch.int8, device="cuda"),
|
||||
# CPU tensor
|
||||
"b": torch.arange(16, dtype=torch.int8, device="cpu"),
|
||||
"c": "test",
|
||||
"d": [1, 2, 3],
|
||||
"e": {
|
||||
"a": 1,
|
||||
"b": 2
|
||||
},
|
||||
"e": {"a": 1, "b": 2},
|
||||
# empty tensor
|
||||
"f": torch.tensor([], dtype=torch.float32, device="cuda"),
|
||||
}
|
||||
|
||||
if rank == 0:
|
||||
if (rank % tp_size) == 0:
|
||||
broadcast_tensor_dict(test_dict, src=0)
|
||||
else:
|
||||
recv_dict = broadcast_tensor_dict(src=0)
|
||||
assert len(recv_dict) == len(test_dict)
|
||||
assert torch.allclose(recv_dict["a"], test_dict["a"])
|
||||
assert torch.allclose(recv_dict["b"], test_dict["b"])
|
||||
torch.testing.assert_close(recv_dict["a"], test_dict["a"])
|
||||
torch.testing.assert_close(recv_dict["b"], test_dict["b"])
|
||||
assert recv_dict["c"] == test_dict["c"]
|
||||
assert recv_dict["d"] == test_dict["d"]
|
||||
assert recv_dict["e"] == test_dict["e"]
|
||||
torch.testing.assert_close(recv_dict["f"], test_dict["f"])
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.parametrize("tensor_parallel_size", [2])
|
||||
@pytest.mark.parametrize("test_target", [
|
||||
all_reduce_test_worker, all_gather_test_worker,
|
||||
broadcast_tensor_dict_test_worker
|
||||
])
|
||||
def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
|
||||
multi_process_tensor_parallel(tensor_parallel_size, test_target)
|
||||
@ray.remote(num_gpus=1, max_calls=1)
def send_recv_tensor_dict_test_worker(
    monkeypatch: pytest.MonkeyPatch,
    tp_size: int,
    pp_size: int,
    rank: int,
    distributed_init_port: str,
):
    """Pipeline-parallel worker: forward a mixed tensor dict and verify it.

    Each non-first rank receives the dict from its PP predecessor; each
    non-last rank sends it on. Receivers check every entry round-tripped.
    """
    # Let the worker see all GPUs so it can pin itself by rank.
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)

    # Mixed payload: GPU/CPU tensors, plain Python values, and an empty
    # tensor, to exercise all serialization paths.
    test_dict = {
        # device tensor
        "a": torch.arange(8, dtype=torch.float32, device="cuda"),
        # CPU tensor
        "b": torch.arange(16, dtype=torch.int8, device="cpu"),
        "c": "test",
        "d": [1, 2, 3],
        "e": {"a": 1, "b": 2},
        # empty tensor
        "f": torch.tensor([], dtype=torch.float32, device="cuda"),
    }

    if not get_pp_group().is_first_rank:
        recv_dict = get_pp_group().recv_tensor_dict()

    if not get_pp_group().is_last_rank:
        get_pp_group().send_tensor_dict(test_dict)

    if not get_pp_group().is_first_rank:
        assert len(recv_dict) == len(test_dict)
        torch.testing.assert_close(recv_dict["a"], test_dict["a"])
        torch.testing.assert_close(recv_dict["b"], test_dict["b"])
        assert recv_dict["c"] == test_dict["c"]
        assert recv_dict["d"] == test_dict["d"]
        assert recv_dict["e"] == test_dict["e"]
        torch.testing.assert_close(recv_dict["f"], test_dict["f"])
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
def send_recv_test_worker(
    monkeypatch: pytest.MonkeyPatch,
    tp_size: int,
    pp_size: int,
    rank: int,
    distributed_init_port: str,
):
    """Pipeline-parallel worker: forward a single tensor and verify it."""
    # Let the worker see all GPUs so it can pin itself by rank.
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)

    size = 64
    test_tensor = torch.arange(64, dtype=torch.float32, device="cuda")

    # Non-first ranks receive from the PP predecessor; non-last ranks
    # forward the reference tensor to the successor.
    if not get_pp_group().is_first_rank:
        recv_tensor = get_pp_group().recv(size, dtype=torch.float32)

    if not get_pp_group().is_last_rank:
        get_pp_group().send(test_tensor)

    if not get_pp_group().is_first_rank:
        torch.testing.assert_close(test_tensor, recv_tensor)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize(
    "test_target",
    [all_reduce_test_worker, all_gather_test_worker, broadcast_tensor_dict_test_worker],
)
def test_multi_process_tensor_parallel(
    monkeypatch: pytest.MonkeyPatch,
    tp_size: int,
    test_target: Callable[..., Any],
):
    """Run a TP-only (pp_size=1) communication worker across 2 GPUs."""
    multi_process_parallel(monkeypatch, tp_size, 1, test_target)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize(
    "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker]
)
def test_multi_process_pipeline_parallel(
    monkeypatch: pytest.MonkeyPatch,
    pp_size: int,
    test_target: Callable[..., Any],
):
    """Run a PP-only (tp_size=1) send/recv worker across 2 GPUs."""
    multi_process_parallel(monkeypatch, 1, pp_size, test_target)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize(
    "test_target",
    [
        send_recv_test_worker,
        send_recv_tensor_dict_test_worker,
        all_reduce_test_worker,
        all_gather_test_worker,
        broadcast_tensor_dict_test_worker,
    ],
)
def test_multi_process_tensor_parallel_pipeline_parallel(
    tp_size: int,
    pp_size: int,
    test_target: Callable[..., Any],
    monkeypatch: pytest.MonkeyPatch,
):
    """Run every communication worker with TP=2 x PP=2 across 4 GPUs."""
    multi_process_parallel(monkeypatch, tp_size, pp_size, test_target)
|
||||
|
||||
296
tests/distributed/test_context_parallel.py
Normal file
296
tests/distributed/test_context_parallel.py
Normal file
@@ -0,0 +1,296 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
WARNING: This test runs in both single-node (4 GPUs) and multi-node
|
||||
(2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
|
||||
important to set the distributed backend to "mp" to avoid Ray scheduling
|
||||
all workers in a node other than the head node, which can cause the test
|
||||
to fail.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, NamedTuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k
|
||||
from tests.utils import RemoteOpenAIServer, create_new_process_for_each_test
|
||||
from vllm.config.model import RunnerOption
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from ..models.registry import HF_EXAMPLE_MODELS
|
||||
|
||||
logger = init_logger("test_context_parallel")
|
||||
|
||||
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
|
||||
|
||||
CP_TEST_MODELS = [
|
||||
# TODO support other models
|
||||
# [LANGUAGE GENERATION]
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat",
|
||||
"Qwen/Qwen2.5-1.5B-Instruct",
|
||||
]
|
||||
|
||||
# GSM8K eval configuration
|
||||
NUM_QUESTIONS = 256 # Fast eval for CI
|
||||
NUM_SHOTS = 5 # Few-shot examples
|
||||
# tp accuracy with 2% buffer
|
||||
MIN_ACCURACY = {
|
||||
# .buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat": 0.64,
|
||||
# .buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml
|
||||
"Qwen/Qwen2.5-1.5B-Instruct": 0.52,
|
||||
}
|
||||
|
||||
|
||||
class ParallelSetup(NamedTuple):
    """Parallelism degrees and execution flags for one CP test case."""

    tp_size: int  # tensor-parallel world size
    pp_size: int  # pipeline-parallel world size
    dcp_size: int  # decode-context-parallel size (derived from tp_size)
    cp_kv_cache_interleave_size: int  # KV-cache interleave granularity for CP
    eager_mode: bool  # pass --enforce-eager when True
    chunked_prefill: bool  # pass --enable-chunked-prefill when True
|
||||
|
||||
|
||||
class CPTestOptions(NamedTuple):
    """Per-test toggles that are not parallelism degrees."""

    multi_node_only: bool  # skip unless VLLM_MULTI_NODE is set
    attn_backend: str | None = None  # attention backend override, if any
|
||||
|
||||
|
||||
@dataclass
class CPTestSettings:
    """A matrix of context-parallel test configurations for one model."""

    # All (tp, pp, dcp, ...) combinations to exercise.
    parallel_setups: list[ParallelSetup]
    # Distributed executor backends to run each setup under (e.g. "mp").
    distributed_backends: list[str]
    # Runner option forwarded to the server ("auto" means no override).
    runner: RunnerOption
    # Extra toggles shared by every setup in this settings object.
    test_options: CPTestOptions

    @staticmethod
    def detailed(
        *,
        tp_base: int = 4,
        pp_base: int = 1,
        dcp_multipliers: list[float] | None = None,
        cp_kv_cache_interleave_size: int = 1,
        multi_node_only: bool = False,
        runner: RunnerOption = "auto",
        attn_backend: str | None = None,
    ):
        """Build the standard test matrix.

        ``dcp_size`` is derived as ``int(multiplier * tp_base)`` for each
        entry of ``dcp_multipliers`` (default: just 0.5). The single-item
        loops keep the nesting in place so more eager/pp/chunked-prefill
        variants can be re-enabled by widening the lists.
        """
        parallel_setups = []
        if dcp_multipliers is None:
            dcp_multipliers = [
                0.5,
            ]
        for eager_mode_val in [False]:
            for pp_multiplier in [1]:
                for dcp_multiplier in dcp_multipliers:
                    for chunked_prefill_val in [True]:
                        parallel_setups.append(
                            ParallelSetup(
                                tp_size=tp_base,
                                pp_size=pp_multiplier * pp_base,
                                dcp_size=int(dcp_multiplier * tp_base),
                                cp_kv_cache_interleave_size=cp_kv_cache_interleave_size,
                                eager_mode=eager_mode_val,
                                chunked_prefill=chunked_prefill_val,
                            )
                        )
        return CPTestSettings(
            parallel_setups=parallel_setups,
            distributed_backends=["mp"],
            runner=runner,
            test_options=CPTestOptions(
                multi_node_only=multi_node_only,
                attn_backend=attn_backend,
            ),
        )

    def iter_params(self, model_id: str):
        """Yield one pytest parameter tuple per (setup, backend) pair."""
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
            for backend in self.distributed_backends:
                yield (
                    model_id,
                    parallel_setup,
                    backend,
                    self.runner,
                    opts,
                )
|
||||
|
||||
|
||||
# Per-model context-parallel test matrix: each model id maps to the list of
# CPTestSettings variants exercised for it (different dcp sizes, interleave
# sizes, and attention backends).
CP_TEXT_GENERATION_MODELS = {
    "deepseek-ai/DeepSeek-V2-Lite-Chat": [
        CPTestSettings.detailed(dcp_multipliers=[1]),
        CPTestSettings.detailed(
            dcp_multipliers=[0.5],
            cp_kv_cache_interleave_size=64,
            attn_backend="FLASHMLA",
        ),
    ],
    "Qwen/Qwen2.5-1.5B-Instruct": [
        CPTestSettings.detailed(
            cp_kv_cache_interleave_size=16, attn_backend="FLASH_ATTN"
        ),
        CPTestSettings.detailed(
            cp_kv_cache_interleave_size=16, attn_backend="FLASHINFER"
        ),
    ],
}
|
||||
|
||||
|
||||
def _test_cp_gsm8k(
|
||||
model_id: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
runner: RunnerOption,
|
||||
test_options: CPTestOptions,
|
||||
num_gpus_available: int,
|
||||
*,
|
||||
method: Literal["generate"],
|
||||
is_multimodal: bool,
|
||||
):
|
||||
(
|
||||
tp_size,
|
||||
pp_size,
|
||||
dcp_size,
|
||||
cp_kv_cache_interleave_size,
|
||||
eager_mode,
|
||||
chunked_prefill,
|
||||
) = parallel_setup
|
||||
|
||||
multi_node_only, attn_backend = test_options
|
||||
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
trust_remote_code = model_info.trust_remote_code
|
||||
tokenizer_mode = model_info.tokenizer_mode
|
||||
hf_overrides = model_info.hf_overrides
|
||||
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
|
||||
if num_gpus_available < tp_size * pp_size:
|
||||
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
|
||||
if VLLM_MULTI_NODE and distributed_backend == "mp":
|
||||
pytest.skip(
|
||||
"Skipping multi-node pipeline parallel test for "
|
||||
"multiprocessing distributed backend"
|
||||
)
|
||||
if multi_node_only and not VLLM_MULTI_NODE:
|
||||
pytest.skip("Not in multi-node setting")
|
||||
|
||||
server_args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"4096",
|
||||
"--max-num-seqs",
|
||||
"64",
|
||||
]
|
||||
if chunked_prefill:
|
||||
server_args.append("--enable-chunked-prefill")
|
||||
if eager_mode:
|
||||
server_args.append("--enforce-eager")
|
||||
if runner != "auto":
|
||||
server_args.extend(["--runner", runner])
|
||||
if trust_remote_code:
|
||||
server_args.append("--trust-remote-code")
|
||||
if tokenizer_mode:
|
||||
server_args.extend(["--tokenizer-mode", tokenizer_mode])
|
||||
if hf_overrides:
|
||||
server_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
|
||||
|
||||
server_args.extend(
|
||||
[
|
||||
"--tensor-parallel-size",
|
||||
str(tp_size),
|
||||
"--pipeline-parallel-size",
|
||||
str(pp_size),
|
||||
"--decode-context-parallel-size",
|
||||
str(dcp_size),
|
||||
"--dcp-kv-cache-interleave-size",
|
||||
str(cp_kv_cache_interleave_size),
|
||||
"--distributed-executor-backend",
|
||||
distributed_backend,
|
||||
]
|
||||
)
|
||||
|
||||
server_env = {}
|
||||
if attn_backend:
|
||||
server_env["VLLM_ATTENTION_BACKEND"] = attn_backend
|
||||
|
||||
with RemoteOpenAIServer(
|
||||
model_id,
|
||||
server_args,
|
||||
env_dict=server_env,
|
||||
max_wait_seconds=720,
|
||||
) as remote_server:
|
||||
host = f"http://{remote_server.host}"
|
||||
port = remote_server.port
|
||||
|
||||
# Run GSM8K evaluation
|
||||
results = evaluate_gsm8k(
|
||||
num_questions=NUM_QUESTIONS,
|
||||
num_shots=NUM_SHOTS,
|
||||
host=host,
|
||||
port=port,
|
||||
)
|
||||
|
||||
# Validate accuracy is reasonable
|
||||
accuracy = results["accuracy"]
|
||||
min_accuracy = MIN_ACCURACY[model_id]
|
||||
assert accuracy >= min_accuracy, (
|
||||
f"TP+DCP accuracy too low: {accuracy:.3f} < {min_accuracy:.3f}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
(
|
||||
"model_id",
|
||||
"parallel_setup",
|
||||
"distributed_backend",
|
||||
"runner",
|
||||
"test_options",
|
||||
),
|
||||
[
|
||||
params
|
||||
for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
|
||||
for setting in settings
|
||||
for params in setting.iter_params(model_id)
|
||||
if model_id in CP_TEST_MODELS
|
||||
],
|
||||
)
|
||||
@create_new_process_for_each_test()
|
||||
def test_cp_generation(
|
||||
model_id: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
runner: RunnerOption,
|
||||
test_options: CPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
if (
|
||||
model_id == "deepseek-ai/DeepSeek-V2-Lite-Chat"
|
||||
and torch.cuda.get_device_capability() < (9, 0)
|
||||
):
|
||||
pytest.skip(reason="MLA+DCP requires compute capability of 9.0 or higher")
|
||||
if (
|
||||
model_id == "Qwen/Qwen2.5-1.5B-Instruct"
|
||||
and torch.cuda.get_device_capability() != (9, 0)
|
||||
):
|
||||
pytest.skip(reason="GQA+DCP currently requires compute capability of 9.0")
|
||||
|
||||
_test_cp_gsm8k(
|
||||
model_id,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate",
|
||||
is_multimodal=False,
|
||||
)
|
||||
@@ -1,4 +1,6 @@
|
||||
import os
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import random
|
||||
|
||||
import pytest
|
||||
@@ -6,10 +8,14 @@ import ray
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
from vllm.distributed import tensor_model_parallel_all_reduce
|
||||
from vllm.distributed.device_communicators import custom_all_reduce
|
||||
from vllm.test_utils import (init_test_distributed_environment,
|
||||
multi_process_tensor_parallel)
|
||||
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce # noqa
|
||||
from vllm.distributed.parallel_state import get_tp_group, graph_capture
|
||||
|
||||
from ..utils import (
|
||||
ensure_model_parallel_initialized,
|
||||
init_test_distributed_environment,
|
||||
multi_process_parallel,
|
||||
)
|
||||
|
||||
random.seed(42)
|
||||
test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)]
|
||||
@@ -18,67 +24,109 @@ for i, v in enumerate(test_sizes):
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def graph_allreduce(world_size, rank, distributed_init_port):
|
||||
del os.environ["CUDA_VISIBLE_DEVICES"]
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(1, world_size, rank,
|
||||
distributed_init_port)
|
||||
def graph_allreduce(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tp_size,
|
||||
pp_size,
|
||||
rank,
|
||||
distributed_init_port,
|
||||
):
|
||||
with monkeypatch.context() as m:
|
||||
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
|
||||
ensure_model_parallel_initialized(tp_size, pp_size)
|
||||
group = get_tp_group().device_group
|
||||
|
||||
custom_all_reduce.init_custom_all_reduce()
|
||||
for sz in test_sizes:
|
||||
for dtype in [torch.float32, torch.float16, torch.bfloat16]:
|
||||
with custom_all_reduce.capture():
|
||||
# use integers so result matches NCCL exactly
|
||||
inp1 = torch.randint(1,
|
||||
16, (sz, ),
|
||||
dtype=dtype,
|
||||
device=torch.cuda.current_device())
|
||||
inp2 = torch.randint(1,
|
||||
16, (sz, ),
|
||||
dtype=dtype,
|
||||
device=torch.cuda.current_device())
|
||||
torch.cuda.synchronize()
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(graph):
|
||||
out1 = tensor_model_parallel_all_reduce(inp1)
|
||||
# the input buffer is immediately modified to test
|
||||
# synchronization
|
||||
dist.all_reduce(inp1)
|
||||
out2 = tensor_model_parallel_all_reduce(inp2)
|
||||
dist.all_reduce(inp2)
|
||||
graph.replay()
|
||||
assert torch.allclose(out1, inp1)
|
||||
assert torch.allclose(out2, inp2)
|
||||
# A small all_reduce for warmup.
|
||||
# this is needed because device communicators might be created lazily
|
||||
# (e.g. NCCL). This will ensure that the communicator is initialized
|
||||
# before any communication happens, so that this group can be used for
|
||||
# graph capture immediately.
|
||||
data = torch.zeros(1)
|
||||
data = data.to(device=device)
|
||||
torch.distributed.all_reduce(data, group=group)
|
||||
torch.cuda.synchronize()
|
||||
del data
|
||||
|
||||
# we use the first group to communicate once
|
||||
# and the second group to communicate twice
|
||||
# and so on
|
||||
# this is used to demonstrate that each group can
|
||||
# communicate independently
|
||||
num_communication = rank // tp_size + 1
|
||||
|
||||
for sz in test_sizes:
|
||||
for dtype in [torch.float32, torch.float16, torch.bfloat16]:
|
||||
with graph_capture(device=device) as graph_capture_context:
|
||||
# use integers so result matches NCCL exactly
|
||||
inp1 = torch.randint(
|
||||
1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
|
||||
)
|
||||
inp2 = torch.randint(
|
||||
1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(graph, stream=graph_capture_context.stream):
|
||||
for i in range(num_communication):
|
||||
out1 = tensor_model_parallel_all_reduce(inp1)
|
||||
# the input buffer is immediately modified to test
|
||||
# synchronization
|
||||
dist.all_reduce(inp1, group=group)
|
||||
out2 = tensor_model_parallel_all_reduce(inp2)
|
||||
dist.all_reduce(inp2, group=group)
|
||||
graph.replay()
|
||||
torch.testing.assert_close(out1, inp1)
|
||||
torch.testing.assert_close(out2, inp2)
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def eager_allreduce(world_size, rank, distributed_init_port):
|
||||
del os.environ["CUDA_VISIBLE_DEVICES"]
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(1, world_size, rank,
|
||||
distributed_init_port)
|
||||
def eager_allreduce(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tp_size,
|
||||
pp_size,
|
||||
rank,
|
||||
distributed_init_port,
|
||||
):
|
||||
with monkeypatch.context() as m:
|
||||
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
|
||||
|
||||
sz = 1024
|
||||
custom_all_reduce.init_custom_all_reduce()
|
||||
fa = custom_all_reduce.get_handle()
|
||||
inp = torch.ones(sz, dtype=torch.float32, device=device)
|
||||
out = fa.all_reduce_unreg(inp)
|
||||
assert torch.allclose(out, inp * world_size)
|
||||
# we use the first group to communicate once
|
||||
# and the second group to communicate twice
|
||||
# and so on
|
||||
# this is used to demonstrate that each group can
|
||||
# communicate independently
|
||||
num_communication = rank // tp_size + 1
|
||||
sz = 1024
|
||||
fa = get_tp_group().device_communicator.ca_comm
|
||||
inp = torch.ones(sz, dtype=torch.float32, device=device)
|
||||
out = inp
|
||||
for _ in range(num_communication):
|
||||
out = fa.all_reduce(out, registered=False)
|
||||
torch.testing.assert_close(out, inp * (tp_size**num_communication))
|
||||
|
||||
inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
|
||||
out = fa.all_reduce_unreg(inp)
|
||||
assert torch.allclose(out, inp * world_size)
|
||||
inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
|
||||
out = inp
|
||||
for _ in range(num_communication):
|
||||
out = fa.all_reduce(out, registered=False)
|
||||
torch.testing.assert_close(out, inp * (tp_size**num_communication))
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.parametrize("tensor_parallel_size", [2])
|
||||
@pytest.mark.parametrize("tp_size", [2])
|
||||
@pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
|
||||
@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
|
||||
def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
|
||||
multi_process_tensor_parallel(tensor_parallel_size, test_target)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
multi_process_tensor_parallel(2, graph_allreduce)
|
||||
def test_custom_allreduce(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tp_size,
|
||||
pipeline_parallel_size,
|
||||
test_target,
|
||||
):
|
||||
world_size = tp_size * pipeline_parallel_size
|
||||
if world_size > torch.cuda.device_count():
|
||||
pytest.skip("Not enough GPUs to run the test.")
|
||||
multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target)
|
||||
|
||||
8
tests/distributed/test_distributed_oot.py
Normal file
8
tests/distributed/test_distributed_oot.py
Normal file
@@ -0,0 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from ..entrypoints.openai.test_oot_registration import run_and_test_dummy_opt_api_server
|
||||
|
||||
|
||||
def test_distributed_oot(dummy_opt_path: str):
|
||||
run_and_test_dummy_opt_api_server(dummy_opt_path, tp=2)
|
||||
312
tests/distributed/test_eplb_algo.py
Normal file
312
tests/distributed/test_eplb_algo.py
Normal file
@@ -0,0 +1,312 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.distributed.eplb.policy.default import DefaultEplbPolicy
|
||||
|
||||
|
||||
def test_basic_rebalance():
|
||||
"""Test basic rebalancing functionality"""
|
||||
# Example from https://github.com/deepseek-ai/eplb
|
||||
weight = torch.tensor(
|
||||
[
|
||||
[90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86],
|
||||
[20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27],
|
||||
]
|
||||
)
|
||||
|
||||
num_layers = weight.shape[0]
|
||||
num_replicas = 16
|
||||
num_groups = 4
|
||||
num_nodes = 2
|
||||
num_gpus = 8
|
||||
|
||||
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
# Verify output shapes
|
||||
assert phy2log.shape == (
|
||||
2,
|
||||
16,
|
||||
), f"Expected `phy2log` shape (2, 16), got {phy2log.shape}"
|
||||
assert log2phy.shape[0] == 2, (
|
||||
f"Expected `log2phy` first dimension 2, got {log2phy.shape[0]}"
|
||||
)
|
||||
assert log2phy.shape[1] == 12, (
|
||||
f"Expected `log2phy` second dimension 12, got {log2phy.shape[1]}"
|
||||
)
|
||||
assert logcnt.shape == (
|
||||
2,
|
||||
12,
|
||||
), f"Expected `logcnt` shape (2, 12), got {logcnt.shape}"
|
||||
|
||||
# Verify physical to logical expert mapping range is correct
|
||||
assert torch.all(phy2log >= 0) and torch.all(phy2log < 12), (
|
||||
"Physical to logical mapping should be in range [0, 12)"
|
||||
)
|
||||
|
||||
# Verify expert count reasonableness
|
||||
assert torch.all(logcnt >= 1), "Each logical expert should have at least 1 replica"
|
||||
assert torch.sum(logcnt, dim=1).sum() == num_replicas * num_layers, (
|
||||
f"Total replicas should be {num_replicas * num_layers}"
|
||||
)
|
||||
|
||||
# Verify expected output
|
||||
expected_phy2log = torch.tensor(
|
||||
[
|
||||
[5, 6, 5, 7, 8, 4, 3, 4, 10, 9, 10, 2, 0, 1, 11, 1],
|
||||
[7, 10, 6, 8, 6, 11, 8, 9, 2, 4, 5, 1, 5, 0, 3, 1],
|
||||
]
|
||||
)
|
||||
assert torch.all(phy2log == expected_phy2log)
|
||||
|
||||
expected_logcnt = torch.tensor(
|
||||
[[1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1], [1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1]]
|
||||
)
|
||||
assert torch.all(logcnt == expected_logcnt)
|
||||
|
||||
|
||||
def test_single_gpu_case():
|
||||
"""Test single GPU case"""
|
||||
weight = torch.tensor([[10, 20, 30, 40]])
|
||||
num_replicas = 4
|
||||
num_groups = 1
|
||||
num_nodes = 1
|
||||
num_gpus = 1
|
||||
|
||||
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
# Verify shapes
|
||||
assert phy2log.shape == (1, 4)
|
||||
assert log2phy.shape[0] == 1
|
||||
assert log2phy.shape[1] == 4
|
||||
assert logcnt.shape == (1, 4)
|
||||
|
||||
# Verify all logical experts are mapped
|
||||
assert set(phy2log[0].tolist()) == {0, 1, 2, 3}
|
||||
|
||||
|
||||
def test_equal_weights():
|
||||
"""Test case with equal weights"""
|
||||
weight = torch.tensor([[50, 50, 50, 50, 50, 50, 50, 50]])
|
||||
num_replicas = 8
|
||||
num_groups = 2
|
||||
num_nodes = 2
|
||||
num_gpus = 4
|
||||
|
||||
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
# Verify shapes
|
||||
assert phy2log.shape == (1, 8)
|
||||
assert logcnt.shape == (1, 8)
|
||||
|
||||
# With equal weights, each expert should have exactly one replica
|
||||
assert torch.all(logcnt == 1), (
|
||||
"With equal weights and no replication, "
|
||||
"each expert should have exactly 1 replica"
|
||||
)
|
||||
|
||||
|
||||
def test_extreme_weight_imbalance():
|
||||
"""Test extreme weight imbalance case"""
|
||||
weight = torch.tensor([[1000, 1, 1, 1, 1, 1, 1, 1]])
|
||||
num_replicas = 12
|
||||
num_groups = 2
|
||||
num_nodes = 2
|
||||
num_gpus = 4
|
||||
|
||||
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
# Verify shapes
|
||||
assert phy2log.shape == (1, 12)
|
||||
assert logcnt.shape == (1, 8)
|
||||
|
||||
# Expert with highest weight (index 0) should have more replicas
|
||||
assert logcnt[0, 0] > logcnt[0, 1], (
|
||||
"Expert with highest weight should have more replicas"
|
||||
)
|
||||
|
||||
|
||||
def test_multiple_layers():
|
||||
"""Test multiple layers case"""
|
||||
weight = torch.tensor(
|
||||
[
|
||||
[10, 20, 30, 40, 50, 60], # First layer
|
||||
[60, 50, 40, 30, 20, 10], # Second layer (opposite weight pattern)
|
||||
[25, 25, 25, 25, 25, 25], # Third layer (equal weights)
|
||||
]
|
||||
)
|
||||
num_replicas = 8
|
||||
num_groups = 2
|
||||
num_nodes = 2
|
||||
num_gpus = 4
|
||||
|
||||
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
# Verify shapes
|
||||
assert phy2log.shape == (3, 8)
|
||||
assert logcnt.shape == (3, 6)
|
||||
|
||||
# Verify expert allocation is reasonable for each layer
|
||||
for layer in range(3):
|
||||
assert torch.all(phy2log[layer] >= 0) and torch.all(phy2log[layer] < 6), (
|
||||
f"Layer {layer} physical to logical mappingshould be in range [0, 6)"
|
||||
)
|
||||
assert torch.sum(logcnt[layer]) == num_replicas, (
|
||||
f"Layer {layer} total replicas should be {num_replicas}"
|
||||
)
|
||||
|
||||
|
||||
def test_parameter_validation():
|
||||
"""Test parameter validation"""
|
||||
weight = torch.tensor([[10, 20, 30, 40]])
|
||||
|
||||
# Test non-divisible case - this should handle normally without throwing
|
||||
# errors because the function will fall back to global load balancing
|
||||
# strategy
|
||||
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(weight, 8, 3, 2, 4)
|
||||
assert phy2log.shape == (1, 8)
|
||||
assert logcnt.shape == (1, 4)
|
||||
|
||||
# Test cases that will actually cause errors:
|
||||
# num_physical_experts not divisible by num_gpus
|
||||
with pytest.raises(AssertionError):
|
||||
DefaultEplbPolicy.rebalance_experts(weight, 7, 2, 2, 4) # 7 not divisible by 4
|
||||
|
||||
|
||||
def test_small_scale_hierarchical():
|
||||
"""Test small-scale hierarchical load balancing"""
|
||||
weight = torch.tensor(
|
||||
[
|
||||
[100, 50, 200, 75, 150, 25, 300, 80], # 8 experts
|
||||
]
|
||||
)
|
||||
num_replicas = 12
|
||||
num_groups = 4 # 4 groups, 2 experts each
|
||||
num_nodes = 2 # 2 nodes
|
||||
num_gpus = 4 # 4 GPUs
|
||||
|
||||
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
# Verify basic constraints
|
||||
assert phy2log.shape == (1, 12)
|
||||
assert logcnt.shape == (1, 8)
|
||||
assert torch.sum(logcnt) == num_replicas
|
||||
assert torch.all(logcnt >= 1)
|
||||
|
||||
# Expert with highest weight should have more replicas
|
||||
max_weight_expert = torch.argmax(weight[0])
|
||||
assert logcnt[0, max_weight_expert] >= 2, (
|
||||
"Highest weight expert should have multiple replicas"
|
||||
)
|
||||
|
||||
|
||||
def test_global_load_balance_fallback():
|
||||
"""Test global load balancing fallback case"""
|
||||
# When num_groups % num_nodes != 0, should fall back to global load
|
||||
# balancing
|
||||
weight = torch.tensor([[10, 20, 30, 40, 50, 60]])
|
||||
num_replicas = 8
|
||||
num_groups = 3 # Cannot be divided evenly by num_nodes=2
|
||||
num_nodes = 2
|
||||
num_gpus = 4
|
||||
|
||||
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
# Should work normally, just using global load balancing strategy
|
||||
assert phy2log.shape == (1, 8)
|
||||
assert logcnt.shape == (1, 6)
|
||||
assert torch.sum(logcnt) == num_replicas
|
||||
|
||||
|
||||
@pytest.mark.parametrize("device", ["cpu", "cuda"])
|
||||
def test_device_compatibility(device):
|
||||
"""Test device compatibility"""
|
||||
if device == "cuda" and not torch.cuda.is_available():
|
||||
pytest.skip("CUDA not available")
|
||||
|
||||
weight = torch.tensor([[10, 20, 30, 40]], device=device)
|
||||
num_replicas = 6
|
||||
num_groups = 2
|
||||
num_nodes = 1
|
||||
num_gpus = 2
|
||||
|
||||
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
# Function will convert to CPU internally, but should handle different
|
||||
# device inputs normally
|
||||
assert phy2log.shape == (1, 6)
|
||||
assert logcnt.shape == (1, 4)
|
||||
|
||||
|
||||
def test_additional_cases():
|
||||
"""Test more edge cases and different parameter combinations"""
|
||||
|
||||
# Test case 1: Large-scale distributed setup
|
||||
weight1 = torch.tensor(
|
||||
[[50, 100, 75, 120, 90, 60, 80, 110, 40, 70, 95, 85, 65, 55, 45, 35]]
|
||||
)
|
||||
phy2log1, log2phy1, logcnt1 = DefaultEplbPolicy.rebalance_experts(
|
||||
weight1, 24, 8, 4, 8
|
||||
)
|
||||
|
||||
assert phy2log1.shape == (1, 24)
|
||||
assert logcnt1.shape == (1, 16)
|
||||
assert torch.sum(logcnt1) == 24
|
||||
|
||||
# Test case 2: Different weight distributions
|
||||
weight2 = torch.tensor(
|
||||
[
|
||||
[200, 150, 100, 50, 25, 12], # Decreasing weights
|
||||
[12, 25, 50, 100, 150, 200], # Increasing weights
|
||||
]
|
||||
)
|
||||
phy2log2, log2phy2, logcnt2 = DefaultEplbPolicy.rebalance_experts(
|
||||
weight2, 10, 3, 1, 2
|
||||
)
|
||||
|
||||
assert phy2log2.shape == (2, 10)
|
||||
assert logcnt2.shape == (2, 6)
|
||||
|
||||
# Verify high-weight experts have more replicas
|
||||
for layer in range(2):
|
||||
max_weight_idx = torch.argmax(weight2[layer])
|
||||
assert logcnt2[layer, max_weight_idx] >= 2
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
weight = torch.tensor(
|
||||
[
|
||||
[90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86],
|
||||
[20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27],
|
||||
]
|
||||
)
|
||||
|
||||
num_replicas = 16
|
||||
num_groups = 4
|
||||
num_nodes = 2
|
||||
num_gpus = 8
|
||||
|
||||
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
print(phy2log)
|
||||
|
||||
test_basic_rebalance()
|
||||
607
tests/distributed/test_eplb_execute.py
Normal file
607
tests/distributed/test_eplb_execute.py
Normal file
@@ -0,0 +1,607 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import random
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.distributed
|
||||
|
||||
from vllm.distributed.eplb.rebalance_execute import (
|
||||
move_from_buffer,
|
||||
rearrange_expert_weights_inplace,
|
||||
transfer_layer,
|
||||
)
|
||||
from vllm.distributed.parallel_state import (
|
||||
ensure_model_parallel_initialized,
|
||||
get_tp_group,
|
||||
)
|
||||
|
||||
from .eplb_utils import distributed_run, set_env_vars_and_device
|
||||
|
||||
|
||||
def create_expert_indices_with_redundancy(
|
||||
num_layers: int,
|
||||
num_logical_experts: int,
|
||||
total_physical_experts: int,
|
||||
redundancy_config: list[int], # redundancy for each logical expert
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create expert indices with redundancy.
|
||||
|
||||
Args:
|
||||
num_layers: number of layers
|
||||
num_logical_experts: number of logical experts
|
||||
total_physical_experts: total number of physical experts
|
||||
redundancy_config: redundancy for each logical expert
|
||||
|
||||
Returns:
|
||||
indices: Shape (num_layers, total_physical_experts)
|
||||
"""
|
||||
assert sum(redundancy_config) == total_physical_experts
|
||||
assert len(redundancy_config) == num_logical_experts
|
||||
|
||||
indices = torch.zeros(num_layers, total_physical_experts, dtype=torch.long)
|
||||
|
||||
for layer in range(num_layers):
|
||||
physical_pos = 0
|
||||
for logical_expert_id, redundancy in enumerate(redundancy_config):
|
||||
for _ in range(redundancy):
|
||||
indices[layer, physical_pos] = logical_expert_id
|
||||
physical_pos += 1
|
||||
|
||||
# Shuffle the indices at dim 1
|
||||
for layer in range(num_layers):
|
||||
indices[layer] = indices[layer][torch.randperm(indices.shape[1])]
|
||||
|
||||
return indices
|
||||
|
||||
|
||||
def create_expert_weights(
|
||||
num_layers: int,
|
||||
num_local_experts: int,
|
||||
hidden_sizes: list[int],
|
||||
rank: int,
|
||||
device: torch.device,
|
||||
physical_to_logical_mapping: torch.Tensor,
|
||||
) -> list[list[torch.Tensor]]:
|
||||
"""
|
||||
Create fake expert weights tensor for testing.
|
||||
|
||||
Use `arange` to generate predictable weights values, based on logical
|
||||
expert ID.
|
||||
All replicas of the same logical expert should have the same weights.
|
||||
|
||||
Args:
|
||||
physical_to_logical_mapping: Shape (num_layers, num_local_experts)
|
||||
mapping[layer, physical_pos] = logical_expert_id
|
||||
"""
|
||||
expert_weights = []
|
||||
|
||||
for layer in range(num_layers):
|
||||
layer_weights = []
|
||||
for weight_idx, hidden_size in enumerate(hidden_sizes):
|
||||
weight_tensor = torch.zeros(
|
||||
num_local_experts, hidden_size, device=device, dtype=torch.float32
|
||||
)
|
||||
|
||||
for local_expert in range(num_local_experts):
|
||||
# Get the logical expert ID for this physical expert
|
||||
global_pos = rank * num_local_experts + local_expert
|
||||
logical_expert_id = physical_to_logical_mapping[
|
||||
layer, global_pos
|
||||
].item()
|
||||
|
||||
# Generate weights based on logical expert ID
|
||||
# (so that all replicas of the same logical expert have the
|
||||
# same weights)
|
||||
base_value = logical_expert_id * 1000 + layer * 100 + weight_idx * 10
|
||||
weight_tensor[local_expert] = torch.arange(
|
||||
base_value,
|
||||
base_value + hidden_size,
|
||||
device=device,
|
||||
dtype=torch.float32,
|
||||
)
|
||||
|
||||
layer_weights.append(weight_tensor)
|
||||
expert_weights.append(layer_weights)
|
||||
|
||||
return expert_weights
|
||||
|
||||
|
||||
def create_redundancy_config(
|
||||
num_logical_experts: int,
|
||||
num_physical_experts: int,
|
||||
) -> list[int]:
|
||||
"""Create a redundancy configuration."""
|
||||
redundancy_config = [1] * num_logical_experts
|
||||
remaining = num_physical_experts - num_logical_experts
|
||||
# Randomly assign the remaining physical experts to the logical experts
|
||||
for _ in range(remaining):
|
||||
redundancy_config[random.choice(range(num_logical_experts))] += 1
|
||||
return redundancy_config
|
||||
|
||||
|
||||
def verify_expert_weights_after_shuffle(
|
||||
expert_weights: list[list[torch.Tensor]],
|
||||
new_indices: torch.Tensor,
|
||||
hidden_sizes: list[int],
|
||||
ep_rank: int,
|
||||
num_local_experts: int,
|
||||
):
|
||||
"""Verify the weights after shuffling are correct."""
|
||||
num_layers = len(expert_weights)
|
||||
|
||||
for layer in range(num_layers):
|
||||
for weight_idx, hidden_size in enumerate(hidden_sizes):
|
||||
weight_tensor = expert_weights[layer][weight_idx]
|
||||
|
||||
for local_expert in range(num_local_experts):
|
||||
# Calculate the global expert ID for this local expert
|
||||
global_pos = ep_rank * num_local_experts + local_expert
|
||||
expected_logical_expert = new_indices[layer, global_pos].item()
|
||||
|
||||
# Check if the weights are correct
|
||||
actual_weights = weight_tensor[local_expert]
|
||||
expected_base = (
|
||||
expected_logical_expert * 1000 + layer * 100 + weight_idx * 10
|
||||
)
|
||||
expected_weights = torch.arange(
|
||||
expected_base,
|
||||
expected_base + hidden_size,
|
||||
device=actual_weights.device,
|
||||
dtype=actual_weights.dtype,
|
||||
)
|
||||
|
||||
torch.testing.assert_close(
|
||||
actual_weights,
|
||||
expected_weights,
|
||||
msg=f"Layer {layer}, weight {weight_idx},"
|
||||
f"local expert {local_expert}: "
|
||||
f"weights do not match. "
|
||||
f"Expected logical expert {expected_logical_expert}",
|
||||
)
|
||||
|
||||
|
||||
def verify_redundant_experts_have_same_weights(
|
||||
expert_weights: list[list[torch.Tensor]],
|
||||
indices: torch.Tensor,
|
||||
hidden_sizes: list[int],
|
||||
world_size: int,
|
||||
num_local_experts: int,
|
||||
):
|
||||
"""
|
||||
Verify that all replicas of the same logical expert have the same weights.
|
||||
"""
|
||||
num_layers = len(expert_weights)
|
||||
total_physical_experts = world_size * num_local_experts
|
||||
|
||||
for layer in range(num_layers):
|
||||
# Collect weights for all physical experts for each weight matrix
|
||||
all_weights: list[torch.Tensor] = []
|
||||
|
||||
for weight_idx, hidden_size in enumerate(hidden_sizes):
|
||||
# Create tensor to store all expert weights
|
||||
# Shape: [total_physical_experts, hidden_size]
|
||||
gathered_weights = torch.zeros(
|
||||
total_physical_experts,
|
||||
hidden_size,
|
||||
device=expert_weights[layer][weight_idx].device,
|
||||
dtype=expert_weights[layer][weight_idx].dtype,
|
||||
)
|
||||
|
||||
# Use all_gather to collect expert weights from current node
|
||||
# expert_weights[layer][weight_idx] shape:
|
||||
# [num_local_experts, hidden_size]
|
||||
local_weights = expert_weights[layer][
|
||||
weight_idx
|
||||
] # [num_local_experts, hidden_size]
|
||||
|
||||
# Split tensor along dim 0 into a list for all_gather
|
||||
gathered_weights_list = torch.chunk(gathered_weights, world_size, dim=0)
|
||||
|
||||
torch.distributed.all_gather(
|
||||
# Output list: each element corresponds to one rank's weights
|
||||
list(gathered_weights_list),
|
||||
local_weights, # Input: current rank's local weights
|
||||
)
|
||||
|
||||
all_weights.append(gathered_weights)
|
||||
|
||||
# Verify that all replicas of the same logical expert have the same
|
||||
# weights
|
||||
logical_expert_weights: dict[int, dict[int, torch.Tensor]] = {}
|
||||
|
||||
for physical_pos in range(total_physical_experts):
|
||||
logical_expert_id = int(indices[layer, physical_pos].item())
|
||||
|
||||
if logical_expert_id not in logical_expert_weights:
|
||||
# First time encountering this logical expert, save its weights
|
||||
logical_expert_weights[logical_expert_id] = {
|
||||
weight_idx: all_weights[weight_idx][physical_pos]
|
||||
for weight_idx in range(len(hidden_sizes))
|
||||
}
|
||||
else:
|
||||
# Verify that current physical expert's weights match the
|
||||
# previously saved logical expert weights
|
||||
for weight_idx in range(len(hidden_sizes)):
|
||||
torch.testing.assert_close(
|
||||
all_weights[weight_idx][physical_pos],
|
||||
logical_expert_weights[logical_expert_id][weight_idx],
|
||||
msg=f"Layer {layer}, weight {weight_idx},"
|
||||
f"logical expert {logical_expert_id}: "
|
||||
f"Physical expert {physical_pos} has different weights"
|
||||
f"than expected",
|
||||
)
|
||||
|
||||
|
||||
def _test_async_transfer_layer_without_mtp_worker(
    env,
    world_size: int,
    num_layers: int,
    num_local_experts: int,
    num_logical_experts: int,
) -> None:
    """Worker: exercise the async per-layer EPLB transfer path.

    Each layer is transferred with ``transfer_layer`` on a dedicated CUDA
    stream, copied out of the staging buffer, and the final placement is
    verified against the new expert indices.
    """
    set_env_vars_and_device(env)
    ensure_model_parallel_initialized(
        tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
    )

    # Expert parallelism piggybacks on the TP device group here.
    tp_group = get_tp_group()
    ep_group = tp_group.device_group
    ep_rank = torch.distributed.get_rank()
    device = torch.device(f"cuda:{ep_rank}")

    total_physical_experts = world_size * num_local_experts
    hidden_sizes = [16, 32]

    # Two independent random placements: transfer from the old to the new.
    old_redundancy = create_redundancy_config(
        num_logical_experts, total_physical_experts
    )
    old_indices = create_expert_indices_with_redundancy(
        num_layers, num_logical_experts, total_physical_experts, old_redundancy
    )
    new_redundancy = create_redundancy_config(
        num_logical_experts, total_physical_experts
    )
    new_indices = create_expert_indices_with_redundancy(
        num_layers, num_logical_experts, total_physical_experts, new_redundancy
    )

    expert_weights = create_expert_weights(
        num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
    )

    # Staging buffer shaped like a single layer's weights.
    expert_buffer = [torch.empty_like(w) for w in expert_weights[0]]
    cuda_stream = torch.cuda.Stream(device=device)

    for layer_idx in range(num_layers):
        is_unchanged, is_received_locally, experts_recv_loc = asyncio.run(
            transfer_layer(
                old_global_expert_indices=old_indices,
                new_global_expert_indices=new_indices,
                expert_weights=expert_weights,
                expert_weights_buffer=expert_buffer,
                ep_group=ep_group,
                layer=layer_idx,
                cuda_stream=cuda_stream,
            )
        )

        # The transfer runs on the side stream; wait before consuming it.
        cuda_stream.synchronize()
        move_from_buffer(
            expert_weights=expert_weights[layer_idx],
            expert_weights_buffer=expert_buffer,
            is_unchanged=is_unchanged,
            is_received_locally=is_received_locally,
            experts_recv_loc=experts_recv_loc,
            new_indices=new_indices[layer_idx].tolist(),
            ep_group=ep_group,
        )

    verify_expert_weights_after_shuffle(
        expert_weights, new_indices, hidden_sizes, ep_rank, num_local_experts
    )
    verify_redundant_experts_have_same_weights(
        expert_weights, new_indices, hidden_sizes, world_size, num_local_experts
    )
|
||||
|
||||
|
||||
def _test_rearrange_expert_weights_with_redundancy(
    env, world_size, num_layers, num_local_experts, num_logical_experts
) -> None:
    """Worker: shuffle expert weights between two redundant placements.

    Uses tensor parallelism as the entry point to expert parallelism,
    rearranges the weights in place, then verifies both the placement and
    that all replicas of a logical expert hold identical weights.
    """
    set_env_vars_and_device(env)
    ensure_model_parallel_initialized(
        tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
    )

    ep_group = get_tp_group().cpu_group
    ep_rank = torch.distributed.get_rank()
    device = torch.device(f"cuda:{ep_rank}")

    total_physical_experts = world_size * num_local_experts
    hidden_sizes = [32, 64]  # Two different weight matrices per expert.

    # Source placement (with redundancy).
    old_redundancy = create_redundancy_config(
        num_logical_experts, total_physical_experts
    )
    old_indices = create_expert_indices_with_redundancy(
        num_layers, num_logical_experts, total_physical_experts, old_redundancy
    )

    # Target placement (with redundancy).
    new_redundancy = create_redundancy_config(
        num_logical_experts, total_physical_experts
    )
    new_indices = create_expert_indices_with_redundancy(
        num_layers, num_logical_experts, total_physical_experts, new_redundancy
    )

    expert_weights = create_expert_weights(
        num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
    )

    rearrange_expert_weights_inplace(
        old_indices, new_indices, expert_weights, ep_group, is_profile=False
    )

    verify_expert_weights_after_shuffle(
        expert_weights, new_indices, hidden_sizes, ep_rank, num_local_experts
    )
    verify_redundant_experts_have_same_weights(
        expert_weights, new_indices, hidden_sizes, world_size, num_local_experts
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "world_size,num_layers,num_local_experts,num_logical_experts",
    [
        # 2 GPU, 2 experts per GPU
        # 3 logical experts, 4 physical experts, 1 redundant experts
        (2, 1, 2, 3),
        # 2 GPU, 3 experts per GPU
        # 4 logical experts, 6 physical experts, 2 redundant experts
        (2, 2, 3, 4),
        # 2 GPU, 8 experts per GPU
        # 16 logical experts, 16 physical experts, 0 redundant experts
        (2, 4, 8, 16),
        # 4 GPU, 2 experts per GPU
        # 6 logical experts, 8 physical experts, 2 redundant experts
        (4, 1, 2, 6),
        # 4 GPU, 2 experts per GPU
        # 5 logical experts, 8 physical experts, 3 redundant experts
        (4, 2, 2, 5),
        # 4 GPU, 8 experts per GPU
        # 16 logical experts, 32 physical experts, 16 redundant experts
        (4, 8, 8, 16),
    ],
)
def test_rearrange_expert_weights_with_redundancy(
    world_size, num_layers, num_local_experts, num_logical_experts
):
    """Test the functionality of rearranging expert weights with redundancy."""
    if torch.cuda.device_count() < world_size:
        pytest.skip(f"Need at least {world_size} GPUs to run the test")

    # Fan the worker out across `world_size` processes.
    distributed_run(
        _test_rearrange_expert_weights_with_redundancy,
        world_size,
        num_layers,
        num_local_experts,
        num_logical_experts,
    )
|
||||
|
||||
|
||||
def _test_rearrange_expert_weights_no_change(env, world_size) -> None:
    """Worker: rearranging with identical indices must be a no-op."""
    set_env_vars_and_device(env)
    ensure_model_parallel_initialized(
        tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
    )

    ep_group = get_tp_group().cpu_group
    ep_rank = torch.distributed.get_rank()
    device = torch.device(f"cuda:{ep_rank}")

    num_layers = 2
    num_local_experts = 2
    total_physical_experts = world_size * num_local_experts
    num_logical_experts = total_physical_experts // 2  # Some redundancy
    hidden_sizes = [32, 64]

    # Every logical expert is replicated exactly twice.
    redundancy_config = [2] * num_logical_experts

    # Same indices - no change
    indices = create_expert_indices_with_redundancy(
        num_layers, num_logical_experts, total_physical_experts, redundancy_config
    )

    expert_weights = create_expert_weights(
        num_layers, num_local_experts, hidden_sizes, ep_rank, device, indices
    )

    # Snapshot the weights before the (expected no-op) rearrangement.
    original_weights = [
        [weight.clone() for weight in layer_weights]
        for layer_weights in expert_weights
    ]

    rearrange_expert_weights_inplace(
        indices,
        indices,  # Same indices
        expert_weights,
        ep_group,
        is_profile=False,
    )

    # Every weight tensor must be bit-identical to its snapshot.
    for layer in range(num_layers):
        for weight_idx in range(len(hidden_sizes)):
            torch.testing.assert_close(
                expert_weights[layer][weight_idx],
                original_weights[layer][weight_idx],
                msg=f"""Layer {layer}, weight {weight_idx}
                should remain unchanged""",
            )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "world_size,num_layers,num_local_experts,num_logical_experts",
    [
        (2, 2, 2, 3),
    ],
)
def test_async_transfer_layer_without_mtp(
    world_size: int,
    num_layers: int,
    num_local_experts: int,
    num_logical_experts: int,
):
    """Exercise async EPLB transfer path without MTP/spec decode."""
    if torch.cuda.device_count() < world_size:
        pytest.skip(f"Need at least {world_size} GPUs to run the test")

    # Fan the async-transfer worker out across `world_size` processes.
    distributed_run(
        _test_async_transfer_layer_without_mtp_worker,
        world_size,
        num_layers,
        num_local_experts,
        num_logical_experts,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("world_size", [2, 4])
def test_rearrange_expert_weights_no_change(world_size):
    """When the indices do not change, the weights must remain unchanged."""
    if torch.cuda.device_count() < world_size:
        pytest.skip(f"Need at least {world_size} GPUs to run the test")
    # Run the no-op rearrangement worker on every rank.
    distributed_run(_test_rearrange_expert_weights_no_change, world_size)
|
||||
|
||||
|
||||
def _test_rearrange_expert_weights_profile_mode(env, world_size) -> None:
    """Worker: profile-mode rearrangement must not touch the weights."""
    set_env_vars_and_device(env)
    ensure_model_parallel_initialized(
        tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
    )

    ep_group = get_tp_group().cpu_group
    ep_rank = torch.distributed.get_rank()
    device = torch.device(f"cuda:{ep_rank}")

    num_layers = 1
    num_local_experts = 2
    total_physical_experts = world_size * num_local_experts
    num_logical_experts = total_physical_experts // 2
    hidden_sizes = [32]

    # Two genuinely different placements, so a real (non-profile)
    # rearrangement would move data.
    old_redundancy = create_redundancy_config(
        num_logical_experts, total_physical_experts
    )
    new_redundancy = create_redundancy_config(
        num_logical_experts, total_physical_experts
    )
    old_indices = create_expert_indices_with_redundancy(
        num_layers, num_logical_experts, total_physical_experts, old_redundancy
    )
    new_indices = create_expert_indices_with_redundancy(
        num_layers, num_logical_experts, total_physical_experts, new_redundancy
    )

    expert_weights = create_expert_weights(
        num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
    )

    # Snapshot the weights before running in profile mode.
    original_weights = [
        [weight.clone() for weight in layer_weights]
        for layer_weights in expert_weights
    ]

    rearrange_expert_weights_inplace(
        old_indices,
        new_indices,
        expert_weights,
        ep_group,
        is_profile=True,  # Profile mode
    )

    # In profile mode, the weights should remain unchanged.
    for layer in range(num_layers):
        for weight_idx in range(len(hidden_sizes)):
            torch.testing.assert_close(
                expert_weights[layer][weight_idx],
                original_weights[layer][weight_idx],
                msg="In profile mode, the weights should remain unchanged",
            )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("world_size", [2, 4])
def test_rearrange_expert_weights_profile_mode(world_size):
    """Profile mode must not copy any actual weights."""
    if torch.cuda.device_count() < world_size:
        pytest.skip(f"Need at least {world_size} GPUs to run the test")
    # Run the profile-mode worker on every rank.
    distributed_run(_test_rearrange_expert_weights_profile_mode, world_size)
|
||||
285
tests/distributed/test_eplb_fused_moe_layer.py
Normal file
285
tests/distributed/test_eplb_fused_moe_layer.py
Normal file
@@ -0,0 +1,285 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Test that the interaction between EPLB and FusedMoE Layer is okay
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace
|
||||
from vllm.distributed.parallel_state import (
|
||||
ensure_model_parallel_initialized,
|
||||
get_tp_group,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
|
||||
|
||||
from .eplb_utils import distributed_run, set_env_vars_and_device
|
||||
|
||||
|
||||
@dataclass
|
||||
class TestConfig:
|
||||
num_layers: int
|
||||
num_experts: int
|
||||
num_local_experts: int
|
||||
num_topk: int
|
||||
hidden_size: int
|
||||
intermediate_size: int
|
||||
weight_dtype: torch.dtype
|
||||
weight_scale_dtype: torch.dtype | None
|
||||
column_major_scales: bool
|
||||
|
||||
|
||||
def make_expert_weights(
    layer_idx: int,
    global_expert_idx: int,
    global_num_experts: int,
    tensor_shape: tuple[int, ...],
    tensor_dtype: torch.dtype,
    tensor_device: torch.device,
    is_column_major: bool,
) -> torch.Tensor:
    """Build a deterministic 2-D expert weight tensor.

    The values are a contiguous ``arange`` whose start is unique per
    (layer, global expert), so shuffles can be verified exactly.  When
    ``is_column_major`` is set the data is allocated transposed and a
    non-contiguous transposed view is returned.
    """
    assert len(tensor_shape) == 2

    rows, cols = tensor_shape
    alloc_shape = (cols, rows) if is_column_major else (rows, cols)

    count = alloc_shape[0] * alloc_shape[1]
    start = (layer_idx * global_num_experts + global_expert_idx) * count
    out = torch.arange(
        start, start + count, dtype=tensor_dtype, device=tensor_device
    ).reshape(alloc_shape)

    if not is_column_major:
        return out

    out = out.t()
    assert not out.is_contiguous()
    return out
|
||||
|
||||
|
||||
def make_fused_moe_layer(
    rank: int,
    layer_idx: int,
    test_config: TestConfig,
) -> FusedMoE:
    """Build a FusedMoE layer with deterministic per-expert weights and scales.

    Weights (and block-quant scale tensors) are filled via
    ``make_expert_weights`` so each (layer, global expert) slot holds a
    unique, recognizable value range — letting later tests verify exactly
    which expert's data ended up where after an EPLB shuffle.
    """
    fml = FusedMoE(
        num_experts=test_config.num_experts,
        top_k=test_config.num_topk,
        hidden_size=test_config.hidden_size,
        intermediate_size=test_config.intermediate_size,
        prefix=f"dummy_layer_{layer_idx}",
        activation="silu",
        is_act_and_mul=True,
        params_dtype=test_config.weight_dtype,
    )

    device = torch.device(f"cuda:{rank}")

    from functools import partial

    # Pre-bind the arguments that are constant for this layer.
    _make_expert_weights = partial(
        make_expert_weights,
        layer_idx=layer_idx,
        global_num_experts=test_config.num_experts,
        tensor_device=device,
    )

    assert isinstance(fml.w13_weight.data, torch.Tensor)
    assert isinstance(fml.w2_weight.data, torch.Tensor)
    fml.w13_weight.data = fml.w13_weight.data.to(device=device)
    fml.w2_weight.data = fml.w2_weight.data.to(device=device)
    w13_weight = fml.w13_weight.data
    w2_weight = fml.w2_weight.data
    assert w13_weight.size(0) == test_config.num_local_experts
    # Fill each local expert slot with its globally-unique pattern.
    for i in range(test_config.num_local_experts):
        g_i = rank * test_config.num_local_experts + i
        w13_weight_e = w13_weight[i]
        w2_weight_e = w2_weight[i]
        w13_weight_e.copy_(
            _make_expert_weights(
                global_expert_idx=g_i,
                tensor_shape=w13_weight_e.shape,
                tensor_dtype=w13_weight_e.dtype,
                is_column_major=False,
            )
        )
        w2_weight_e.copy_(
            _make_expert_weights(
                global_expert_idx=g_i,
                tensor_shape=w2_weight_e.shape,
                tensor_dtype=w2_weight_e.dtype,
                is_column_major=False,
            )
        )

    block_size = 16

    def block_quant_scales_shape(
        shape: tuple[int, ...], is_column_major: bool
    ) -> tuple[int, ...]:
        # One scale per (block_size x block_size) tile; column-major swaps
        # the two trailing dims.
        assert len(shape) == 3
        if not is_column_major:
            return (shape[0], shape[1] // block_size, shape[2] // block_size)
        else:
            return (shape[0], shape[2] // block_size, shape[1] // block_size)

    is_column_major = test_config.column_major_scales
    w13_weight_scale_inv = torch.empty(
        block_quant_scales_shape(w13_weight.shape, is_column_major),
        dtype=test_config.weight_dtype,
        device=device,
    )
    w2_weight_scale_inv = torch.empty(
        block_quant_scales_shape(w2_weight.shape, is_column_major),
        dtype=test_config.weight_dtype,
        device=device,
    )

    for i in range(test_config.num_local_experts):
        g_i = rank * test_config.num_local_experts + i
        w13_s_e = w13_weight_scale_inv[i]
        w2_s_e = w2_weight_scale_inv[i]
        w13_s_e.copy_(
            _make_expert_weights(
                global_expert_idx=g_i,
                tensor_shape=w13_s_e.shape,
                tensor_dtype=w13_s_e.dtype,
                # Fill data in row-major and then
                # transpose if test_config requires col-major.
                is_column_major=False,
            )
        )
        w2_s_e.copy_(
            _make_expert_weights(
                global_expert_idx=g_i,
                tensor_shape=w2_s_e.shape,
                tensor_dtype=w2_s_e.dtype,
                is_column_major=False,
            )
        )
    if is_column_major:
        # Expose the scales as non-contiguous transposed views, matching
        # the column-major layout the test is exercising.
        w13_weight_scale_inv = torch.transpose(w13_weight_scale_inv, 1, 2)
        w2_weight_scale_inv = torch.transpose(w2_weight_scale_inv, 1, 2)
        assert not w13_weight_scale_inv.is_contiguous()
        assert not w2_weight_scale_inv.is_contiguous()

    # Add scales to the parameter list
    fml.w13_weight_scale_inv = torch.nn.Parameter(
        w13_weight_scale_inv, requires_grad=False
    )
    fml.w2_weight_scale_inv = torch.nn.Parameter(
        w2_weight_scale_inv, requires_grad=False
    )

    return fml
|
||||
|
||||
|
||||
def _test_eplb_fml(env, world_size: int, test_config: TestConfig):
    """Worker: EPLB weight rearrangement must land the exact expert data.

    Builds FusedMoE layers with deterministic per-expert patterns, shuffles
    them with ``rearrange_expert_weights_inplace``, then recomputes the
    expected pattern for every local slot and compares element-wise.
    """
    # Initialize model parallel (using tensor parallel as an entrypoint
    # to expert parallel)
    set_env_vars_and_device(env)

    vllm_config = VllmConfig()
    vllm_config.parallel_config.tensor_parallel_size = world_size
    vllm_config.parallel_config.enable_expert_parallel = True

    with set_current_vllm_config(vllm_config):
        ensure_model_parallel_initialized(
            tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
        )

        ep_group = get_tp_group().cpu_group
        ep_rank = torch.distributed.get_rank()

        fml_layers = [
            make_fused_moe_layer(ep_rank, layer_idx, test_config)
            for layer_idx in range(test_config.num_layers)
        ]
        rank_expert_weights = [fml.get_expert_weights() for fml in fml_layers]

        # Identity placement: physical slot i holds logical expert i.
        indices = torch.zeros(
            test_config.num_layers, test_config.num_experts, dtype=torch.long
        )
        for lidx in range(test_config.num_layers):
            indices[lidx] = torch.Tensor(range(test_config.num_experts))

        # Random target placement per layer.
        shuffled_indices = torch.zeros_like(indices)
        for lidx in range(test_config.num_layers):
            shuffled_indices[lidx] = torch.randperm(test_config.num_experts)

        rearrange_expert_weights_inplace(
            indices,
            shuffled_indices,
            rank_expert_weights,
            ep_group,
            is_profile=False,
        )

        num_local_experts = test_config.num_local_experts
        num_global_experts = test_config.num_experts
        for lidx, fml in enumerate(fml_layers):
            for name, w in fml.named_parameters():
                for e in range(num_local_experts):
                    # Which logical expert should now live in local slot e?
                    g_e = shuffled_indices[lidx][ep_rank * num_local_experts + e]
                    # Recreate that expert's deterministic pattern; layout
                    # (row/col major) is inferred from contiguity.
                    ref = make_expert_weights(
                        layer_idx=lidx,
                        global_expert_idx=int(g_e.item()),
                        global_num_experts=num_global_experts,
                        tensor_shape=w[e].shape,
                        tensor_dtype=w[e].dtype,
                        tensor_device=w[e].device,
                        is_column_major=not w[e].is_contiguous(),
                    )
                    assert w[e].shape == ref.shape and w[e].stride() == ref.stride(), (
                        f"w[{e}] {w[e].size()} {w[e].stride()} vs "
                        f"ref {ref.size()} {ref.stride()}"
                    )
                    torch.testing.assert_close(w[e], ref)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("world_size", [2])
@pytest.mark.parametrize("num_layers", [4])
@pytest.mark.parametrize("num_experts", [16])
@pytest.mark.parametrize("hidden_size", [256])
@pytest.mark.parametrize("intermediate_size", [256])
@pytest.mark.parametrize("column_major_scales", [True, False])
def test_eplb_fml(
    world_size: int,
    num_layers: int,
    num_experts: int,
    hidden_size: int,
    intermediate_size: int,
    column_major_scales: bool,
):
    """Check the EPLB <-> FusedMoE interaction end to end."""
    if torch.cuda.device_count() < world_size:
        pytest.skip(f"Need at least {world_size} GPUs to run the test")

    # The dtypes are fine as we are essentially just checking data-copies.
    test_config = TestConfig(
        num_layers=num_layers,
        num_experts=num_experts,
        num_local_experts=num_experts // world_size,
        num_topk=4,
        hidden_size=hidden_size,
        intermediate_size=intermediate_size,
        weight_dtype=torch.bfloat16,
        weight_scale_dtype=torch.bfloat16,
        column_major_scales=column_major_scales,
    )

    distributed_run(_test_eplb_fml, world_size, test_config)
|
||||
276
tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py
Normal file
276
tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py
Normal file
@@ -0,0 +1,276 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Test that the interaction between EPLB and FusedMoE Layer is okay for DP w/ NVFP4
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.moe.utils import make_test_quant_config
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace
|
||||
from vllm.distributed.parallel_state import (
|
||||
ensure_model_parallel_initialized,
|
||||
get_dp_group,
|
||||
)
|
||||
from vllm.forward_context import set_forward_context
|
||||
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
|
||||
from vllm.model_executor.layers.quantization.modelopt import (
|
||||
ModelOptNvFp4Config,
|
||||
ModelOptNvFp4FusedMoE,
|
||||
)
|
||||
|
||||
from .eplb_utils import distributed_run, set_env_vars_and_device
|
||||
|
||||
|
||||
@dataclass
class TestConfig:
    """Shape knobs for building an NVFP4-quantized FusedMoE layer under test."""

    # Despite the Test* name this is a plain config dataclass, not a test
    # class; tell pytest not to collect it (silences PytestCollectionWarning).
    # Unannotated, so the dataclass machinery ignores it and the generated
    # __init__ is unchanged.
    __test__ = False

    num_layers: int
    num_experts: int
    num_local_experts: int
    num_topk: int
    hidden_size: int
    intermediate_size: int
    num_tokens: int
|
||||
|
||||
|
||||
def make_fused_moe_layer(
    rank: int,
    layer_idx: int,
    test_config: TestConfig,
) -> FusedMoE:
    """Build an NVFP4-quantized FusedMoE layer with randomized test weights.

    Creates the layer through the ModelOpt NVFP4 quantization method,
    installs randomly generated quantized weights/scales, and finalizes the
    layer (post-load processing + modular-kernel init) so it can be called
    in a forward pass.
    """
    device = torch.device(f"cuda:{rank}")

    quant_config = ModelOptNvFp4Config(
        is_checkpoint_nvfp4_serialized=True,
        kv_cache_quant_algo=None,
        exclude_modules=[],
    )

    fml = FusedMoE(
        num_experts=test_config.num_experts,
        top_k=test_config.num_topk,
        hidden_size=test_config.hidden_size,
        intermediate_size=test_config.intermediate_size,
        prefix=f"dummy_layer_{layer_idx}",
        activation="silu",
        is_act_and_mul=True,
        params_dtype=torch.bfloat16,
        quant_config=quant_config,
    )

    nvfp4_fused_moe = ModelOptNvFp4FusedMoE(quant_config, fml)
    nvfp4_fused_moe.create_weights(
        fml,
        test_config.num_local_experts,
        test_config.hidden_size,
        test_config.intermediate_size,
        params_dtype=torch.uint8,
        global_num_experts=test_config.num_experts,
    )

    fml = fml.to(device)
    # Only the quantized weights are used; the helper's quant config is
    # discarded (the layer was already built with `quant_config` above —
    # previously this unpack shadowed that variable).
    w1_q, w2_q, _ = make_test_quant_config(
        test_config.num_local_experts,
        test_config.intermediate_size,
        test_config.hidden_size,
        in_dtype=torch.bfloat16,
        quant_dtype="nvfp4",
        block_shape=None,
        per_act_token_quant=False,
    )

    fml.w13_weight.data = w1_q
    fml.w2_weight.data = w2_q

    # Randomize every scale tensor (scaled down to keep values tame).
    fml.w2_input_scale.data = torch.randn_like(fml.w2_input_scale.data) / 5
    fml.w13_input_scale.data = torch.randn_like(fml.w13_input_scale.data) / 5
    fml.w2_weight_scale_2.data = torch.randn_like(fml.w2_weight_scale_2.data) / 5
    fml.w13_weight_scale_2.data = torch.randn_like(fml.w13_weight_scale_2.data) / 5
    # These scales may not be a float dtype, so sample in fp32 and cast.
    fml.w2_weight_scale.data = (
        torch.randn(fml.w2_weight_scale.data.shape, device=device) / 5
    ).to(fml.w2_weight_scale.data.dtype)
    fml.w13_weight_scale.data = (
        torch.randn(fml.w13_weight_scale.data.shape, device=device) / 5
    ).to(fml.w13_weight_scale.data.dtype)

    nvfp4_fused_moe.process_weights_after_loading(fml)

    fml.maybe_init_modular_kernel()

    return fml
|
||||
|
||||
|
||||
def _test_eplb_fml(env, world_size: int, test_config: TestConfig):
    """Worker: NVFP4 MoE forward output must be invariant to an EPLB shuffle.

    Runs a forward pass, shuffles the expert weights with
    ``rearrange_expert_weights_inplace``, installs the matching EPLB
    routing state, reruns the forward pass, and compares outputs.
    """
    set_env_vars_and_device(env)

    vllm_config = VllmConfig()
    vllm_config.parallel_config.data_parallel_size = world_size
    vllm_config.parallel_config.enable_expert_parallel = True

    with set_current_vllm_config(vllm_config):
        ensure_model_parallel_initialized(
            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
        )

        # Expert parallelism rides on the data-parallel group in this test.
        ep_group = get_dp_group().cpu_group
        ep_rank = torch.distributed.get_rank()

        device = torch.device(f"cuda:{ep_rank}")

        fml_layers = [
            make_fused_moe_layer(ep_rank, layer_idx, test_config).to(device)
            for layer_idx in range(test_config.num_layers)
        ]
        rank_expert_weights = [fml.get_expert_weights() for fml in fml_layers]

        # Fixed random inputs, reused (cloned) before and after the shuffle.
        hidden_states = []
        router_logits = []
        for layer_idx in range(test_config.num_layers):
            hidden_states.append(
                torch.randn(
                    (test_config.num_tokens, test_config.hidden_size),
                    dtype=torch.bfloat16,
                    device=device,
                )
            )
            router_logits.append(
                torch.randn(
                    (test_config.num_tokens, test_config.num_experts),
                    dtype=torch.bfloat16,
                    device=device,
                )
            )

        # Reference forward pass with the identity expert placement.
        out_before_shuffle = []
        with set_forward_context(
            {},
            num_tokens=test_config.num_tokens,
            num_tokens_across_dp=torch.tensor(
                [test_config.num_tokens] * world_size, device="cpu", dtype=torch.int
            ),
            vllm_config=vllm_config,
        ):
            for lidx, fml in enumerate(fml_layers):
                out_before_shuffle.append(
                    fml(hidden_states[lidx].clone(), router_logits[lidx].clone())
                )

        # Identity placement: physical slot i holds logical expert i.
        indices = torch.zeros(
            test_config.num_layers, test_config.num_experts, dtype=torch.long
        )
        for lidx in range(test_config.num_layers):
            indices[lidx] = torch.Tensor(range(test_config.num_experts))

        # Random target placement per layer.
        shuffled_indices = torch.zeros_like(indices)
        for lidx in range(test_config.num_layers):
            shuffled_indices[lidx] = torch.randperm(test_config.num_experts)

        rearrange_expert_weights_inplace(
            indices,
            shuffled_indices,
            rank_expert_weights,
            ep_group,
            is_profile=False,
        )

        num_global_experts = test_config.num_experts

        # Invert each layer's physical->logical map into logical->physical.
        logical_to_physical_map_list = []
        for lidx, fml in enumerate(fml_layers):
            physical_to_logical_map = shuffled_indices[lidx].to(device)
            logical_to_physical_map = torch.empty(
                (num_global_experts,), dtype=torch.int32, device=device
            )
            logical_to_physical_map[physical_to_logical_map] = torch.arange(
                0, num_global_experts, dtype=torch.int32, device=device
            )
            logical_to_physical_map_list.append(
                logical_to_physical_map.reshape(num_global_experts, 1)
            )

        logical_to_physical_map = torch.stack(logical_to_physical_map_list)

        # Install EPLB routing state so the router follows the shuffle.
        for lidx, fml in enumerate(fml_layers):
            # No redundancy in this test: one replica per logical expert.
            logical_replica_count = torch.ones(
                (test_config.num_layers, num_global_experts),
                dtype=torch.int32,
                device=device,
            )
            fml.enable_eplb = True
            fml.set_eplb_state(
                lidx,
                torch.zeros(
                    (test_config.num_layers, num_global_experts),
                    dtype=torch.int32,
                    device=device,
                ),
                logical_to_physical_map,
                logical_replica_count,
            )

        # Forward pass after the shuffle; must match the reference output.
        out_after_shuffle = []
        with set_forward_context(
            {},
            num_tokens=test_config.num_tokens,
            num_tokens_across_dp=torch.tensor(
                [test_config.num_tokens] * world_size, device="cpu", dtype=torch.int
            ),
            vllm_config=vllm_config,
        ):
            for lidx, fml in enumerate(fml_layers):
                out_after_shuffle.append(
                    fml(hidden_states[lidx].clone(), router_logits[lidx].clone())
                )

        # Loose tolerances: NVFP4 quantization kernels are not bit-exact.
        for lidx in range(test_config.num_layers):
            torch.testing.assert_close(
                out_before_shuffle[lidx], out_after_shuffle[lidx], atol=1e-1, rtol=1e-1
            )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("world_size", [2, 4])
@pytest.mark.parametrize("num_layers", [8])
@pytest.mark.parametrize("num_experts", [32])
@pytest.mark.parametrize("hidden_size", [256])
@pytest.mark.parametrize("intermediate_size", [256])
@pytest.mark.parametrize("num_tokens", [256])
@pytest.mark.parametrize("backend", ["latency", "throughput"])
def test_eplb_fml(
    world_size: int,
    num_layers: int,
    num_experts: int,
    hidden_size: int,
    intermediate_size: int,
    num_tokens: int,
    backend: str,
    monkeypatch,
):
    """EPLB + FlashInfer NVFP4 MoE: outputs must match across a shuffle."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", backend)

    if torch.cuda.device_count() < world_size:
        pytest.skip(f"Need at least {world_size} GPUs to run the test")

    test_config = TestConfig(
        num_layers=num_layers,
        num_experts=num_experts,
        num_local_experts=num_experts // world_size,
        num_topk=4,
        hidden_size=hidden_size,
        intermediate_size=intermediate_size,
        num_tokens=num_tokens,
    )

    distributed_run(_test_eplb_fml, world_size, test_config)
|
||||
142
tests/distributed/test_eplb_spec_decode.py
Normal file
142
tests/distributed/test_eplb_spec_decode.py
Normal file
@@ -0,0 +1,142 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from __future__ import annotations
|
||||
|
||||
import lm_eval
|
||||
import pytest
|
||||
|
||||
from tests.utils import large_gpu_mark
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def get_model_args(
|
||||
model_name: str,
|
||||
spec_model_name: str | None,
|
||||
spec_method: str,
|
||||
tp_size: int,
|
||||
model_max_len: int,
|
||||
use_async: bool = False,
|
||||
) -> dict:
|
||||
speculative_config = {
|
||||
"method": spec_method,
|
||||
"model": spec_model_name,
|
||||
"num_speculative_tokens": 1,
|
||||
"max_model_len": model_max_len,
|
||||
}
|
||||
eplb_config = {
|
||||
"num_redundant_experts": tp_size,
|
||||
"window_size": 128,
|
||||
"step_interval": 1024,
|
||||
"log_balancedness": False,
|
||||
}
|
||||
if use_async:
|
||||
eplb_config["use_async"] = True
|
||||
model_args = {
|
||||
"pretrained": model_name,
|
||||
"dtype": "auto",
|
||||
"add_bos_token": True,
|
||||
"tensor_parallel_size": tp_size,
|
||||
"gpu_memory_utilization": 0.7,
|
||||
"speculative_config": speculative_config,
|
||||
"enable_expert_parallel": True,
|
||||
"eplb_config": eplb_config,
|
||||
"enable_eplb": True,
|
||||
"max_model_len": model_max_len,
|
||||
}
|
||||
return model_args
|
||||
|
||||
|
||||
# Module-level mark: skip every test in this file on ROCm, where the
# EPLB + speculative-decode combination is not supported yet.
pytestmark = pytest.mark.skipif(
    current_platform.is_rocm(),
    reason="EPLB with Spec Decode is a work in progress on ROCm.",
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model_setup",
    [
        pytest.param(
            ("mtp", "Qwen/Qwen3-Next-80B-A3B-Instruct", None, 4, 0.86),
            marks=large_gpu_mark(min_gb=80),
        ),
        pytest.param(
            (
                "eagle",
                "meta-llama/Llama-4-Scout-17B-16E-Instruct",
                "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
                4,
                0.92,
            ),
            marks=pytest.mark.skip(reason="Skipping due to CI OOM issues"),
        ),
    ],
    ids=["qwen3_next_mtp", "llama4_eagle"],
)
def test_eplb_spec_decode(
    monkeypatch: pytest.MonkeyPatch,
    model_setup: tuple[str, str, str, int, float],
):
    """
    Test the correctness of EPLB speculative decoding with GSM8K dataset.
    Applicable to MoE models with mtp or eagle spec decode.
    """
    # NOTE(review): `monkeypatch` is accepted but never used in this body —
    # confirm whether it can be dropped from the signature.
    # model_setup = (spec method, target model, draft model or None,
    # tp size, expected GSM8K accuracy).
    method, model_name, spec_model_name, tp_size, expected_gsm8k_value = model_setup

    TASK = "gsm8k"
    # lm_eval metric key: exact match under the strict-match filter.
    FILTER = "exact_match,strict-match"
    # Absolute tolerance around the expected accuracy.
    RTOL = 0.03

    model_args = get_model_args(
        model_name=model_name,
        spec_model_name=spec_model_name,
        spec_method=method,
        tp_size=tp_size,
        model_max_len=4096,
    )

    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=TASK,
        batch_size=64,
        num_fewshot=8,
    )
    measured_value = results["results"][TASK][FILTER]
    # Equivalent to: abs(measured_value - expected_gsm8k_value) < RTOL.
    assert (
        measured_value - RTOL < expected_gsm8k_value
        and measured_value + RTOL > expected_gsm8k_value
    ), f"Expected: {expected_gsm8k_value} | Measured: {measured_value}"
|
||||
|
||||
|
||||
@large_gpu_mark(min_gb=80)
def test_eplb_spec_decode_qwen3_next_mtp_async() -> None:
    """
    Ensure async EPLB works with MTP speculative decoding for Qwen3-Next.
    """
    task = "gsm8k"
    # lm_eval metric key: exact match under the strict-match filter.
    metric_key = "exact_match,strict-match"
    tolerance = 0.03
    expected_gsm8k_value = 0.86

    eval_results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=get_model_args(
            model_name="Qwen/Qwen3-Next-80B-A3B-Instruct",
            spec_model_name=None,
            spec_method="mtp",
            tp_size=4,
            model_max_len=4096,
            use_async=True,
        ),
        tasks=task,
        batch_size=64,
        num_fewshot=8,
    )

    measured_value = eval_results["results"][task][metric_key]
    # Same two-sided check as the parametrized test:
    # |measured - expected| < tolerance.
    assert abs(measured_value - expected_gsm8k_value) < tolerance, (
        f"Expected: {expected_gsm8k_value} | Measured: {measured_value}"
    )
|
||||
314
tests/distributed/test_events.py
Normal file
314
tests/distributed/test_events.py
Normal file
@@ -0,0 +1,314 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import threading
|
||||
import time
|
||||
|
||||
import msgspec
|
||||
import pytest
|
||||
|
||||
from vllm.distributed.kv_events import (
|
||||
EventBatch,
|
||||
EventPublisherFactory,
|
||||
NullEventPublisher,
|
||||
)
|
||||
|
||||
DP_RANK = 0
|
||||
|
||||
|
||||
class EventSample(
    msgspec.Struct,
    tag=True,  # type: ignore
    array_like=True,  # type: ignore
):
    """Test event for publisher testing"""

    # Sequential id within a batch (assigned by create_test_events).
    id: int
    # Payload string; the helpers below use the form "test-{id}".
    value: str
|
||||
|
||||
|
||||
class SampleBatch(EventBatch):
    """Test event batch for publisher testing"""

    # Events carried by this batch; the `ts` timestamp field comes from
    # the EventBatch base (it is set in create_test_events).
    events: list[EventSample]
|
||||
|
||||
|
||||
def create_test_events(count: int) -> SampleBatch:
    """Build a SampleBatch of `count` sequentially numbered test events."""
    samples = []
    for idx in range(count):
        samples.append(EventSample(id=idx, value=f"test-{idx}"))
    return SampleBatch(ts=time.time(), events=samples)
|
||||
|
||||
|
||||
def test_basic_publishing(publisher, subscriber):
    """Test basic event publishing works"""

    # Publish a single five-event batch and wait for it to arrive.
    batch = create_test_events(5)
    publisher.publish(batch)

    message = subscriber.receive_one(timeout=1000)
    assert message is not None, "No message received"

    seq, received = message
    # The very first message carries sequence number 0.
    assert seq == 0, "Sequence number mismatch"
    assert received.ts == pytest.approx(batch.ts, abs=0.1), "Timestamp mismatch"
    assert len(received.events) == len(batch.events), "Number of events mismatch"

    # Events must come back with ids/values exactly as created.
    for expected_id, event in enumerate(received.events):
        assert event.id == expected_id, "Event id mismatch"
        assert event.value == f"test-{expected_id}", "Event value mismatch"
|
||||
|
||||
|
||||
def test_multiple_events(publisher, subscriber):
    """Test publishing and receiving multiple event batches"""
    # Publish ten two-event batches back to back.
    for _ in range(10):
        publisher.publish(create_test_events(2))

    # Drain up to ten messages from the subscriber.
    collected = []
    for _ in range(10):
        msg = subscriber.receive_one(timeout=100)
        if msg:
            collected.append(msg)

    assert len(collected) == 10, "Number of messages mismatch"
    # Sequence numbers must be 0..9, in publish order.
    assert [seq for seq, _ in collected] == list(range(10)), (
        "Sequence numbers mismatch"
    )
|
||||
|
||||
|
||||
def test_replay_mechanism(publisher, subscriber):
    """Test the replay mechanism works correctly"""
    # Publish 19 messages (sequence numbers 0..18) before asking for replay.
    for _ in range(19):
        batch = create_test_events(1)
        publisher.publish(batch)

    time.sleep(0.5)  # Need publisher to process above requests
    # Ask the publisher to re-send everything from sequence number 10 on.
    subscriber.request_replay(10)

    # NOTE(review): presumably this extra publish prompts the publisher
    # loop to service the pending replay request — confirm.
    batch = create_test_events(1)
    publisher.publish(batch)  # 20th message

    replayed = subscriber.receive_replay()

    assert len(replayed) > 0, "No replayed messages received"
    seqs = [seq for seq, _ in replayed]
    # Replay must start at or after the requested sequence number...
    assert all(seq >= 10 for seq in seqs), "Replayed messages not in order"
    # ...and be a gap-free run of sequence numbers.
    assert seqs == list(range(min(seqs), max(seqs) + 1)), (
        "Replayed messages not consecutive"
    )
|
||||
|
||||
|
||||
def test_buffer_limit(publisher, subscriber, publisher_config):
    """Test buffer limit behavior"""
    # Replay-buffer capacity (in steps) from the fixture config.
    buffer_size = publisher_config.buffer_steps

    # Publish more events than the buffer can hold
    for i in range(buffer_size + 10):
        batch = create_test_events(1)
        publisher.publish(batch)

    time.sleep(0.5)  # Need publisher to process above requests
    # Request replay from the very beginning; only what still fits in the
    # buffer can come back.
    subscriber.request_replay(0)

    # NOTE(review): presumably this extra publish prompts the publisher
    # loop to service the pending replay request — confirm.
    batch = create_test_events(1)
    publisher.publish(batch)

    replayed = subscriber.receive_replay()

    assert len(replayed) <= buffer_size, "Can't replay more than buffer size"

    # buffer_size + 10 messages were published, so at least the first 10
    # (sequence numbers 0..9) must have been evicted from the buffer.
    oldest_seq = min(seq for seq, _ in replayed)
    assert oldest_seq >= 10, "The oldest sequence should be at least 10"
|
||||
|
||||
|
||||
def test_topic_filtering(publisher_config):
    """
    Test that a subscriber only receives messages matching its topic filter
    """
    # Replay is irrelevant here; disable it so only the main endpoint is used.
    publisher_config.replay_endpoint = None

    publisher_config.topic = "foo"
    pub = EventPublisherFactory.create(publisher_config, DP_RANK)

    # Imported here rather than at module top; NOTE(review): presumably to
    # avoid import-time side effects from conftest — confirm.
    from .conftest import MockSubscriber

    # One subscriber on the matching topic and one on a different topic.
    sub_foo = MockSubscriber(publisher_config.endpoint, None, "foo")
    sub_bar = MockSubscriber(publisher_config.endpoint, None, "bar")

    try:
        # Brief pause before the first publish (mirrors the "let publishers
        # start up" pattern used by the sibling tests).
        time.sleep(0.1)

        for _ in range(3):
            pub.publish(create_test_events(1))

        # Only the matching-topic subscriber should see the 3 messages.
        foo_received = [sub_foo.receive_one(timeout=200) for _ in range(3)]
        assert all(msg is not None for msg in foo_received), (
            "Subscriber with matching topic should receive messages"
        )

        bar_received = [sub_bar.receive_one(timeout=200) for _ in range(3)]
        assert all(msg is None for msg in bar_received), (
            "Subscriber with non-matching topic should receive no messages"
        )
    finally:
        # Always tear down sockets, even on assertion failure.
        pub.shutdown()
        sub_foo.close()
        sub_bar.close()
|
||||
|
||||
|
||||
def test_high_volume(publisher, subscriber):
    """Test publishing and receiving a high volume of events"""
    num_batches = 10_000
    events_per_batch = 100

    # Publish events in a separate thread to not block
    def publish_events():
        for i in range(num_batches):
            batch = create_test_events(events_per_batch)
            publisher.publish(batch)
            # Small delay to avoid overwhelming
            if i % 100 == 0:
                time.sleep(0.01)

    received: list[tuple[int, SampleBatch]] = []

    publisher_thread = threading.Thread(target=publish_events)
    publisher_thread.start()

    # Drain the subscriber until everything arrived or the deadline passes.
    start_time = time.time()
    while len(received) < num_batches:
        if time.time() - start_time > 10:  # Timeout after 10 seconds
            break

        result = subscriber.receive_one(timeout=100)
        if result:
            received.append(result)

    publisher_thread.join()

    # Tolerate a small amount of loss under load.
    assert len(received) >= num_batches * 0.9, "We should have received most messages"

    # Whatever did arrive must still be in sequence order.
    seqs = [seq for seq, _ in received]
    assert sorted(seqs) == seqs, "Sequence numbers should be in order"
|
||||
|
||||
|
||||
def test_null_publisher():
    """Test that NullEventPublisher can be used without errors"""
    null_pub = NullEventPublisher(DP_RANK)

    # Publishing to the null sink and shutting it down must both be
    # error-free no-ops.
    null_pub.publish(create_test_events(5))
    null_pub.shutdown()
|
||||
|
||||
|
||||
def test_data_parallel_rank_tagging(publisher_config):
    """Test that events are properly tagged with their data parallel rank"""

    publisher_config.topic = "foo"
    # One publisher per data-parallel rank (0 and 1).
    pub_0 = EventPublisherFactory.create(publisher_config, DP_RANK)
    pub_1 = EventPublisherFactory.create(publisher_config, DP_RANK + 1)

    # Hardcode the expected endpoints based on port offsetting behavior
    # Both ranks get offsets according to _offset_endpoint_port function
    base_endpoint = publisher_config.endpoint
    if "tcp://" in base_endpoint:
        # For TCP endpoints: tcp://localhost:5557 -> tcp://localhost:5557, tcp://localhost:5558
        # NOTE(review): this only works when the endpoint actually contains
        # port 5557 — str.replace is a silent no-op otherwise. Confirm the
        # fixture always uses that port for the TCP case.
        expected_endpoint_0 = base_endpoint  # rank 0 gets port + 0 = same port
        expected_endpoint_1 = base_endpoint.replace(
            ":5557", ":5558"
        )  # rank 1 gets port + 1
    else:
        # For inproc endpoints: inproc://test -> inproc://test_dp0, inproc://test_dp1
        expected_endpoint_0 = base_endpoint  # rank 0 gets base
        expected_endpoint_1 = base_endpoint + "_dp1"  # rank 1 gets _dp1

    from .conftest import MockSubscriber

    # One subscriber per per-rank endpoint, same topic.
    sub_0 = MockSubscriber(expected_endpoint_0, None, publisher_config.topic)
    sub_1 = MockSubscriber(expected_endpoint_1, None, publisher_config.topic)

    try:
        time.sleep(0.1)  # Let publishers start up

        # Publish events from different ranks
        batch_0 = create_test_events(2)
        batch_1 = create_test_events(3)

        pub_0.publish(batch_0)
        pub_1.publish(batch_1)

        # Receive events from rank 0
        result_0 = sub_0.receive_one(timeout=200)
        assert result_0 is not None, "No message received from rank 0"
        seq_0, received_0 = result_0

        # Receive events from rank 1
        result_1 = sub_1.receive_one(timeout=200)
        assert result_1 is not None, "No message received from rank 1"
        seq_1, received_1 = result_1

        # Verify DP rank tagging
        assert received_0.data_parallel_rank == 0, (
            f"Expected DP rank 0, got {received_0.data_parallel_rank}"
        )
        assert received_1.data_parallel_rank == 1, (
            f"Expected DP rank 1, got {received_1.data_parallel_rank}"
        )

        # Verify event content is correct
        assert len(received_0.events) == 2, "Wrong number of events from rank 0"
        assert len(received_1.events) == 3, "Wrong number of events from rank 1"

    finally:
        pub_0.shutdown()
        pub_1.shutdown()
        sub_0.close()
        sub_1.close()
|
||||
|
||||
|
||||
def test_event_publisher_factory():
    """Test event publisher factory creation behavior under different configurations"""
    # These names are only needed by this test, so they are imported locally.
    from vllm.config.kv_events import KVEventsConfig
    from vllm.distributed.kv_events import ZmqEventPublisher

    # test config is None
    publisher = EventPublisherFactory.create(None, DP_RANK)
    assert isinstance(publisher, NullEventPublisher)
    publisher.shutdown()

    # test disable kv cache events
    config = KVEventsConfig(
        enable_kv_cache_events=False,
        publisher="zmq",  # Even if zmq is specified, should return NullEventPublisher
        endpoint="tcp://localhost:5557",
    )
    publisher = EventPublisherFactory.create(config, DP_RANK)
    assert isinstance(publisher, NullEventPublisher)
    publisher.shutdown()

    # test zmq publisher
    config = KVEventsConfig(
        enable_kv_cache_events=True,
        publisher="zmq",
        endpoint="inproc://test-factory-true",
    )
    publisher = EventPublisherFactory.create(config, DP_RANK)
    assert isinstance(publisher, ZmqEventPublisher)
    publisher.shutdown()

    # test unknown publisher
    # NOTE(review): the match string "Input should be" looks like a config
    # validation message — the config constructor itself is expected to
    # reject unknown publishers, before the factory is involved.
    with pytest.raises(ValueError, match="Input should be"):
        KVEventsConfig(
            enable_kv_cache_events=True,
            publisher="unknown_publisher",
            endpoint="tcp://localhost:5557",
        )

    # test publisher not specified
    config = KVEventsConfig(
        enable_kv_cache_events=True,
        # publisher not specified, should default to "zmq"
        endpoint="tcp://localhost:5557",
    )
    publisher = EventPublisherFactory.create(config, DP_RANK)
    assert isinstance(publisher, ZmqEventPublisher)
    publisher.shutdown()
|
||||
231
tests/distributed/test_expert_parallel.py
Normal file
231
tests/distributed/test_expert_parallel.py
Normal file
@@ -0,0 +1,231 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, NamedTuple
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config.model import RunnerOption
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
|
||||
logger = init_logger("test_expert_parallel")
|
||||
|
||||
|
||||
class ParallelSetup(NamedTuple):
    """One parallel configuration to exercise in the EP-vs-TP comparison."""

    # Tensor-parallel size (forwarded as --tensor-parallel-size).
    tp_size: int
    # When True, "--enforce-eager" is added (see _compare_tp).
    eager_mode: bool
    # When True, "--enable-chunked-prefill" is added (see _compare_tp).
    chunked_prefill: bool
|
||||
|
||||
|
||||
class EPTestOptions(NamedTuple):
    """Per-model CLI toggles shared by every parallel setup (see _compare_tp)."""

    # When True, "--trust-remote-code" is added.
    trust_remote_code: bool
    # Forwarded as "--tokenizer-mode" when set.
    tokenizer_mode: str | None
    # Forwarded as "--load-format" when set.
    load_format: str | None = None
    # Forwarded as "--hf-overrides" when set.
    hf_overrides: str | None = None
|
||||
|
||||
|
||||
@dataclass
class EPTestSettings:
    """Describes how one model is exercised by the EP-vs-TP tests.

    Attributes:
        parallel_setups: Parallel configurations to try.
        distributed_backends: Executor backends to try (e.g. "mp", "ray").
        runner: --runner option forwarded to the server ("auto" = omit).
        test_options: Extra CLI toggles shared by all setups.
    """

    parallel_setups: list[ParallelSetup]
    distributed_backends: list[str]
    runner: RunnerOption
    test_options: EPTestOptions

    @staticmethod
    def detailed(
        *,
        tp_base: int = 2,
        runner: RunnerOption = "auto",
        trust_remote_code: bool = False,
        tokenizer_mode: str | None = None,
        load_format: str | None = None,
        hf_overrides: str | None = None,
    ):
        """Thorough matrix: several eager/chunked-prefill combinations at
        tp_base and 2*tp_base, on both "mp" and "ray" backends."""
        return EPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base, eager_mode=False, chunked_prefill=False),
                ParallelSetup(tp_size=tp_base, eager_mode=False, chunked_prefill=True),
                ParallelSetup(tp_size=tp_base, eager_mode=True, chunked_prefill=False),
                ParallelSetup(
                    tp_size=2 * tp_base, eager_mode=False, chunked_prefill=True
                ),
                ParallelSetup(
                    tp_size=2 * tp_base, eager_mode=True, chunked_prefill=False
                ),
            ],
            distributed_backends=["mp", "ray"],
            runner=runner,
            test_options=EPTestOptions(
                trust_remote_code=trust_remote_code,
                tokenizer_mode=tokenizer_mode,
                load_format=load_format,
                hf_overrides=hf_overrides,
            ),
        )

    @staticmethod
    def fast(
        *,
        tp_base: int = 2,
        runner: RunnerOption = "auto",
        trust_remote_code: bool = False,
        tokenizer_mode: str | None = None,
        load_format: str | None = None,
        hf_overrides: str | None = None,
    ):
        """Minimal matrix: a single eager setup at tp_base on "mp" only."""
        return EPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base, eager_mode=True, chunked_prefill=False),
            ],
            distributed_backends=["mp"],
            runner=runner,
            test_options=EPTestOptions(
                trust_remote_code=trust_remote_code,
                tokenizer_mode=tokenizer_mode,
                load_format=load_format,
                hf_overrides=hf_overrides,
            ),
        )

    def iter_params(self, model_name: str):
        """Yield one pytest-param tuple per (setup, backend) combination, in
        the order expected by test_ep's parametrize:
        (model_name, parallel_setup, distributed_backend, runner, options)."""
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
            for distributed_backend in self.distributed_backends:
                yield (
                    model_name,
                    parallel_setup,
                    distributed_backend,
                    self.runner,
                    opts,
                )
|
||||
|
||||
|
||||
# NOTE: You can adjust tp_base locally to fit the model in GPU
# The values displayed here are only a rough indicator of the size of the model

# Maps model name -> test settings; expanded into pytest parameters by
# EPTestSettings.iter_params in the parametrize block below.
TEST_MODELS = {
    "deepseek-ai/DeepSeek-V2-Lite-Chat": EPTestSettings.fast(trust_remote_code=True),
    "mistralai/Mixtral-8x7B-Instruct-v0.1": EPTestSettings.fast(tp_base=4),
}
|
||||
|
||||
|
||||
def _compare_tp(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    runner: RunnerOption,
    test_options: EPTestOptions,
    num_gpus_available: int,
    *,
    method: Literal["generate"],
):
    """Run the same model with and without expert parallelism and compare.

    Uses compare_two_settings to launch two configurations with identical
    common CLI arguments: one with VLLM_TEST_ENABLE_EP=1 (expert parallel,
    on `distributed_backend`) and one with VLLM_TEST_ENABLE_EP=0 (plain
    tensor parallel, always on "mp"), and checks their outputs agree.

    Args:
        model_name: HF model identifier under test.
        parallel_setup: (tp_size, eager_mode, chunked_prefill).
        distributed_backend: Executor backend for the EP run ("mp"/"ray").
        runner: --runner value; omitted when "auto".
        test_options: Extra CLI toggles (trust_remote_code, tokenizer_mode,
            load_format, hf_overrides).
        num_gpus_available: GPUs visible to the test; the test is skipped
            when fewer than tp_size.
        method: Comparison method; only "generate" is used here.
    """
    (
        tp_size,
        eager_mode,
        chunked_prefill,
    ) = parallel_setup
    (
        trust_remote_code,
        tokenizer_mode,
        load_format,
        hf_overrides,
    ) = test_options

    if num_gpus_available < tp_size:
        pytest.skip(f"Need at least {tp_size} GPUs")

    common_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "8",
        # Fix: the original always added "--load-format auto" here AND
        # conditionally appended "--load-format <value>" again below,
        # passing the flag twice (argparse keeps the last value). Emit the
        # flag exactly once, defaulting to "auto" when no override is set.
        "--load-format",
        load_format or "auto",
    ]
    if chunked_prefill:
        common_args.append("--enable-chunked-prefill")
    if eager_mode:
        common_args.append("--enforce-eager")
    if runner != "auto":
        common_args.extend(["--runner", runner])
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
        common_args.extend(["--tokenizer-mode", tokenizer_mode])
    if hf_overrides:
        common_args.extend(["--hf-overrides", hf_overrides])

    ep_env = {
        "VLLM_TEST_ENABLE_EP": "1",
    }

    ep_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        distributed_backend,
    ]

    # compare without expert parallelism
    tp_env = {
        "VLLM_TEST_ENABLE_EP": "0",
    }

    tp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        "mp",
    ]

    # The original wrapped this call in a no-op `try/except Exception:
    # raise`, which only re-raised; the wrapper has been removed.
    compare_two_settings(
        model_name,
        ep_args,
        tp_args,
        ep_env,
        tp_env,
        method=method,
        max_wait_seconds=360,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("model_name", "parallel_setup", "distributed_backend", "runner", "test_options"),
    [
        params
        for model_name, settings in TEST_MODELS.items()
        for params in settings.iter_params(model_name)
    ],
)
@create_new_process_for_each_test()
def test_ep(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    runner: RunnerOption,
    test_options: EPTestOptions,
    num_gpus_available,
):
    """Compare generation with expert parallelism enabled vs. plain TP for
    each (model, parallel setup, backend) combination from TEST_MODELS."""
    _compare_tp(
        model_name,
        parallel_setup,
        distributed_backend,
        runner,
        test_options,
        num_gpus_available,
        method="generate",
    )
|
||||
244
tests/distributed/test_expert_placement.py
Normal file
244
tests/distributed/test_expert_placement.py
Normal file
@@ -0,0 +1,244 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.layer import determine_expert_map
|
||||
|
||||
|
||||
def verify_round_robin_pattern(expert_map, ep_rank, ep_size, global_num_experts):
    """Verify that the expert map follows the round_robin pattern."""
    # Experts hosted on this rank; when the count is not divisible, the
    # first (global % size) ranks each absorb one extra expert.
    quotient, leftover = divmod(global_num_experts, ep_size)
    local_num_experts = quotient + (1 if ep_rank < leftover else 0)

    # Under round-robin, rank r owns global experts r, r+size, r+2*size, ...
    owned = [ep_rank + k * ep_size for k in range(local_num_experts)]
    local_of = {global_id: local_id for local_id, global_id in enumerate(owned)}

    # Every global expert either maps to its expected local slot on this
    # rank, or is marked absent with -1.
    for global_id in range(global_num_experts):
        if global_id in local_of:
            expected_local = local_of[global_id]
            assert expert_map[global_id] == expected_local, (
                f"Global expert {global_id} should map to local expert "
                f"{expected_local}, got {expert_map[global_id]}"
            )
        else:
            assert expert_map[global_id] == -1, (
                f"Global expert {global_id} should not be mapped to this rank"
            )

    # The owned experts must be numbered 0..local_num_experts-1 in order.
    actual_locals = [expert_map[g] for g in owned]
    expected_locals = list(range(local_num_experts))
    assert actual_locals == expected_locals, (
        f"Expected local expert IDs {expected_locals}, got {actual_locals}"
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("expert_placement_strategy", ["round_robin"])
@pytest.mark.parametrize("world_size", [2, 4])
def test_expert_placement_various_sizes(expert_placement_strategy, world_size):
    """Test round_robin expert placement with various expert counts."""

    # Test with different global_num_experts values
    # Include both divisible and non-divisible cases
    if world_size == 2:
        test_cases = [
            (4, 2),  # 4 experts (divisible)
            (8, 2),  # 8 experts (divisible)
            (9, 2),  # 9 experts (non-divisible)
            (16, 2),  # 16 experts (divisible)
            (17, 2),  # 17 experts (non-divisible)
        ]
    elif world_size == 4:
        test_cases = [
            (8, 4),  # 8 experts (divisible)
            (16, 4),  # 16 experts (divisible)
            (18, 4),  # 18 experts (non-divisible)
            (32, 4),  # 32 experts (divisible)
            (33, 4),  # 33 experts (non-divisible)
        ]
    else:
        test_cases = []

    for test_global_experts, test_ep_size in test_cases:
        # Ensure ep_size matches world_size
        assert test_ep_size == world_size, (
            f"ep_size {test_ep_size} must equal world_size {world_size}"
        )

        # Test each rank
        for ep_rank in range(world_size):
            # Calculate expected local experts: the first `remainder` ranks
            # absorb one extra expert when the division is uneven.
            base_experts = test_global_experts // test_ep_size
            remainder = test_global_experts % test_ep_size
            if ep_rank < remainder:
                expected_test_local = base_experts + 1
            else:
                expected_test_local = base_experts

            test_local_experts, test_expert_map, _ = determine_expert_map(
                ep_size=test_ep_size,
                ep_rank=ep_rank,
                global_num_experts=test_global_experts,
                expert_placement_strategy=expert_placement_strategy,
            )

            # NOTE(review): the adjacent f-strings below concatenate to
            # "localexperts" (missing space in the failure message).
            assert test_local_experts == expected_test_local, (
                f"For {test_global_experts} experts on {test_ep_size} ranks, "
                f"rank {ep_rank}: expected {expected_test_local} local"
                f"experts, got {test_local_experts}"
            )

            if test_expert_map is not None:
                # One entry per global expert, indexed by global expert id.
                assert test_expert_map.shape == (test_global_experts,), (
                    f"Expected expert map shape ({test_global_experts},), "
                    f"got {test_expert_map.shape}"
                )

                # Verify round_robin pattern for this test case
                verify_round_robin_pattern(
                    test_expert_map, ep_rank, test_ep_size, test_global_experts
                )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("expert_placement_strategy", ["round_robin"])
@pytest.mark.parametrize("world_size", [2, 4])
def test_expert_placement_edge_cases(expert_placement_strategy, world_size):
    """Test edge cases for round_robin expert placement."""

    # Case 1: a single-rank group gets every expert and no map at all.
    num_local, mapping, _ = determine_expert_map(
        ep_size=1,
        ep_rank=0,
        global_num_experts=8,
        expert_placement_strategy=expert_placement_strategy,
    )
    assert num_local == 8, "For ep_size=1, should get all experts"
    assert mapping is None, "For ep_size=1, expert_map should be None"

    # Case 2: a zero-sized expert-parallel group is rejected outright.
    with pytest.raises(AssertionError):
        determine_expert_map(
            ep_size=0,
            ep_rank=0,
            global_num_experts=8,
            expert_placement_strategy=expert_placement_strategy,
        )
|
||||
|
||||
|
||||
def test_determine_expert_map_comprehensive():
    """Test of determine_expert_map function with various configurations."""

    # Test cases: (ep_size, ep_rank, global_num_experts,
    # expert_placement_strategy, expected_local, expected_map_pattern)
    # where expected_map_pattern[g] is the local id of global expert g on
    # this rank, or -1 when the expert lives on another rank.
    test_cases = [
        # Round robin placement tests
        (
            2,
            0,
            8,
            "round_robin",
            4,
            [0, -1, 1, -1, 2, -1, 3, -1],
        ),  # rank 0 gets even experts
        (
            2,
            1,
            8,
            "round_robin",
            4,
            [-1, 0, -1, 1, -1, 2, -1, 3],
        ),  # rank 1 gets odd experts
        (
            2,
            0,
            9,
            "round_robin",
            5,
            [0, -1, 1, -1, 2, -1, 3, -1, 4],
        ),  # rank 0 gets 5 experts (even + last)
        (
            2,
            1,
            9,
            "round_robin",
            4,
            [-1, 0, -1, 1, -1, 2, -1, 3, -1],
        ),  # rank 1 gets 4 experts (odd)
        # 4-rank tests
        (
            4,
            0,
            8,
            "round_robin",
            2,
            [0, -1, -1, -1, 1, -1, -1, -1],
        ),  # rank 0 gets experts 0, 4
        (
            4,
            1,
            8,
            "round_robin",
            2,
            [-1, 0, -1, -1, -1, 1, -1, -1],
        ),  # rank 1 gets experts 1, 5
        (
            4,
            2,
            8,
            "round_robin",
            2,
            [-1, -1, 0, -1, -1, -1, 1, -1],
        ),  # rank 2 gets experts 2, 6
        (
            4,
            3,
            8,
            "round_robin",
            2,
            [-1, -1, -1, 0, -1, -1, -1, 1],
        ),  # rank 3 gets experts 3, 7
    ]

    for (
        ep_size,
        ep_rank,
        global_num_experts,
        expert_placement_strategy,
        expected_local,
        expected_map_pattern,
    ) in test_cases:
        local_num_experts, expert_map, _ = determine_expert_map(
            ep_size=ep_size,
            ep_rank=ep_rank,
            global_num_experts=global_num_experts,
            expert_placement_strategy=expert_placement_strategy,
        )

        assert local_num_experts == expected_local, (
            f"ep_size={ep_size}, ep_rank={ep_rank}, "
            f"global_num_experts={global_num_experts}, "
            f"expert_placement_strategy={expert_placement_strategy}: "
            f"expected {expected_local} local experts, got {local_num_experts}"
        )

        # A None pattern means the function should not build a map at all
        # (no case above uses it, but the branch is kept for completeness).
        if expected_map_pattern is None:
            assert expert_map is None, "Expected expert_map to be None"
        else:
            assert expert_map is not None, "Expected expert_map to not be None"
            actual_map = expert_map.tolist()
            assert actual_map == expected_map_pattern, (
                f"ep_size={ep_size}, ep_rank={ep_rank}, "
                f"global_num_experts={global_num_experts}, "
                f"expert_placement_strategy={expert_placement_strategy}: "
                f"expected map {expected_map_pattern}, got {actual_map}"
            )
|
||||
78
tests/distributed/test_kvlayout.py
Normal file
78
tests/distributed/test_kvlayout.py
Normal file
@@ -0,0 +1,78 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm.config import (
|
||||
DeviceConfig,
|
||||
KVTransferConfig,
|
||||
ModelConfig,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
from vllm.distributed.kv_transfer.kv_connector.utils import (
|
||||
get_kv_connector_cache_layout,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
|
||||
# Fix: the logger was named "test_expert_parallel" — an apparent copy-paste
# from that module — in a file that tests KV cache layout; name it after
# this module instead.
logger = init_logger("test_kvlayout")
|
||||
|
||||
|
||||
def test_get_kv_connector_cache_layout_without_kv_connector():
    """With no KV connector configured, the layout is the default "NHD"."""
    cfg = VllmConfig(device_config=DeviceConfig("cpu"))
    with set_current_vllm_config(cfg):
        assert get_kv_connector_cache_layout() == "NHD"
|
||||
|
||||
|
||||
def test_get_kv_connector_cache_layout_with_lmcache_connector():
    """With the LMCache connector, the layout stays "NHD"."""
    transfer_cfg = KVTransferConfig(
        kv_connector="LMCacheConnectorV1",
        kv_role="kv_both",
    )
    cfg = VllmConfig(
        device_config=DeviceConfig("cpu"), kv_transfer_config=transfer_cfg
    )
    with set_current_vllm_config(cfg):
        assert get_kv_connector_cache_layout() == "NHD"
|
||||
|
||||
|
||||
def test_get_kv_connector_cache_layout_with_nixl_connector():
    """With the Nixl connector, the expected layout is "HND"."""
    transfer_cfg = KVTransferConfig(
        kv_connector="NixlConnector",
        kv_role="kv_both",
    )
    cfg = VllmConfig(
        device_config=DeviceConfig("cpu"),
        model_config=ModelConfig(),
        kv_transfer_config=transfer_cfg,
    )
    with set_current_vllm_config(cfg):
        assert get_kv_connector_cache_layout() == "HND"
|
||||
|
||||
|
||||
def test_get_kv_connector_cache_layout_with_multi_connector():
    """With a MultiConnector whose members include Nixl, the expected
    layout is "HND"."""
    transfer_cfg = KVTransferConfig(
        kv_connector="MultiConnector",
        kv_role="kv_both",
        kv_connector_extra_config={
            "connectors": [
                {"kv_connector": "ExampleConnector", "kv_role": "kv_both"},
                {"kv_connector": "NixlConnector", "kv_role": "kv_both"},
            ]
        },
    )
    cfg = VllmConfig(
        device_config=DeviceConfig("cpu"),
        model_config=ModelConfig(),
        kv_transfer_config=transfer_cfg,
    )
    with set_current_vllm_config(cfg):
        assert get_kv_connector_cache_layout() == "HND"
|
||||
64
tests/distributed/test_multi_node_assignment.py
Normal file
64
tests/distributed/test_multi_node_assignment.py
Normal file
@@ -0,0 +1,64 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Make sure ray assigns GPU workers to the correct node.
|
||||
|
||||
Run:
|
||||
```sh
|
||||
cd $VLLM_PATH/tests
|
||||
|
||||
pytest distributed/test_multi_node_assignment.py
|
||||
```
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
|
||||
|
||||
from vllm import initialize_ray_cluster
|
||||
from vllm.config import ParallelConfig
|
||||
from vllm.utils.network_utils import get_ip
|
||||
from vllm.v1.executor.ray_utils import _wait_until_pg_removed
|
||||
|
||||
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not VLLM_MULTI_NODE, reason="Need at least 2 nodes to run the test."
)
def test_multi_node_assignment() -> None:
    """Verify ray schedules GPU workers onto the driver's own node.

    Repeats cluster setup/teardown 10 times to catch nondeterministic
    placement-group assignments.
    """

    # NOTE: important to keep this class definition here
    # to let ray use cloudpickle to serialize it.
    class Actor:
        def get_ip(self):
            return get_ip()

    for _ in range(10):
        # ParallelConfig(1, 2): pipeline_parallel=1, tensor_parallel=2
        # -- TODO confirm positional meaning against ParallelConfig.
        config = ParallelConfig(1, 2)
        initialize_ray_cluster(config)

        current_ip = get_ip()
        workers = []
        # Launch one actor per GPU bundle of the placement group.
        for bundle_id, bundle in enumerate(config.placement_group.bundle_specs):
            # Skip CPU-only bundles; only GPU bundles host workers.
            if not bundle.get("GPU", 0):
                continue
            scheduling_strategy = PlacementGroupSchedulingStrategy(
                placement_group=config.placement_group,
                placement_group_capture_child_tasks=True,
                placement_group_bundle_index=bundle_id,
            )

            worker = ray.remote(
                num_cpus=0,
                num_gpus=1,
                scheduling_strategy=scheduling_strategy,
            )(Actor).remote()
            worker_ip = ray.get(worker.get_ip.remote())
            # The whole point of the test: the worker must land on the
            # same node as the driver.
            assert worker_ip == current_ip
            workers.append(worker)

        # Tear down before the next iteration so the placement group can
        # be recreated from scratch.
        for worker in workers:
            ray.kill(worker)

        _wait_until_pg_removed(config.placement_group)
437
tests/distributed/test_multiproc_executor.py
Normal file
437
tests/distributed/test_multiproc_executor.py
Normal file
@@ -0,0 +1,437 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""
|
||||
Integration tests for MultiprocExecutor at the executor level.
|
||||
This test directly tests the executor without going through the LLM interface,
|
||||
focusing on executor initialization, RPC calls, and distributed execution.
|
||||
"""
|
||||
|
||||
import multiprocessing
|
||||
import os
|
||||
|
||||
from tests.utils import multi_gpu_test
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.utils import get_open_port
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.executor.multiproc_executor import MultiprocExecutor
|
||||
|
||||
MODEL = "facebook/opt-125m"
|
||||
|
||||
|
||||
def create_vllm_config(
    tensor_parallel_size: int = 1,
    pipeline_parallel_size: int = 1,
    max_model_len: int = 256,
    gpu_memory_utilization: float = 0.3,
    distributed_executor_backend: str = "mp",
    nnodes: int = 1,
    node_rank: int = 0,
    master_port: int = 0,
) -> VllmConfig:
    """Build a VllmConfig for executor tests via EngineArgs.

    Multi-node fields (nnodes/node_rank/master_port) are patched onto the
    resulting parallel config only when a non-default topology is requested.
    """
    config = EngineArgs(
        model=MODEL,
        tensor_parallel_size=tensor_parallel_size,
        pipeline_parallel_size=pipeline_parallel_size,
        max_model_len=max_model_len,
        gpu_memory_utilization=gpu_memory_utilization,
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=True,
    ).create_engine_config()

    if nnodes > 1 or node_rank > 0:
        parallel = config.parallel_config
        parallel.nnodes = nnodes
        parallel.node_rank = node_rank
        parallel.master_port = master_port
        if nnodes > 1:
            # Custom all-reduce is single-node only; disable it across nodes.
            parallel.disable_custom_all_reduce = True

    return config
|
||||
|
||||
def create_test_scheduler_output(num_requests: int = 1) -> SchedulerOutput:
    """Build a minimal, empty SchedulerOutput for executor smoke tests.

    Simplified stand-in; real outputs come from the v1 scheduler, so only
    the fields required by the constructor are populated.
    """
    empty_fields = dict(
        scheduled_new_reqs=[],
        scheduled_resumed_reqs=[],
        scheduled_running_reqs=[],
        num_scheduled_tokens={},
        total_num_scheduled_tokens=0,
    )
    return SchedulerOutput(**empty_fields)
||||
|
||||
|
||||
def test_multiproc_executor_initialization():
    """Test that MultiprocExecutor can be initialized with proper config.

    Verifies world size, local world size, and worker count for a single
    GPU setup.
    """
    vllm_config = create_vllm_config(
        tensor_parallel_size=1,
        pipeline_parallel_size=1,
    )

    # Create executor - this should initialize workers
    executor = MultiprocExecutor(vllm_config=vllm_config)

    try:
        # Verify executor properties
        assert executor.world_size == 1, "World size should be 1 for single GPU"
        assert executor.local_world_size == 1, "Local world size should be 1"
        assert hasattr(executor, "workers"), "Executor should have workers"
        assert len(executor.workers) == 1, "Should have 1 worker for single GPU"
    finally:
        # Clean up even when an assertion fails, so worker processes are
        # not leaked into subsequent tests (matches the sibling tests'
        # try/finally pattern).
        executor.shutdown()
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_initialization_tensor_parallel():
    """Test MultiprocExecutor initialization with tensor parallelism.

    Checks world/local world size, worker count and output-rank
    computation for a TP=2, PP=1 setup.
    """
    vllm_config = create_vllm_config(
        tensor_parallel_size=2,
        pipeline_parallel_size=1,
    )

    # Create executor
    executor = MultiprocExecutor(vllm_config=vllm_config)

    try:
        # Verify executor properties
        assert executor.world_size == 2, "World size should be 2 for TP=2"
        assert executor.local_world_size == 2, "Local world size should be 2"
        assert len(executor.workers) == 2, "Should have 2 workers for TP=2"

        # Verify output rank calculation
        output_rank = executor._get_output_rank()
        assert output_rank == 0, "Output rank should be 0 for TP=2, PP=1"
    finally:
        # Clean up even when an assertion fails, so the two worker
        # processes are not leaked into subsequent tests.
        executor.shutdown()
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_collective_rpc():
    """Collective RPC fans out to every worker without raising."""
    executor = MultiprocExecutor(
        vllm_config=create_vllm_config(
            tensor_parallel_size=2,
            pipeline_parallel_size=1,
        )
    )

    try:
        # check_health is itself a collective RPC; a worker-side error
        # would surface here as an exception.
        executor.check_health()

        # Only the RPC plumbing is under test, not model execution.
        assert not executor.is_failed, "Executor should not be in failed state"
    finally:
        executor.shutdown()
||||
|
||||
|
||||
def test_multiproc_executor_failure_callback():
    """Failure callbacks are deferred while healthy, immediate once failed."""
    executor = MultiprocExecutor(
        vllm_config=create_vllm_config(
            tensor_parallel_size=1,
            pipeline_parallel_size=1,
        )
    )

    try:
        invocations: list[bool] = []

        def record_failure():
            invocations.append(True)

        # Registering on a healthy executor must not fire the callback.
        executor.register_failure_callback(record_failure)
        assert len(invocations) == 0, "Callback should not be invoked immediately"

        # Force the executor into the failed state by hand.
        executor.is_failed = True

        # A registration on an already-failed executor fires right away.
        executor.register_failure_callback(record_failure)
        assert len(invocations) == 1, (
            "Callback should be invoked when executor is failed"
        )
    finally:
        executor.shutdown()
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_worker_monitor():
    """Workers stay alive while running and terminate after shutdown."""
    import time

    executor = MultiprocExecutor(
        vllm_config=create_vllm_config(
            tensor_parallel_size=2,
            pipeline_parallel_size=1,
        )
    )

    try:
        # Every spawned worker process must be running.
        for worker in executor.workers:
            assert worker.proc.is_alive(), f"Worker rank {worker.rank} should be alive"

        assert not executor.is_failed, "Executor should not be in failed state"
    finally:
        executor.shutdown()

    # Give the processes a moment to exit, then confirm they are gone.
    time.sleep(0.5)
    for worker in executor.workers:
        assert not worker.proc.is_alive(), (
            f"Worker rank {worker.rank} should terminate after shutdown"
        )
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_get_response_message_queues():
    """Message-queue lookup works for all ranks and for a single rank."""
    executor = MultiprocExecutor(
        vllm_config=create_vllm_config(
            tensor_parallel_size=2,
            pipeline_parallel_size=1,
        )
    )

    try:
        # No rank filter: one queue per worker.
        all_queues = executor.get_response_mqs()
        assert len(all_queues) == 2, "Should have 2 message queues for 2 workers"

        # Rank filter: exactly one queue per requested rank.
        for rank in (0, 1):
            mqs = executor.get_response_mqs(unique_reply_rank=rank)
            assert len(mqs) == 1, f"Should have 1 message queue for rank {rank}"
    finally:
        executor.shutdown()
||||
|
||||
|
||||
def test_multiproc_executor_shutdown_cleanup():
    """shutdown() terminates workers, sets its event, and is idempotent."""
    import time

    executor = MultiprocExecutor(
        vllm_config=create_vllm_config(
            tensor_parallel_size=1,
            pipeline_parallel_size=1,
        )
    )

    # Sanity-check setup before tearing down.
    assert hasattr(executor, "workers"), "Executor should have workers"
    assert len(executor.workers) > 0, "Should have at least one worker"

    executor.shutdown()

    # Allow the worker processes a moment to exit.
    time.sleep(0.5)

    for worker in executor.workers:
        assert not worker.proc.is_alive(), "Worker processes should be terminated"

    assert executor.shutdown_event.is_set(), "Shutdown event should be set"

    # Repeated shutdowns must be harmless.
    executor.shutdown()
    executor.shutdown()
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
def test_multiproc_executor_pipeline_parallel():
    """TP=2 x PP=2 wiring: world size, output rank, batch concurrency."""
    executor = MultiprocExecutor(
        vllm_config=create_vllm_config(
            tensor_parallel_size=2,
            pipeline_parallel_size=2,
        )
    )

    try:
        assert executor.world_size == 4, "World size should be 4 for TP=2, PP=2"
        assert len(executor.workers) == 4, "Should have 4 workers"

        # Output comes from the last PP stage (ranks 2-3), specifically
        # the first rank of that stage.
        assert executor._get_output_rank() == 2, (
            "Output rank should be 2 (first rank of last PP stage)"
        )

        # Pipeline parallelism allows one in-flight batch per stage.
        assert executor.max_concurrent_batches == 2, (
            "Max concurrent batches should equal PP size"
        )
    finally:
        executor.shutdown()
||||
|
||||
|
||||
def test_multiproc_executor_properties():
    """Class-level and derived executor properties are self-consistent."""
    executor = MultiprocExecutor(
        vllm_config=create_vllm_config(
            tensor_parallel_size=1,
            pipeline_parallel_size=1,
        )
    )

    try:
        # PP support is a class attribute, not per-instance state.
        assert MultiprocExecutor.supports_pp is True, (
            "MultiprocExecutor should support pipeline parallelism"
        )

        parallel = executor.parallel_config

        # world_size is derived from the parallel topology.
        expected_world = (
            parallel.tensor_parallel_size * parallel.pipeline_parallel_size
        )
        assert executor.world_size == expected_world, "World size should equal TP * PP"

        # local_world_size divides the world evenly across nodes.
        assert executor.local_world_size == (
            parallel.world_size // parallel.nnodes
        ), "Local world size should be world_size / nnodes"
    finally:
        executor.shutdown()
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
def test_multiproc_executor_multi_node():
    """
    Test MultiprocExecutor with multi-node configuration.

    This simulates 2 nodes with TP=4:
    - Node 0 (rank 0): Uses GPUs 0,1 (CUDA_VISIBLE_DEVICES=0,1) with TP=2
    - Node 1 (rank 1): Uses GPUs 2,3 (CUDA_VISIBLE_DEVICES=2,3) with TP=2
    Total world_size = 4, nnodes = 2
    """
    import time

    port = get_open_port()
    # symm_mem does not work for simulating multi instance in single node
    os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"

    def run_node(node_rank: int, result_queue: multiprocessing.Queue, port: int):
        """Run one simulated node's executor; report success via the queue."""
        executor = None
        try:
            # Each simulated node sees only its own pair of GPUs.
            if node_rank == 0:
                os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
            else:
                os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"

            # Create config for this node
            vllm_config = create_vllm_config(
                tensor_parallel_size=4,  # Total TP across all nodes
                pipeline_parallel_size=1,
                nnodes=2,  # 2 nodes
                node_rank=node_rank,
                master_port=port,  # both nodes rendezvous on the same port
            )

            # Create executor for this node
            executor = MultiprocExecutor(vllm_config=vllm_config)

            # Global vs. local topology for this node.
            assert executor.world_size == 4, (
                f"World size should be 4 on node {node_rank}"
            )
            assert executor.local_world_size == 2, (
                f"Local world size should be 2 on node {node_rank}"
            )
            assert len(executor.workers) == 2, (
                f"Should have 2 local workers on node {node_rank}"
            )

            # Node 0 owns global ranks 0-1, node 1 owns ranks 2-3.
            expected_ranks = [node_rank * 2, node_rank * 2 + 1]
            actual_ranks = sorted([w.rank for w in executor.workers])
            assert actual_ranks == expected_ranks, (
                f"Node {node_rank} should have workers "
                f"with ranks {expected_ranks}, got {actual_ranks}"
            )
            # Verify all workers are alive
            for worker in executor.workers:
                assert worker.proc.is_alive(), (
                    f"Worker rank {worker.rank} should be alive on node {node_rank}"
                )
            # Put success result in queue BEFORE shutdown to avoid hanging
            result_queue.put({"node": node_rank, "success": True})
            # Keep the executor up briefly so the peer node can finish its
            # own checks before the process group is torn down.
            time.sleep(2)
        except Exception as e:
            # Report the failure, then re-raise with the original traceback
            # (bare raise, not `raise e`).
            result_queue.put({"node": node_rank, "success": False, "error": str(e)})
            raise
        finally:
            # shutdown() is idempotent and safe after partial initialization.
            if executor is not None:
                executor.shutdown()

    # Queue used by both child processes to report their outcome.
    result_queue: multiprocessing.Queue[dict[str, int | bool]] = multiprocessing.Queue()

    # Start both node processes
    processes = []
    for node_rank in range(2):
        p = multiprocessing.Process(
            target=run_node,
            args=(node_rank, result_queue, port),
            name=f"Node{node_rank}",
        )
        p.start()
        processes.append(p)

    # Wait for both processes, escalating terminate -> kill on timeout.
    all_completed = True
    for p in processes:
        p.join(timeout=60)
        if p.is_alive():
            p.terminate()
            p.join(timeout=20)
            if p.is_alive():
                p.kill()
                p.join()
            all_completed = False

    # Drain the queue. Stop on the first empty read instead of retrying
    # forever: if a node died without reporting, the old
    # `except: pass` loop would spin indefinitely and hang the test.
    results: list[dict[str, int | bool]] = []
    while len(results) < 2:
        try:
            results.append(result_queue.get(timeout=1))
        except Exception:
            break

    assert all_completed, "Not all processes completed successfully"
    assert len(results) == 2, f"Expected 2 results, got {len(results)}"
    # Queue arrival order is not deterministic across producers; sort by
    # node rank so the failure messages name the correct node.
    results.sort(key=lambda r: r["node"])
    assert results[0]["success"], f"Node 0 failed: {results[0]}"
    assert results[1]["success"], f"Node 1 failed: {results[1]}"
||||
94
tests/distributed/test_nccl_symm_mem_allreduce.py
Normal file
94
tests/distributed/test_nccl_symm_mem_allreduce.py
Normal file
@@ -0,0 +1,94 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import random
|
||||
import typing
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.multiprocessing as mp
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.distributed.device_communicators.cuda_communicator import CudaCommunicator
|
||||
from vllm.distributed.device_communicators.pynccl import register_nccl_symmetric_ops
|
||||
from vllm.distributed.device_communicators.pynccl_allocator import (
|
||||
get_nccl_mem_pool,
|
||||
is_symmetric_memory_enabled,
|
||||
)
|
||||
from vllm.distributed.parallel_state import (
|
||||
get_tp_group,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.system_utils import update_environment_variables
|
||||
|
||||
torch.manual_seed(42)
|
||||
random.seed(44)
|
||||
|
||||
test_size_elements = 4 * 1024 * 1024
|
||||
|
||||
|
||||
def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int):
    """Per-rank worker comparing symmetric-memory allreduce to the reference.

    Spawned once per rank via torch.multiprocessing: sets up the
    distributed environment, registers the NCCL symmetric ops, then checks
    that the custom allreduce matches torch.distributed's result on random
    bf16 data.
    """
    monkeypatch = pytest.MonkeyPatch()
    with monkeypatch.context() as m:
        # Ensure device selection follows local_rank rather than any
        # inherited CUDA_VISIBLE_DEVICES mask.
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        dtype = torch.bfloat16
        device = torch.device(f"cuda:{local_rank}")
        torch.cuda.set_device(device)
        torch.set_default_device(device)
        torch.set_default_dtype(dtype)
        # Rendezvous settings consumed by init_distributed_environment().
        update_environment_variables(
            {
                "RANK": str(local_rank),
                "LOCAL_RANK": str(local_rank),
                "WORLD_SIZE": str(world_size),
                "MASTER_ADDR": "localhost",
                "MASTER_PORT": "12345",
            }
        )

        init_distributed_environment()
        initialize_model_parallel(tensor_model_parallel_size=world_size)

        cuda_communicator = typing.cast(
            CudaCommunicator, get_tp_group().device_communicator
        )
        pynccl_comm = cuda_communicator.pynccl_comm
        # Skip (not fail) when the environment cannot support the feature.
        if get_nccl_mem_pool() is None:
            pytest.skip(
                "NCCL allocator compilation failed (probably missing NCCL headers)."
            )
        if not is_symmetric_memory_enabled():
            pytest.skip("NCCL symmetric memory allreduce is disabled.")

        register_nccl_symmetric_ops(pynccl_comm)
        # `inp` rather than `input`: avoid shadowing the builtin.
        inp = torch.randint(1, 23, (test_size_elements,), dtype=dtype, device=device)
        inp_clone = inp.clone()
        output = torch.ops.vllm.all_reduce_symmetric_with_copy(inp)
        assert output is not None

        # Reference result from the regular NCCL allreduce on the TP group.
        group = get_tp_group().device_group
        dist.all_reduce(inp_clone, group=group)
        # Loose tolerances: bf16 reduction order differs between the paths.
        torch.testing.assert_close(output, inp_clone, atol=2.5, rtol=0.1)
||||
|
||||
|
||||
@pytest.mark.skipif(
    not current_platform.is_cuda(),
    reason="NCCLSymmMemAllreduce is only available for CUDA platforms.",
)
@pytest.mark.parametrize("world_size", [2])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
def test_nccl_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, world_size):
    """Spawn one worker per rank and run the symmetric-memory allreduce check."""
    if world_size > torch.cuda.device_count():
        pytest.skip("Not enough GPUs to run the test.")

    # Opt in to SymmMemCommunicator and the NCCL features it relies on.
    symm_mem_env = {
        "VLLM_USE_NCCL_SYMM_MEM": "1",
        "NCCL_NVLS_ENABLE": "1",
        "NCCL_CUMEM_ENABLE": "1",
    }
    for name, value in symm_mem_env.items():
        monkeypatch.setenv(name, value)

    mp.spawn(nccl_symm_mem_allreduce_worker, args=(world_size,), nprocs=world_size)
    cleanup_dist_env_and_memory()
||||
46
tests/distributed/test_node_count.py
Normal file
46
tests/distributed/test_node_count.py
Normal file
@@ -0,0 +1,46 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
|
||||
import torch.distributed as dist
|
||||
|
||||
from vllm.distributed.parallel_state import _node_count
|
||||
from vllm.distributed.utils import StatelessProcessGroup
|
||||
from vllm.utils.network_utils import get_ip, get_open_port
|
||||
|
||||
if __name__ == "__main__":
    # Launched under torchrun/mpirun: every rank runs this script and
    # joins the gloo process group.
    dist.init_process_group(backend="gloo")

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # Rank 0 picks the rendezvous address and broadcasts it; all other
    # ranks receive it through the same collective call.
    if rank == 0:
        port = get_open_port()
        ip = get_ip()
        dist.broadcast_object_list([ip, port], src=0)
    else:
        recv = [None, None]
        dist.broadcast_object_list(recv, src=0)
        ip, port = recv

    # Second, torch-independent group built from the broadcast address.
    stateless_pg = StatelessProcessGroup.create(ip, port, rank, world_size)

    # _node_count must agree for both group implementations.
    for pg in [dist.group.WORLD, stateless_pg]:
        test_result = _node_count(pg)

        # Expected node count based on environment variable
        expected = int(os.environ.get("NUM_NODES", "1"))

        assert test_result == expected, f"Expected {expected} nodes, got {test_result}"

        if pg == dist.group.WORLD:
            print(
                f"Node count test passed! Got {test_result} nodes "
                f"when using torch distributed!"
            )
        else:
            print(
                f"Node count test passed! Got {test_result} nodes "
                f"when using StatelessProcessGroup!"
            )
||||
442
tests/distributed/test_pipeline_parallel.py
Normal file
442
tests/distributed/test_pipeline_parallel.py
Normal file
@@ -0,0 +1,442 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
WARNING: This test runs in both single-node (4 GPUs) and multi-node
|
||||
(2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
|
||||
important to set the distributed backend to "mp" to avoid Ray scheduling
|
||||
all workers in a node other than the head node, which can cause the test
|
||||
to fail.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, NamedTuple
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config.model import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.config import get_config
|
||||
|
||||
from ..models.registry import HF_EXAMPLE_MODELS
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
|
||||
logger = init_logger("test_pipeline_parallel")
|
||||
|
||||
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
|
||||
|
||||
|
||||
class ParallelSetup(NamedTuple):
    """One TP/PP/eager combination to run a model under."""

    # Tensor-parallel degree.
    tp_size: int
    # Pipeline-parallel degree.
    pp_size: int
    # Whether to force eager execution for this combination.
    eager_mode: bool
||||
|
||||
|
||||
class PPTestOptions(NamedTuple):
    """Per-model switches shared by all parallel setups."""

    # Only run this model when VLLM_MULTI_NODE is set.
    multi_node_only: bool
    # Weight-loading format override (e.g. "dummy" to avoid OOM); None keeps
    # the default loader.
    load_format: str | None = None
||||
|
||||
|
||||
@dataclass
class PPTestSettings:
    """Parameter matrix describing one model's pipeline-parallel tests."""

    parallel_setups: list[ParallelSetup]
    distributed_backends: list[str]
    runner: RunnerOption
    test_options: PPTestOptions

    @staticmethod
    def detailed(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        multi_node_only: bool = False,
        runner: RunnerOption = "auto",
        load_format: str | None = None,
    ):
        """Thorough matrix: several TP/PP combinations on both backends."""
        setups = [
            ParallelSetup(tp_size=tp_base, pp_size=pp_base, eager_mode=False),
            ParallelSetup(tp_size=tp_base, pp_size=2 * pp_base, eager_mode=False),
            ParallelSetup(tp_size=tp_base, pp_size=2 * pp_base, eager_mode=True),
            ParallelSetup(tp_size=2 * tp_base, pp_size=pp_base, eager_mode=False),
            ParallelSetup(tp_size=2 * tp_base, pp_size=pp_base, eager_mode=True),
        ]
        opts = PPTestOptions(multi_node_only=multi_node_only, load_format=load_format)
        return PPTestSettings(
            parallel_setups=setups,
            distributed_backends=["mp", "ray"],
            runner=runner,
            test_options=opts,
        )

    @staticmethod
    def fast(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        runner: RunnerOption = "auto",
        multi_node_only: bool = False,
        load_format: str | None = None,
    ):
        """Minimal matrix: one eager TP/PP combination on "mp" only."""
        opts = PPTestOptions(multi_node_only=multi_node_only, load_format=load_format)
        return PPTestSettings(
            parallel_setups=[
                ParallelSetup(tp_size=tp_base, pp_size=pp_base, eager_mode=True),
            ],
            distributed_backends=["mp"],
            runner=runner,
            test_options=opts,
        )

    def iter_params(self, model_id: str):
        """Yield one pytest parameter tuple per (setup, backend) pair."""
        opts = self.test_options

        yield from (
            (model_id, setup, backend, self.runner, opts)
            for setup in self.parallel_setups
            for backend in self.distributed_backends
        )
||||
|
||||
|
||||
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
# The values displayed here are only a rough indicator of the size of the model

# Registry of text-generation models and the PP test matrix each should run.
TEXT_GENERATION_MODELS = {
    # [Decoder-only]
    # Uses Llama
    # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
    "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"),
    "baichuan-inc/Baichuan-7B": PPTestSettings.fast(),
    "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
    "bigscience/bloomz-1b1": PPTestSettings.fast(),
    "zai-org/chatglm3-6b": PPTestSettings.fast(),
    "CohereLabs/c4ai-command-r-v01": PPTestSettings.fast(load_format="dummy"),
    "databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
    "Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
    "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(tp_base=2),
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
    "tiiuae/falcon-7b": PPTestSettings.fast(),
    "google/gemma-1.1-2b-it": PPTestSettings.fast(),
    "google/gemma-2-9b": PPTestSettings.fast(),
    "gpt2": PPTestSettings.fast(),
    "bigcode/starcoder": PPTestSettings.fast(),
    "EleutherAI/gpt-j-6b": PPTestSettings.fast(),
    "EleutherAI/pythia-1.4b": PPTestSettings.fast(),
    "ibm/PowerLM-3b": PPTestSettings.fast(),
    "ibm/PowerMoE-3b": PPTestSettings.fast(),
    # Uses Llama
    # "internlm/internlm-chat-7b": PPTestSettings.fast(),
    "internlm/internlm2-chat-7b": PPTestSettings.fast(),
    "inceptionai/jais-13b-chat": PPTestSettings.fast(),
    "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
    "pfnet/plamo-2-1b": PPTestSettings.fast(),
    "pfnet/plamo-3-nict-2b-base": PPTestSettings.fast(),
    "meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(),
    # Tests TransformersForCausalLM
    "hmellor/Ilama-3.2-1B": PPTestSettings.fast(),
    "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
    "openbmb/MiniCPM3-4B": PPTestSettings.fast(),
    # Uses Llama
    # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
    "state-spaces/mamba-130m-hf": PPTestSettings.fast(),
    "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"),
    "mosaicml/mpt-7b": PPTestSettings.fast(),
    "nvidia/Minitron-8B-Base": PPTestSettings.fast(),
    "allenai/OLMo-1B-hf": PPTestSettings.fast(),
    "allenai/OLMo-2-0425-1B": PPTestSettings.fast(),
    "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
    "facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
    "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(),
    "adept/persimmon-8b-chat": PPTestSettings.fast(),
    "microsoft/phi-2": PPTestSettings.fast(),
    "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
    "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(
        multi_node_only=True, load_format="dummy"
    ),
    "Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
    "Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
    "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
    "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
    "bigcode/starcoder2-3b": PPTestSettings.fast(),
    "upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"),
    # FIXME: Cannot load tokenizer in latest transformers version.
    # Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
    # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
    # [Encoder-only]
    # TODO: Implement PP
    # "facebook/bart-base": PPTestSettings.fast(),
}

# Pooling/embedding models; all run with the "pooling" runner.
EMBEDDING_MODELS = {  # type: ignore[var-annotated]
    # [Text-only]
    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(runner="pooling"),
    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"),
    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
        load_format="dummy", runner="pooling"
    ),
}

# Multimodal (vision/audio) generation models.
MULTIMODAL_MODELS = {
    # [Decoder-only]
    "Salesforce/blip2-opt-6.7b": PPTestSettings.fast(),
    "facebook/chameleon-7b": PPTestSettings.fast(),
    "adept/fuyu-8b": PPTestSettings.fast(),
    "zai-org/glm-4v-9b": PPTestSettings.fast(),
    "OpenGVLab/InternVL2-1B": PPTestSettings.fast(),
    "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
    "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
    "llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
    "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(),
    "allenai/Molmo-7B-D-0924": PPTestSettings.fast(),
    "AIDC-AI/Ovis2-1B": PPTestSettings.fast(),
    "AIDC-AI/Ovis2.5-2B": PPTestSettings.fast(),
    "microsoft/Phi-3.5-vision-instruct": PPTestSettings.fast(),
    "mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"),
    "Qwen/Qwen-VL-Chat": PPTestSettings.fast(),
    "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
    "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
    "fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
}

# NOTE: You can update this on your local machine to run specific tests
# Subset of model IDs (from the registries above) actually exercised in CI.
TEST_MODELS = [
    # [LANGUAGE GENERATION]
    "microsoft/Phi-3.5-MoE-instruct",
    "meta-llama/Llama-3.2-1B-Instruct",
    "hmellor/Ilama-3.2-1B",
    "ibm/PowerLM-3b",
    "deepseek-ai/DeepSeek-V2-Lite-Chat",
    # [LANGUAGE EMBEDDING]
    "intfloat/e5-mistral-7b-instruct",
    "BAAI/bge-multilingual-gemma2",
    # [MULTIMODAL GENERATION]
    "OpenGVLab/InternVL2-1B",
    "microsoft/Phi-3.5-vision-instruct",
    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
    # [LANGUAGE GENERATION - HYBRID ARCH]
    "ai21labs/Jamba-tiny-dev",
]
||||
|
||||
|
||||
def _compare_tp(
|
||||
model_id: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
runner: RunnerOption,
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available: int,
|
||||
*,
|
||||
method: Literal["generate", "encode"],
|
||||
is_multimodal: bool,
|
||||
):
|
||||
(
|
||||
tp_size,
|
||||
pp_size,
|
||||
eager_mode,
|
||||
) = parallel_setup
|
||||
|
||||
multi_node_only, load_format = test_options
|
||||
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
trust_remote_code = model_info.trust_remote_code
|
||||
tokenizer_mode = model_info.tokenizer_mode
|
||||
hf_overrides = model_info.hf_overrides
|
||||
hf_config = get_config(model_id, trust_remote_code)
|
||||
require_embed_inputs = model_info.require_embed_inputs
|
||||
max_num_seqs = model_info.max_num_seqs
|
||||
|
||||
dtype = "float16"
|
||||
if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
|
||||
dtype = "bfloat16"
|
||||
|
||||
if load_format == "dummy":
|
||||
# Avoid OOM
|
||||
text_overrides = {
|
||||
"num_hidden_layers": 4,
|
||||
"hidden_size": 512,
|
||||
"intermediate_size": 800,
|
||||
"num_attention_heads": 4,
|
||||
"num_key_value_heads": 1,
|
||||
}
|
||||
|
||||
if is_multimodal:
|
||||
hf_overrides.update({"text_config": text_overrides})
|
||||
else:
|
||||
hf_overrides.update(text_overrides)
|
||||
else:
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
|
||||
if num_gpus_available < tp_size * pp_size:
|
||||
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
|
||||
if VLLM_MULTI_NODE and distributed_backend == "mp":
|
||||
pytest.skip(
|
||||
"Skipping multi-node pipeline parallel test for "
|
||||
"multiprocessing distributed backend"
|
||||
)
|
||||
if multi_node_only and not VLLM_MULTI_NODE:
|
||||
pytest.skip("Not in multi-node setting")
|
||||
|
||||
common_args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
dtype,
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"--max-num-seqs",
|
||||
"8",
|
||||
]
|
||||
if eager_mode:
|
||||
common_args.append("--enforce-eager")
|
||||
if runner != "auto":
|
||||
common_args.extend(["--runner", runner])
|
||||
if trust_remote_code:
|
||||
common_args.append("--trust-remote-code")
|
||||
if tokenizer_mode:
|
||||
common_args.extend(["--tokenizer-mode", tokenizer_mode])
|
||||
if load_format:
|
||||
common_args.extend(["--load-format", load_format])
|
||||
if hf_overrides:
|
||||
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
|
||||
if require_embed_inputs:
|
||||
common_args.extend(
|
||||
[
|
||||
"--skip-tokenizer-init",
|
||||
"--enable-prompt-embeds",
|
||||
"--enable-mm-embeds",
|
||||
]
|
||||
)
|
||||
if max_num_seqs:
|
||||
common_args.extend(["--max-num-seqs", f"{max_num_seqs}"])
|
||||
|
||||
if distributed_backend == "ray":
|
||||
# Test Ray Compiled Graph for all the tests
|
||||
pp_env = {
|
||||
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
|
||||
}
|
||||
# Temporary. Currently when zeromq + SPMD is used, it does not properly
|
||||
# terminate because of a Ray Compiled Graph issue.
|
||||
common_args.append("--disable-frontend-multiprocessing")
|
||||
elif distributed_backend == "mp":
|
||||
pp_env = None
|
||||
else:
|
||||
pp_env = None
|
||||
|
||||
tp_env = None
|
||||
|
||||
pp_args = [
|
||||
*common_args,
|
||||
"--pipeline-parallel-size",
|
||||
str(pp_size),
|
||||
"--tensor-parallel-size",
|
||||
str(tp_size),
|
||||
"--distributed-executor-backend",
|
||||
distributed_backend,
|
||||
]
|
||||
|
||||
# compare without pipeline parallelism
|
||||
# NOTE: use mp backend for TP
|
||||
# PP tests might involve multiple nodes, and ray might
|
||||
# schedule all workers in a node other than the head node,
|
||||
# which can cause the test to fail.
|
||||
tp_args = [
|
||||
*common_args,
|
||||
"--tensor-parallel-size",
|
||||
str(tp_size),
|
||||
"--distributed-executor-backend",
|
||||
"mp",
|
||||
]
|
||||
|
||||
compare_two_settings(model_id, pp_args, tp_args, pp_env, tp_env, method=method)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
|
||||
[
|
||||
params
|
||||
for model_id, settings in TEXT_GENERATION_MODELS.items()
|
||||
for params in settings.iter_params(model_id)
|
||||
if model_id in TEST_MODELS
|
||||
],
|
||||
)
|
||||
@create_new_process_for_each_test()
|
||||
def test_tp_language_generation(
|
||||
model_id: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
runner: RunnerOption,
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
_compare_tp(
|
||||
model_id,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate",
|
||||
is_multimodal=False,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
|
||||
[
|
||||
params
|
||||
for model_id, settings in EMBEDDING_MODELS.items()
|
||||
for params in settings.iter_params(model_id)
|
||||
if model_id in TEST_MODELS
|
||||
],
|
||||
)
|
||||
@create_new_process_for_each_test()
|
||||
def test_tp_language_embedding(
|
||||
model_id: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
runner: RunnerOption,
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
_compare_tp(
|
||||
model_id,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="encode",
|
||||
is_multimodal=False,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
|
||||
[
|
||||
params
|
||||
for model_id, settings in MULTIMODAL_MODELS.items()
|
||||
for params in settings.iter_params(model_id)
|
||||
if model_id in TEST_MODELS
|
||||
],
|
||||
)
|
||||
@create_new_process_for_each_test()
|
||||
def test_tp_multimodal_generation(
|
||||
model_id: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
runner: RunnerOption,
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
_compare_tp(
|
||||
model_id,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate",
|
||||
is_multimodal=True,
|
||||
)
|
||||
67
tests/distributed/test_pipeline_partition.py
Normal file
67
tests/distributed/test_pipeline_partition.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.distributed.utils import get_pp_indices
|
||||
|
||||
|
||||
def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as m:
|
||||
|
||||
def _verify(partition_str, num_layers, pp_size, goldens):
|
||||
bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
|
||||
m.setenv("VLLM_PP_LAYER_PARTITION", partition_str)
|
||||
for pp_rank, golden in enumerate(goldens):
|
||||
assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
|
||||
if bak is not None:
|
||||
m.setenv("VLLM_PP_LAYER_PARTITION", bak)
|
||||
|
||||
# Even partition
|
||||
_verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
# Balanced partition
|
||||
_verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
|
||||
# Put reminder somewhere
|
||||
_verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
|
||||
# Invalid partition strings
|
||||
with pytest.raises(ValueError):
|
||||
_verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
with pytest.raises(ValueError):
|
||||
_verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
# Wrong number of partitions
|
||||
with pytest.raises(ValueError):
|
||||
_verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
# Wrong number of layers
|
||||
with pytest.raises(ValueError):
|
||||
_verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"num_hidden_layers,pp_size,pp_rank,indices",
|
||||
[
|
||||
# pp_size 2
|
||||
(2, 2, 0, (0, 1)),
|
||||
(2, 2, 1, (1, 2)),
|
||||
(3, 2, 0, (0, 2)),
|
||||
(3, 2, 1, (2, 3)),
|
||||
# pp_size 3
|
||||
(3, 3, 0, (0, 1)),
|
||||
(3, 3, 1, (1, 2)),
|
||||
(3, 3, 2, (2, 3)),
|
||||
(4, 3, 0, (0, 1)),
|
||||
(4, 3, 1, (1, 3)),
|
||||
(4, 3, 2, (3, 4)),
|
||||
(5, 3, 0, (0, 2)),
|
||||
(5, 3, 1, (2, 4)),
|
||||
(5, 3, 2, (4, 5)),
|
||||
],
|
||||
)
|
||||
def test_uneven_auto_partition(
|
||||
num_hidden_layers: int,
|
||||
pp_size: int,
|
||||
pp_rank: int,
|
||||
indices: tuple[int, int],
|
||||
):
|
||||
assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size)
|
||||
42
tests/distributed/test_pp_cudagraph.py
Normal file
42
tests/distributed/test_pp_cudagraph.py
Normal file
@@ -0,0 +1,42 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
from typing_extensions import LiteralString
|
||||
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"PP_SIZE, MODEL_NAME",
|
||||
[
|
||||
(2, "JackFram/llama-160m"),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"ATTN_BACKEND",
|
||||
[
|
||||
"FLASH_ATTN",
|
||||
],
|
||||
)
|
||||
@create_new_process_for_each_test()
|
||||
def test_pp_cudagraph(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
PP_SIZE: int,
|
||||
MODEL_NAME: str,
|
||||
ATTN_BACKEND: LiteralString,
|
||||
):
|
||||
with monkeypatch.context() as m:
|
||||
cudagraph_args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"float16",
|
||||
"--pipeline-parallel-size",
|
||||
str(PP_SIZE),
|
||||
"--distributed-executor-backend",
|
||||
"mp",
|
||||
]
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND)
|
||||
|
||||
eager_args = cudagraph_args + ["--enforce-eager"]
|
||||
|
||||
compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
|
||||
@@ -1,30 +1,40 @@
|
||||
import multiprocessing
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
|
||||
import multiprocess as mp
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
import torch.distributed
|
||||
|
||||
import vllm.distributed.device_communicators.pymccl_utils as pymccl_utils
|
||||
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
|
||||
from vllm.distributed.device_communicators.pynccl import (NCCLCommunicator,
|
||||
ncclGetUniqueId)
|
||||
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce # noqa
|
||||
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
|
||||
from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary
|
||||
from vllm.distributed.parallel_state import (
|
||||
ensure_model_parallel_initialized, get_tensor_model_parallel_cpu_group,
|
||||
init_distributed_environment, with_pynccl_for_all_reduce)
|
||||
from vllm.utils import update_environment_variables
|
||||
ensure_model_parallel_initialized,
|
||||
get_world_group,
|
||||
graph_capture,
|
||||
init_distributed_environment,
|
||||
)
|
||||
from vllm.utils.system_utils import update_environment_variables
|
||||
|
||||
mp.set_start_method("spawn", force=True)
|
||||
|
||||
|
||||
def distributed_run(fn, world_size):
|
||||
number_of_processes = world_size
|
||||
processes = []
|
||||
processes: list[mp.Process] = []
|
||||
for i in range(number_of_processes):
|
||||
env = {}
|
||||
env['RANK'] = str(i)
|
||||
env['LOCAL_RANK'] = str(i)
|
||||
env['WORLD_SIZE'] = str(number_of_processes)
|
||||
env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
|
||||
env['MASTER_ADDR'] = 'localhost'
|
||||
env['MASTER_PORT'] = '12345'
|
||||
p = multiprocessing.Process(target=fn, args=(env, ))
|
||||
env: dict[str, str] = {}
|
||||
env["RANK"] = str(i)
|
||||
env["LOCAL_RANK"] = str(i)
|
||||
env["WORLD_SIZE"] = str(number_of_processes)
|
||||
env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
|
||||
env["MASTER_ADDR"] = "localhost"
|
||||
env["MASTER_PORT"] = "12345"
|
||||
p = mp.Process(target=fn, args=(env,))
|
||||
processes.append(p)
|
||||
p.start()
|
||||
|
||||
@@ -41,6 +51,9 @@ def worker_fn_wrapper(fn):
|
||||
# and update the environment variables in the function
|
||||
def wrapped_fn(env):
|
||||
update_environment_variables(env)
|
||||
local_rank = os.environ["LOCAL_RANK"]
|
||||
device = torch.device(f"cuda:{local_rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_distributed_environment()
|
||||
fn()
|
||||
|
||||
@@ -49,105 +62,345 @@ def worker_fn_wrapper(fn):
|
||||
|
||||
@worker_fn_wrapper
|
||||
def worker_fn():
|
||||
comm = NCCLCommunicator()
|
||||
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank)
|
||||
comm.all_reduce(tensor)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == comm.world_size
|
||||
pynccl_comm = PyNcclCommunicator(
|
||||
get_world_group().cpu_group, device=get_world_group().device
|
||||
)
|
||||
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank)
|
||||
tensor = pynccl_comm.all_reduce(tensor)
|
||||
torch.cuda.synchronize()
|
||||
assert torch.all(tensor == pynccl_comm.world_size).cpu().item()
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl():
|
||||
distributed_run(worker_fn, 2)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def multiple_tp_worker_fn():
|
||||
def multiple_allreduce_worker_fn():
|
||||
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
|
||||
groups = [
|
||||
torch.distributed.new_group(ranks=[0, 1], backend="gloo"),
|
||||
torch.distributed.new_group(ranks=[2, 3], backend="gloo")
|
||||
torch.distributed.new_group(ranks=[2, 3], backend="gloo"),
|
||||
]
|
||||
group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1]
|
||||
comm = NCCLCommunicator(group=group, device=device)
|
||||
pynccl_comm = PyNcclCommunicator(group=group, device=device)
|
||||
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
|
||||
# two groups can communicate independently
|
||||
if torch.distributed.get_rank() in [0, 1]:
|
||||
comm.all_reduce(tensor)
|
||||
comm.all_reduce(tensor)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == 4
|
||||
tensor = pynccl_comm.all_reduce(tensor)
|
||||
tensor = pynccl_comm.all_reduce(tensor)
|
||||
torch.cuda.synchronize()
|
||||
assert torch.all(tensor == 4).cpu().item()
|
||||
else:
|
||||
comm.all_reduce(tensor)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == 2
|
||||
tensor = pynccl_comm.all_reduce(tensor)
|
||||
torch.cuda.synchronize()
|
||||
assert torch.all(tensor == 2).cpu().item()
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
||||
reason="Need at least 4 GPUs to run the test.")
|
||||
def test_pynccl_multiple_tp():
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl_multiple_allreduce():
|
||||
# this tests pynccl for multiple tp groups, in a standalone way
|
||||
# i.e. call `comm.all_reduce` directly
|
||||
distributed_run(multiple_tp_worker_fn, 4)
|
||||
# i.e. call `pynccl_comm.all_reduce` directly
|
||||
distributed_run(multiple_allreduce_worker_fn, 4)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def multiple_tp_with_vllm_worker_fn():
|
||||
def multiple_allreduce_with_vllm_worker_fn():
|
||||
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
|
||||
torch.cuda.set_device(torch.distributed.get_rank())
|
||||
ensure_model_parallel_initialized(2, 2)
|
||||
pymccl_utils.init_process_group(
|
||||
group=get_tensor_model_parallel_cpu_group())
|
||||
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
|
||||
with with_pynccl_for_all_reduce():
|
||||
with graph_capture(device=device):
|
||||
# two tp groups can communicate independently
|
||||
if torch.distributed.get_rank() in [0, 1]:
|
||||
tensor = tensor_model_parallel_all_reduce(tensor)
|
||||
tensor = tensor_model_parallel_all_reduce(tensor)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == 4
|
||||
torch.cuda.synchronize()
|
||||
assert torch.all(tensor == 4).cpu().item()
|
||||
else:
|
||||
tensor = tensor_model_parallel_all_reduce(tensor)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == 2
|
||||
torch.cuda.synchronize()
|
||||
assert torch.all(tensor == 2).cpu().item()
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
||||
reason="Need at least 4 GPUs to run the test.")
|
||||
def test_pynccl_multiple_tp_with_vllm():
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl_multiple_allreduce_with_vllm():
|
||||
# this tests pynccl for multiple tp groups, together with vllm
|
||||
# i.e. call `tensor_model_parallel_all_reduce`
|
||||
distributed_run(multiple_tp_with_vllm_worker_fn, 4)
|
||||
distributed_run(multiple_allreduce_with_vllm_worker_fn, 4)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def worker_fn_with_cudagraph():
|
||||
with torch.no_grad():
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
comm = NCCLCommunicator()
|
||||
pynccl_comm = PyNcclCommunicator(
|
||||
get_world_group().cpu_group, device=get_world_group().device
|
||||
)
|
||||
# run something in the default stream to initialize torch engine
|
||||
a = torch.ones((4, 4), device=f'cuda:{comm.rank}')
|
||||
a = torch.ones((4, 4), device=f"cuda:{pynccl_comm.rank}")
|
||||
torch.cuda.synchronize()
|
||||
with torch.cuda.graph(graph):
|
||||
a_out = pynccl_comm.all_reduce(a)
|
||||
torch.cuda.synchronize()
|
||||
with torch.cuda.graph(graph, stream=comm.stream):
|
||||
# operation during the graph capture is recorded but not executed
|
||||
# see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creating-a-graph-using-stream-capture # noqa
|
||||
comm.all_reduce(a)
|
||||
comm.stream.synchronize()
|
||||
assert a.mean().cpu().item() == comm.world_size**0
|
||||
graph.replay()
|
||||
comm.stream.synchronize()
|
||||
assert a.mean().cpu().item() == comm.world_size**1
|
||||
torch.cuda.synchronize()
|
||||
assert torch.all(a_out == pynccl_comm.world_size).cpu().item()
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@worker_fn_wrapper
|
||||
def all_gather_worker_fn():
|
||||
pynccl_comm = PyNcclCommunicator(
|
||||
get_world_group().cpu_group, device=get_world_group().device
|
||||
)
|
||||
|
||||
rank = pynccl_comm.rank
|
||||
world_size = pynccl_comm.world_size
|
||||
device = f"cuda:{pynccl_comm.rank}"
|
||||
|
||||
num_elems = 1000
|
||||
tensor = (
|
||||
torch.arange(num_elems, dtype=torch.float32, device=device) + rank * num_elems
|
||||
)
|
||||
result = torch.zeros(num_elems * world_size, dtype=torch.float32, device=device)
|
||||
|
||||
expected = torch.cat(
|
||||
[
|
||||
torch.arange(num_elems, dtype=torch.float32) + r * num_elems
|
||||
for r in range(world_size)
|
||||
]
|
||||
).to(device)
|
||||
|
||||
pynccl_comm.all_gather(result, tensor)
|
||||
torch.cuda.synchronize()
|
||||
torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl_all_gather():
|
||||
distributed_run(all_gather_worker_fn, 2)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def all_gatherv_worker_fn():
|
||||
pynccl_comm = PyNcclCommunicator(
|
||||
get_world_group().cpu_group, device=get_world_group().device
|
||||
)
|
||||
|
||||
rank = pynccl_comm.rank
|
||||
world_size = pynccl_comm.world_size
|
||||
device = f"cuda:{pynccl_comm.rank}"
|
||||
|
||||
assert world_size <= 8
|
||||
sizes = [81, 20, 57, 52, 81, 5, 49, 49][:world_size]
|
||||
num_elems = sizes[rank]
|
||||
tensor = torch.arange(num_elems, dtype=torch.float32, device=device) + rank * 100
|
||||
result = torch.zeros(sum(sizes), dtype=torch.float32, device=device)
|
||||
|
||||
expected = torch.cat(
|
||||
[
|
||||
torch.arange(sizes[r], dtype=torch.float32) + r * 100
|
||||
for r in range(world_size)
|
||||
]
|
||||
).to(device)
|
||||
|
||||
pynccl_comm.all_gatherv(result, tensor, sizes=sizes)
|
||||
torch.cuda.synchronize()
|
||||
torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl_all_gatherv():
|
||||
distributed_run(all_gatherv_worker_fn, 2)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def reduce_scatter_worker_fn():
|
||||
pynccl_comm = PyNcclCommunicator(
|
||||
get_world_group().cpu_group, device=get_world_group().device
|
||||
)
|
||||
|
||||
rank = pynccl_comm.rank
|
||||
world_size = pynccl_comm.world_size
|
||||
device = f"cuda:{pynccl_comm.rank}"
|
||||
|
||||
num_elems = 1000
|
||||
tensor = (
|
||||
torch.arange(num_elems, dtype=torch.float32, device=device) + rank * num_elems
|
||||
)
|
||||
assert num_elems % world_size == 0
|
||||
result = torch.zeros(num_elems // world_size, dtype=torch.float32, device=device)
|
||||
|
||||
# Calculate expected result for this rank's chunk
|
||||
scattered_size = num_elems // world_size
|
||||
all_tensors = [
|
||||
torch.arange(num_elems, dtype=torch.float32) + r * num_elems
|
||||
for r in range(world_size)
|
||||
]
|
||||
expected = sum(
|
||||
tensor[rank * scattered_size : (rank + 1) * scattered_size]
|
||||
for tensor in all_tensors
|
||||
).to(device)
|
||||
|
||||
pynccl_comm.reduce_scatter(result, tensor)
|
||||
torch.cuda.synchronize()
|
||||
torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl_reduce_scatter():
|
||||
distributed_run(reduce_scatter_worker_fn, 2)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def reduce_scatterv_worker_fn():
|
||||
pynccl_comm = PyNcclCommunicator(
|
||||
get_world_group().cpu_group, device=get_world_group().device
|
||||
)
|
||||
|
||||
rank = pynccl_comm.rank
|
||||
world_size = pynccl_comm.world_size
|
||||
device = f"cuda:{pynccl_comm.rank}"
|
||||
|
||||
assert world_size <= 8
|
||||
sizes = [81, 20, 57, 52, 81, 5, 49, 49][:world_size]
|
||||
num_elems = sum(sizes)
|
||||
tensor = torch.arange(num_elems, dtype=torch.float32, device=device) + rank * 100
|
||||
result = torch.zeros(sizes[rank], dtype=torch.float32, device=device)
|
||||
|
||||
# Calculate expected result for this rank's chunk
|
||||
all_tensors = [
|
||||
torch.arange(num_elems, dtype=torch.float32) + r * 100
|
||||
for r in range(world_size)
|
||||
]
|
||||
sizes_cumsum = np.cumsum(sizes)
|
||||
start = 0 if rank == 0 else sizes_cumsum[rank - 1]
|
||||
end = sizes_cumsum[rank]
|
||||
expected = sum(tensor[start:end] for tensor in all_tensors).to(device)
|
||||
|
||||
pynccl_comm.reduce_scatterv(result, tensor, sizes=sizes)
|
||||
torch.cuda.synchronize()
|
||||
torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl_reduce_scatterv():
|
||||
distributed_run(reduce_scatterv_worker_fn, 2)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl_with_cudagraph():
|
||||
distributed_run(worker_fn_with_cudagraph, 2)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def send_recv_worker_fn():
|
||||
pynccl_comm = PyNcclCommunicator(
|
||||
get_world_group().cpu_group, device=get_world_group().device
|
||||
)
|
||||
if pynccl_comm.rank == 0:
|
||||
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank)
|
||||
else:
|
||||
tensor = torch.empty(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank)
|
||||
|
||||
if pynccl_comm.rank == 0:
|
||||
pynccl_comm.send(tensor, dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
|
||||
else:
|
||||
pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
|
||||
torch.cuda.synchronize()
|
||||
assert torch.all(tensor == 1).cpu().item()
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl_send_recv():
|
||||
distributed_run(send_recv_worker_fn, 2)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def multiple_send_recv_worker_fn():
|
||||
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
|
||||
groups = [
|
||||
torch.distributed.new_group(ranks=[0, 2], backend="gloo"),
|
||||
torch.distributed.new_group(ranks=[1, 3], backend="gloo"),
|
||||
]
|
||||
group = groups[0] if torch.distributed.get_rank() in [0, 2] else groups[1]
|
||||
pynccl_comm = PyNcclCommunicator(group=group, device=device)
|
||||
if torch.distributed.get_rank() == 0:
|
||||
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
|
||||
elif torch.distributed.get_rank() == 1:
|
||||
tensor = 2 * torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
|
||||
else:
|
||||
tensor = torch.empty(16, 1024, 1024, dtype=torch.float32, device=device)
|
||||
if torch.distributed.get_rank() in [0, 1]:
|
||||
pynccl_comm.send(tensor, dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
|
||||
else:
|
||||
pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
|
||||
torch.cuda.synchronize()
|
||||
if torch.distributed.get_rank() in [0, 2]:
|
||||
assert torch.all(tensor == 1).cpu().item()
|
||||
else:
|
||||
assert torch.all(tensor == 2).cpu().item()
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl_multiple_send_recv():
|
||||
distributed_run(multiple_send_recv_worker_fn, 4)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl_broadcast():
|
||||
distributed_run(broadcast_worker_fn, 4)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def broadcast_worker_fn():
|
||||
# Test broadcast for every root rank.
|
||||
# Essentially this is an all-gather operation.
|
||||
pynccl_comm = PyNcclCommunicator(
|
||||
get_world_group().cpu_group, device=get_world_group().device
|
||||
)
|
||||
recv_tensors = [
|
||||
torch.empty(16, 1024, 1024, dtype=torch.float32, device=pynccl_comm.device)
|
||||
for i in range(pynccl_comm.world_size)
|
||||
]
|
||||
recv_tensors[pynccl_comm.rank] = (
|
||||
torch.ones(16, 1024, 1024, dtype=torch.float32, device=pynccl_comm.device)
|
||||
* pynccl_comm.rank
|
||||
)
|
||||
|
||||
for i in range(pynccl_comm.world_size):
|
||||
pynccl_comm.broadcast(recv_tensors[i], src=i)
|
||||
# the broadcast op might be launched in a different stream
|
||||
# need to synchronize to make sure the tensor is ready
|
||||
torch.cuda.synchronize()
|
||||
assert torch.all(recv_tensors[i] == i).cpu().item()
|
||||
|
||||
|
||||
def test_ncclGetUniqueId():
|
||||
unique_id = ncclGetUniqueId()
|
||||
lib = NCCLLibrary()
|
||||
unique_id = lib.ncclGetUniqueId()
|
||||
# `list(unique_id.internal)` is something like this:
|
||||
# [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0,
|
||||
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
@@ -1,43 +0,0 @@
|
||||
import multiprocessing
|
||||
import tempfile
|
||||
|
||||
|
||||
def target_fn(env, filepath):
|
||||
from vllm.utils import update_environment_variables
|
||||
update_environment_variables(env)
|
||||
from vllm.utils import nccl_integrity_check
|
||||
nccl_integrity_check(filepath)
|
||||
|
||||
|
||||
def test_library_file():
|
||||
# note: don't import vllm.distributed.device_communicators.pynccl
|
||||
# before running this test, otherwise the library file will be loaded
|
||||
# and it might interfere with the test
|
||||
from vllm.utils import find_nccl_library
|
||||
so_file = find_nccl_library()
|
||||
with open(so_file, 'rb') as f:
|
||||
content = f.read()
|
||||
try:
|
||||
# corrupt the library file, should raise an exception
|
||||
with open(so_file, 'wb') as f:
|
||||
f.write(content[:len(content) // 2])
|
||||
p = multiprocessing.Process(target=target_fn, args=({}, so_file))
|
||||
p.start()
|
||||
p.join()
|
||||
assert p.exitcode != 0
|
||||
|
||||
# move the library file to a tmp path
|
||||
# test VLLM_NCCL_SO_PATH
|
||||
fd, path = tempfile.mkstemp()
|
||||
with open(path, 'wb') as f:
|
||||
f.write(content)
|
||||
p = multiprocessing.Process(target=target_fn,
|
||||
args=({
|
||||
"VLLM_NCCL_SO_PATH": path
|
||||
}, path))
|
||||
p.start()
|
||||
p.join()
|
||||
assert p.exitcode == 0
|
||||
finally:
|
||||
with open(so_file, 'wb') as f:
|
||||
f.write(content)
|
||||
223
tests/distributed/test_quick_all_reduce.py
Normal file
223
tests/distributed/test_quick_all_reduce.py
Normal file
@@ -0,0 +1,223 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import multiprocessing
|
||||
import random
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce # noqa
|
||||
from vllm.distributed.parallel_state import get_tp_group, graph_capture
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ..utils import (
|
||||
ensure_model_parallel_initialized,
|
||||
init_test_distributed_environment,
|
||||
multi_process_parallel,
|
||||
)
|
||||
|
||||
torch.manual_seed(42)
|
||||
random.seed(44)
|
||||
# Size over 8MB is sufficient for custom quick allreduce.
|
||||
test_sizes = [random.randint(8 * 1024 * 1024, 10 * 1024 * 1024) for _ in range(8)]
|
||||
for i, v in enumerate(test_sizes):
|
||||
test_sizes[i] -= v % 8
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
def graph_quickreduce(
    monkeypatch: pytest.MonkeyPatch,
    tp_size,
    pp_size,
    rank,
    distributed_init_port,
):
    """Per-rank worker: exercise quick allreduce inside CUDA graph capture.

    Captures `num_communication` all-reduce pairs into a CUDA graph and
    replays it, then checks the custom-op result against the reference
    `dist.all_reduce` applied to the same tensors (loose tolerances since
    quick allreduce may quantize).
    """
    with monkeypatch.context() as m:
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        device = torch.device(f"cuda:{rank}")
        torch.cuda.set_device(device)
        init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
        ensure_model_parallel_initialized(tp_size, pp_size)
        group = get_tp_group().device_group

        # A small all_reduce for warmup.
        # this is needed because device communicators might be created lazily
        # (e.g. NCCL). This will ensure that the communicator is initialized
        # before any communication happens, so that this group can be used for
        # graph capture immediately.
        data = torch.zeros(1)
        data = data.to(device=device)
        torch.distributed.all_reduce(data, group=group)
        torch.cuda.synchronize()
        del data

        # we use the first group to communicate once
        # and the second group to communicate twice
        # and so on
        # this is used to demonstrate that each group can
        # communicate independently
        num_communication = rank // tp_size + 1

        for sz in test_sizes:
            for dtype in [torch.float16, torch.bfloat16]:
                with graph_capture(device=device) as graph_capture_context:
                    # inp1 is positive, inp2 negative, to cover both signs.
                    inp1 = torch.randint(
                        1, 23, (sz,), dtype=dtype, device=torch.cuda.current_device()
                    )
                    inp2 = torch.randint(
                        -23, 1, (sz,), dtype=dtype, device=torch.cuda.current_device()
                    )
                    torch.cuda.synchronize()
                    graph = torch.cuda.CUDAGraph()
                    with torch.cuda.graph(graph, stream=graph_capture_context.stream):
                        for _ in range(num_communication):
                            # Custom op under test vs. the NCCL reference; the
                            # in-place dist.all_reduce updates inp1/inp2 so the
                            # final values can be compared after replay.
                            out1 = tensor_model_parallel_all_reduce(inp1)
                            dist.all_reduce(inp1, group=group)
                            out2 = tensor_model_parallel_all_reduce(inp2)
                            dist.all_reduce(inp2, group=group)
                    graph.replay()
                    # Loose tolerances: quick allreduce may run quantized.
                    torch.testing.assert_close(out1, inp1, atol=2.5, rtol=0.1)
                    torch.testing.assert_close(out2, inp2, atol=2.5, rtol=0.1)
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
def eager_quickreduce(
    monkeypatch: pytest.MonkeyPatch,
    tp_size,
    pp_size,
    rank,
    distributed_init_port,
):
    """Per-rank worker: call the quick-allreduce communicator eagerly.

    Directly invokes `qr_comm.quick_all_reduce` (no CUDA graphs) for
    float16 and bfloat16 inputs and checks the result equals
    `inp * tp_size` within loose, quantization-friendly tolerances.
    """
    with monkeypatch.context() as m:
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        device = torch.device(f"cuda:{rank}")
        torch.cuda.set_device(device)

        init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)

        # Size over 8MB is sufficient for custom quick allreduce.
        sz = 16 * 1024 * 1024
        fa = get_tp_group().device_communicator.qr_comm
        # Repeating pattern 0..22; every rank holds the same data, so the
        # all-reduced result should be inp * tp_size.
        inp = torch.tensor(
            [1.0 * ((i) % 23) for i in range(sz)], dtype=torch.float16, device=device
        )
        out = fa.quick_all_reduce(inp)
        torch.testing.assert_close(out, inp * tp_size, atol=2.5, rtol=0.1)

        inp = torch.tensor(
            [1.0 * ((i) % 23) for i in range(sz)], dtype=torch.bfloat16, device=device
        )
        out = fa.quick_all_reduce(inp)
        torch.testing.assert_close(out, inp * tp_size, atol=2.5, rtol=0.1)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not current_platform.is_rocm(), reason="only test quick allreduce for rocm"
)
@pytest.mark.parametrize("quant_mode", ["FP", "INT8", "INT6", "INT4"])
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
@pytest.mark.parametrize("test_target", [graph_quickreduce, eager_quickreduce])
def test_custom_quick_allreduce(
    monkeypatch: pytest.MonkeyPatch,
    tp_size,
    pipeline_parallel_size,
    test_target,
    quant_mode,
):
    """ROCm-only: run the graph/eager quick-allreduce workers over every
    supported quantization mode via multi-process (ray) workers."""
    world_size = tp_size * pipeline_parallel_size
    if world_size > torch.cuda.device_count():
        pytest.skip("Not enough GPUs to run the test.")

    # Selects the quick-reduce quantization scheme read by the communicator.
    monkeypatch.setenv("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", quant_mode)

    multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target)
|
||||
|
||||
|
||||
def qr_variable_input(rank, world_size):
    """
    When the tensor parallelism is set to 4 or 8, frequent changes
    in the input shape can cause QuickReduce to hang (this issue
    has been observed with the gpt_oss model).

    Regression driver: alternates between (1024, 1024) zero and
    (1024, 2048) one inputs for ~50k iterations; a hang here means
    the bug has regressed.
    """
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    qr_max_size = None  # MB
    _ptr = ops.init_custom_qr(rank, world_size, qr_max_size)
    ranks = []
    for i in range(world_size):
        ranks.append(i)
    # NOTE(review): hard-coded port 29500 will collide if another process
    # group is alive on this host — consider a free-port helper.
    dist.init_process_group(
        backend="nccl",
        init_method="tcp://127.0.0.1:29500",
        rank=rank,
        world_size=world_size,
    )
    # NOTE(review): the name `cpu_group` is misleading — the backend is
    # "nccl", i.e. a GPU group; it is only used for handle exchange below.
    cpu_group = torch.distributed.new_group(ranks, backend="nccl")

    # Exchange IPC handles so every rank can open the others' QR buffers.
    handle = ops.qr_get_handle(_ptr)
    world_size = dist.get_world_size(group=cpu_group)
    handles = [None] * world_size
    dist.all_gather_object(handles, handle, group=cpu_group)
    ops.qr_open_handles(_ptr, handles)

    num = 1
    s1 = 1024
    while num < 50000:  # 50000 is sufficient to identify issues.
        dtype = torch.float16
        if num % 2 == 0:
            s2 = 1024
            inp1 = torch.zeros(
                (s1, s2), dtype=dtype, device=torch.cuda.current_device()
            )
        else:
            s2 = 2048
            inp1 = torch.ones((s1, s2), dtype=dtype, device=torch.cuda.current_device())
        result = torch.empty_like(inp1)
        # FP = 0 INT8 = 1 INT6 = 2 INT4 = 3 NONE = 4
        ops.qr_all_reduce(_ptr, inp1, result, 3, cast_bf2half=True)
        try:
            # zeros reduce to zeros; ones reduce to world_size on each rank.
            if inp1[0, 0] == 0:
                assert torch.all(result == 0)
            else:
                assert torch.all(result == world_size)
        except AssertionError:
            print("Assertion failed! Allreduce results are incorrect.")
            raise
        num += 1
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not current_platform.is_rocm(), reason="only test quick allreduce for rocm"
)
@pytest.mark.parametrize("tp_size", [4, 8])
@pytest.mark.parametrize("pipeline_parallel_size", [1])
def test_custom_quick_allreduce_variable_input(tp_size, pipeline_parallel_size):
    """ROCm-only hang-regression test: spawn one process per rank running
    `qr_variable_input` and fail if any process is still alive after the
    timeout (i.e. QuickReduce hung on shape changes)."""
    world_size = tp_size * pipeline_parallel_size
    if world_size > torch.cuda.device_count():
        pytest.skip("Not enough GPUs to run the test.")

    # spawn: each worker needs a fresh CUDA context.
    multiprocessing.set_start_method("spawn", force=True)
    # 60s is enough
    timeout = 60
    processes = []
    for rank in range(tp_size):
        p = multiprocessing.Process(target=qr_variable_input, args=(rank, tp_size))
        p.start()
        processes.append((rank, p))
    # NOTE(review): join(timeout) is applied per process sequentially, so the
    # worst-case wall time is timeout * tp_size, not a single 60s budget.
    for rank, p in processes:
        p.join(timeout=timeout)
        if p.is_alive():
            # One rank hung: terminate all survivors before failing.
            for r, proc in processes:
                if proc.is_alive():
                    proc.terminate()
                    proc.join()
            raise RuntimeError(f"QuickReduce hang detected after {timeout} seconds!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running the variable-input hang repro directly without pytest.
    test_custom_quick_allreduce_variable_input(tp_size=4, pipeline_parallel_size=1)
|
||||
49
tests/distributed/test_same_node.py
Normal file
49
tests/distributed/test_same_node.py
Normal file
@@ -0,0 +1,49 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
from vllm.distributed.parallel_state import in_the_same_node_as
|
||||
from vllm.distributed.utils import StatelessProcessGroup
|
||||
from vllm.utils.network_utils import get_ip, get_open_port
|
||||
|
||||
|
||||
def _run_test(pg):
    """Check that same-node detection over *pg* matches VLLM_TEST_SAME_HOST."""
    # VLLM_TEST_SAME_HOST=1 (default) means all ranks share one host.
    same_host_expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
    detected_same_host = all(in_the_same_node_as(pg, source_rank=0))

    assert detected_same_host == same_host_expected, (
        f"Expected {same_host_expected}, got {detected_same_host}"
    )
    if pg == dist.group.WORLD:
        print("Same node test passed! when using torch distributed!")
    else:
        print("Same node test passed! when using StatelessProcessGroup!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Launched under torchrun/torch.distributed: each rank runs this script.
    dist.init_process_group(backend="gloo")

    # Rank 0 picks an (ip, port) pair and shares it with all other ranks so
    # a StatelessProcessGroup can be formed alongside the default one.
    rank = dist.get_rank()
    if rank == 0:
        port = get_open_port()
        ip = get_ip()
        dist.broadcast_object_list([ip, port], src=0)
    else:
        recv = [None, None]
        dist.broadcast_object_list(recv, src=0)
        ip, port = recv

    stateless_pg = StatelessProcessGroup.create(ip, port, rank, dist.get_world_size())

    # Exercise same-node detection over both group implementations;
    # optionally repeat with each torch default device set.
    for pg in [dist.group.WORLD, stateless_pg]:
        if os.environ.get("VLLM_TEST_WITH_DEFAULT_DEVICE_SET", "0") == "1":
            default_devices = ["cpu"]
            if torch.cuda.is_available():
                default_devices.append("cuda")
            for device in default_devices:
                torch.set_default_device(device)
                _run_test(pg)
        else:
            _run_test(pg)
|
||||
352
tests/distributed/test_sequence_parallel.py
Normal file
352
tests/distributed/test_sequence_parallel.py
Normal file
@@ -0,0 +1,352 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
WARNING: This test runs in both single-node (4 GPUs) and multi-node
|
||||
(2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
|
||||
important to set the distributed backend to "mp" to avoid Ray scheduling
|
||||
all workers in a node other than the head node, which can cause the test
|
||||
to fail.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, NamedTuple
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config.compilation import CompilationMode
|
||||
from vllm.config.model import RunnerOption
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
from ..models.registry import HF_EXAMPLE_MODELS
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
|
||||
logger = init_logger("test_sequence_parallel")
|
||||
|
||||
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
|
||||
|
||||
|
||||
class ParallelSetup(NamedTuple):
    # One concrete parallel/compilation configuration to launch vLLM with.
    tp_size: int  # tensor-parallel degree
    pp_size: int  # pipeline-parallel degree
    fuse_norm_quant: bool  # enable the norm+quant fusion pass
    fuse_act_quant: bool  # enable the activation+quant fusion pass
    eager_mode: bool  # pass --enforce-eager
    chunked_prefill: bool  # pass --enable-chunked-prefill
|
||||
|
||||
|
||||
class SPTestOptions(NamedTuple):
    # Extra gating/configuration shared by all setups of one test model.
    multi_node_only: bool  # skip unless running in the multi-node CI setup
    load_format: str | None = None  # --load-format; "dummy" triggers tiny hf_overrides
|
||||
|
||||
|
||||
@dataclass
class SPTestSettings:
    """Parameter sweep for one sequence-parallel test model.

    Attributes:
        parallel_setups: TP/PP/fusion/eager/chunked-prefill combinations.
        distributed_backends: executor backends to try per setup.
        runner: vLLM runner option forwarded on the command line.
        test_options: extra constraints (multi-node-only, load format).
    """

    parallel_setups: list[ParallelSetup]
    distributed_backends: list[str]
    runner: RunnerOption
    test_options: SPTestOptions

    @staticmethod
    def _sweep_no_fusion(
        tp_base: int,
        pp_base: int,
        multi_node_only: bool,
        runner: RunnerOption,
        load_format: str | None,
    ) -> "SPTestSettings":
        """Shared builder for `detailed`/`fast` (their bodies were
        duplicated): sweep eager mode x PP multiplier x chunked prefill
        with all fusion passes disabled."""
        parallel_setups = [
            ParallelSetup(
                tp_size=tp_base,
                pp_size=pp_multiplier * pp_base,
                fuse_norm_quant=False,
                fuse_act_quant=False,
                eager_mode=eager_mode_val,
                chunked_prefill=chunked_prefill_val,
            )
            for eager_mode_val in [False, True]
            for pp_multiplier in [1, 2]
            for chunked_prefill_val in [False, True]
        ]
        return SPTestSettings(
            parallel_setups=parallel_setups,
            distributed_backends=["mp", "ray"],
            runner=runner,
            test_options=SPTestOptions(
                multi_node_only=multi_node_only, load_format=load_format
            ),
        )

    @staticmethod
    def detailed(
        *,
        tp_base: int = 2,
        pp_base: int = 1,
        multi_node_only: bool = False,
        runner: RunnerOption = "auto",
        load_format: str | None = None,
    ):
        """Full sweep (currently identical to `fast`; kept for API parity)."""
        return SPTestSettings._sweep_no_fusion(
            tp_base, pp_base, multi_node_only, runner, load_format
        )

    @staticmethod
    def fast(
        *,
        tp_base: int = 2,
        pp_base: int = 1,
        runner: RunnerOption = "auto",
        multi_node_only: bool = False,
        load_format: str | None = None,
    ):
        """Quick sweep for CI (same combinations as `detailed` today)."""
        return SPTestSettings._sweep_no_fusion(
            tp_base, pp_base, multi_node_only, runner, load_format
        )

    @staticmethod
    def fp8_quant(
        *,
        tp_base: int = 2,
        pp_base: int = 1,
        runner: RunnerOption = "auto",
        multi_node_only: bool = False,
        load_format: str | None = None,
    ):
        """FP8 sweep: toggle norm/act quant fusion together; eager only."""
        parallel_setups = []
        for fusion_val in [False, True]:
            parallel_setups.append(
                ParallelSetup(
                    tp_size=tp_base,
                    pp_size=pp_base,
                    fuse_norm_quant=fusion_val,
                    fuse_act_quant=fusion_val,
                    eager_mode=True,
                    chunked_prefill=False,
                )
            )
        return SPTestSettings(
            parallel_setups=parallel_setups,
            distributed_backends=["mp", "ray"],
            runner=runner,
            test_options=SPTestOptions(
                multi_node_only=multi_node_only, load_format=load_format
            ),
        )

    def iter_params(self, model_id: str):
        """Yield one pytest parameter tuple per (setup, backend) pair."""
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
            for backend in self.distributed_backends:
                yield (
                    model_id,
                    parallel_setup,
                    backend,
                    self.runner,
                    opts,
                )
|
||||
|
||||
|
||||
def _compare_sp(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    runner: RunnerOption,
    test_options: SPTestOptions,
    num_gpus_available: int,
    use_inductor_graph_partition: bool,
    fuse_gemm_comms: bool,
    *,
    method: Literal["generate", "encode"],
    is_multimodal: bool,
):
    """Compare TP+SP (sequence parallel via compile passes) against plain TP.

    Builds two vLLM server command lines from the same common args — one
    with sequence-parallel compilation passes enabled, one TP-only — and
    delegates to `compare_two_settings`, which asserts matching outputs.
    Skips (via pytest.skip) when GPUs, node topology, or model availability
    don't match the requested configuration.
    """
    (
        tp_size,
        pp_size,
        fuse_norm_quant,
        fuse_act_quant,
        eager_mode,
        chunked_prefill,
    ) = parallel_setup

    multi_node_only, load_format = test_options

    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
    model_info.check_transformers_version(on_fail="skip")

    trust_remote_code = model_info.trust_remote_code
    tokenizer_mode = model_info.tokenizer_mode
    hf_overrides = model_info.hf_overrides
    require_embed_inputs = model_info.require_embed_inputs

    if load_format == "dummy":
        # Avoid OOM
        text_overrides = {
            "num_hidden_layers": 4,
            "hidden_size": 512,
            "intermediate_size": 800,
            "num_attention_heads": 4,
            "num_key_value_heads": 1,
        }

        # NOTE(review): this mutates model_info.hf_overrides in place —
        # presumably fine because each test runs in a fresh process; verify.
        if is_multimodal:
            hf_overrides.update({"text_config": text_overrides})
        else:
            hf_overrides.update(text_overrides)
    else:
        model_info.check_available_online(on_fail="skip")

    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
        pytest.skip(
            "Skipping multi-node pipeline parallel test for "
            "multiprocessing distributed backend"
        )
    if multi_node_only and not VLLM_MULTI_NODE:
        pytest.skip("Not in multi-node setting")

    common_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "8",
    ]
    if chunked_prefill:
        common_args.append("--enable-chunked-prefill")
    if eager_mode:
        common_args.append("--enforce-eager")
    if runner != "auto":
        common_args.extend(["--runner", runner])
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
        common_args.extend(["--tokenizer-mode", tokenizer_mode])
    if load_format:
        common_args.extend(["--load-format", load_format])
    if hf_overrides:
        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
    if require_embed_inputs:
        common_args.extend(
            [
                "--skip-tokenizer-init",
                "--enable-prompt-embeds",
                "--enable-mm-embeds",
            ]
        )

    # Compilation config for the SP run: enables the sequence-parallel pass
    # plus whichever fusion passes this ParallelSetup requests.
    compilation_config = {
        "mode": CompilationMode.VLLM_COMPILE,
        "compile_sizes": [4, 8],
        "pass_config": {
            "enable_sp": True,
            "fuse_gemm_comms": fuse_gemm_comms,
            "fuse_norm_quant": fuse_norm_quant,
            "fuse_act_quant": fuse_act_quant,
            "eliminate_noops": True,
        },
        "use_inductor_graph_partition": use_inductor_graph_partition,
    }

    # Candidate: TP + PP with SP compilation passes.
    tp_sp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--pipeline-parallel-size",
        str(pp_size),
        "--distributed-executor-backend",
        distributed_backend,
        "--compilation_config",
        json.dumps(compilation_config),
    ]

    # Baseline: plain TP, mp backend, default compilation.
    tp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        "mp",
    ]

    compare_two_settings(model_id, tp_sp_args, tp_args, method=method)
|
||||
|
||||
|
||||
# Models exercised by the SP tests, mapped to their parameter sweeps.
SP_TEXT_GENERATION_MODELS = {
    # [Decoder-only]
    "hmellor/tiny-random-LlamaForCausalLM": SPTestSettings.fast(),
    "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8": SPTestSettings.fp8_quant(),
}

# Allow-list filter applied when generating pytest params below (currently
# includes every key of SP_TEXT_GENERATION_MODELS).
SP_TEST_MODELS = [
    # TODO support other models
    # [LANGUAGE GENERATION]
    "hmellor/tiny-random-LlamaForCausalLM",
    "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    (
        "model_id",
        "parallel_setup",
        "distributed_backend",
        "runner",
        "test_options",
    ),
    [
        params
        for model_id, settings in SP_TEXT_GENERATION_MODELS.items()
        for params in settings.iter_params(model_id)
        if model_id in SP_TEST_MODELS
    ],
)
@pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
@pytest.mark.parametrize("fuse_gemm_comms", [False])  # TODO: enable async TP
@create_new_process_for_each_test()
def test_tp_sp_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    runner: RunnerOption,
    test_options: SPTestOptions,
    num_gpus_available,
    use_inductor_graph_partition: bool,
    fuse_gemm_comms: bool,
):
    """End-to-end check that TP+SP generation matches plain TP generation
    for every configured model/setup/backend combination."""
    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

    # Skip FP8 SP-only test on sm89 (compute capability 8.9)
    if (
        "fp8" in model_id.lower()
        and current_platform.get_device_capability() < (9, 0)
        and (not fuse_gemm_comms)
    ):
        pytest.skip("FP8 reduction support begins with sm90 capable devices.")

    _compare_sp(
        model_id,
        parallel_setup,
        distributed_backend,
        runner,
        test_options,
        num_gpus_available,
        use_inductor_graph_partition,
        fuse_gemm_comms=fuse_gemm_comms,
        method="generate",
        is_multimodal=False,
    )
|
||||
117
tests/distributed/test_shm_broadcast.py
Normal file
117
tests/distributed/test_shm_broadcast.py
Normal file
@@ -0,0 +1,117 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import multiprocessing
|
||||
import random
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
import torch.distributed as dist
|
||||
|
||||
from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
|
||||
from vllm.distributed.utils import StatelessProcessGroup
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
from vllm.utils.system_utils import update_environment_variables
|
||||
|
||||
|
||||
def get_arrays(n: int, seed: int = 0) -> list[np.ndarray]:
    """Deterministically build *n* random integer arrays from *seed*.

    Lengths are drawn uniformly from [1, 10_000), so each array holds
    about 5k elements on average (~40kB apiece as int64).
    """
    np.random.seed(seed)
    lengths = np.random.randint(1, 10_000, n)
    arrays = []
    for length in lengths:
        arrays.append(np.random.randint(1, 100, length))
    return arrays
|
||||
|
||||
|
||||
def distributed_run(fn, world_size):
    """Spawn *world_size* processes running *fn(env)* and assert they all
    exit cleanly.

    Each worker receives the torch.distributed rendezvous environment
    variables as a dict (see worker_fn_wrapper, which applies them).
    """
    workers = []
    for worker_rank in range(world_size):
        env = {
            "RANK": str(worker_rank),
            "LOCAL_RANK": str(worker_rank),
            "WORLD_SIZE": str(world_size),
            "LOCAL_WORLD_SIZE": str(world_size),
            "MASTER_ADDR": "localhost",
            "MASTER_PORT": "12345",
        }
        proc = multiprocessing.Process(target=fn, args=(env,))
        workers.append(proc)
        proc.start()

    # Wait for every worker before checking any exit code, so one failure
    # doesn't leave siblings orphaned.
    for proc in workers:
        proc.join()

    for proc in workers:
        assert proc.exitcode == 0
|
||||
|
||||
|
||||
def worker_fn_wrapper(fn):
    """Decorator adapting *fn* for use as a `multiprocessing.Process` target.

    `multiprocessing.Process` cannot accept environment variables directly,
    so the wrapper takes them as a dict, applies them, initializes a gloo
    process group (which reads MASTER_ADDR/PORT/RANK from the environment),
    and only then invokes *fn*.
    """

    def _bootstrap(env_vars):
        # Environment must be in place before init_process_group runs.
        update_environment_variables(env_vars)
        dist.init_process_group(backend="gloo")
        fn()

    return _bootstrap
|
||||
|
||||
|
||||
@worker_fn_wrapper
def worker_fn():
    """Per-rank body of the shm broadcast test.

    Builds a MessageQueue over both the default torch group and a
    StatelessProcessGroup; rank `writer_rank` broadcasts ~400MB of random
    arrays while the other ranks receive and verify them against locally
    regenerated copies (same seed => same arrays).
    """
    rank = dist.get_rank()
    # Rank 0 picks a port and shares (ip, port) for the stateless group.
    if rank == 0:
        port = get_open_port()
        ip = "127.0.0.1"
        dist.broadcast_object_list([ip, port], src=0)
    else:
        recv = [None, None]
        dist.broadcast_object_list(recv, src=0)
        ip, port = recv  # type: ignore

    stateless_pg = StatelessProcessGroup.create(ip, port, rank, dist.get_world_size())

    for pg in [dist.group.WORLD, stateless_pg]:
        writer_rank = 2
        # 40kB buffer, 2 readers (ranks other than the writer read).
        broadcaster = MessageQueue.create_from_process_group(
            pg, 40 * 1024, 2, writer_rank
        )
        # The writer picks the data seed and shares it with all ranks.
        if rank == writer_rank:
            seed = random.randint(0, 1000)
            dist.broadcast_object_list([seed], writer_rank)
        else:
            recv = [None]
            dist.broadcast_object_list(recv, writer_rank)
            seed = recv[0]  # type: ignore

        if pg == dist.group.WORLD:
            dist.barrier()
        else:
            pg.barrier()

        # in case we find a race condition
        # print the seed so that we can reproduce the error
        print(f"Rank {rank} got seed {seed}")
        # test broadcasting with about 400MB of data
        N = 10_000
        if rank == writer_rank:
            arrs = get_arrays(N, seed)
            for x in arrs:
                broadcaster.broadcast_object(x)
                # Random jitter to shake out timing-dependent races.
                time.sleep(random.random() / 1000)
        else:
            # Readers regenerate the same arrays locally and compare each
            # received object against its expected value.
            arrs = get_arrays(N, seed)
            for x in arrs:
                y = broadcaster.broadcast_object(None)
                assert np.array_equal(x, y)
                time.sleep(random.random() / 1000)

        if pg == dist.group.WORLD:
            dist.barrier()
            print(f"torch distributed passed the test! Rank {rank}")
        else:
            pg.barrier()
            print(f"StatelessProcessGroup passed the test! Rank {rank}")
|
||||
|
||||
|
||||
def test_shm_broadcast():
    # 4 worker processes; worker_fn designates rank 2 as the writer.
    distributed_run(worker_fn, 4)
|
||||
243
tests/distributed/test_shm_buffer.py
Normal file
243
tests/distributed/test_shm_buffer.py
Normal file
@@ -0,0 +1,243 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import traceback
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from vllm.distributed.device_communicators.shm_object_storage import (
|
||||
SingleWriterShmRingBuffer,
|
||||
)
|
||||
|
||||
|
||||
class TestSingleWriterShmRingBuffer(unittest.TestCase):
    """Test suite for the ring buffer implementation"""

    def setUp(self):
        """Set up test fixtures"""
        self.buffer_size = 4096
        self.ring_buffer = None

    def tearDown(self):
        """Clean up after tests"""
        # Dropping the reference releases the shared memory segment.
        if self.ring_buffer:
            del self.ring_buffer

    def test_buffer_opening(self):
        """Test opening an existing buffer"""
        # First create a buffer
        self.ring_buffer = SingleWriterShmRingBuffer(
            data_buffer_size=self.buffer_size, create=True
        )

        # Then open it with another instance
        # handle() packs the constructor args a reader needs to attach.
        reader_buffer = SingleWriterShmRingBuffer(*self.ring_buffer.handle())
        self.assertFalse(reader_buffer.is_writer)
        self.assertEqual(
            reader_buffer.shared_memory.name, self.ring_buffer.shared_memory.name
        )

    def test_buffer_access(self):
        """Test accessing allocated buffers"""
        self.ring_buffer = SingleWriterShmRingBuffer(
            data_buffer_size=self.buffer_size, create=True
        )

        size = 100
        address, monotonic_id = self.ring_buffer.allocate_buf(size)

        # Write some test data
        test_data = b"Hello, World!" * 7  # 91 bytes
        with self.ring_buffer.access_buf(address) as (data_buf, metadata):
            data_buf[0 : len(test_data)] = test_data

        # Read it back
        with self.ring_buffer.access_buf(address) as (data_buf2, metadata2):
            read_data = bytes(data_buf2[0 : len(test_data)])
            # metadata[0] holds the allocation's monotonic id.
            read_id = metadata2[0]

        self.assertEqual(read_data, test_data)
        self.assertEqual(read_id, monotonic_id)

    def test_memory_error_on_full_buffer(self):
        """Test that MemoryError is raised when buffer is full"""
        small_buffer_size = 200
        self.ring_buffer = SingleWriterShmRingBuffer(
            data_buffer_size=small_buffer_size, create=True
        )

        # Fill up the buffer (each allocation also consumes per-entry
        # metadata; see MD_SIZE usage in test_allocation_cycles).
        self.ring_buffer.allocate_buf(100)
        self.ring_buffer.allocate_buf(80)  # Total: 196 bytes used

        # This should fail
        with self.assertRaises(MemoryError):
            self.ring_buffer.allocate_buf(1)  # Would exceed buffer capacity

    def test_allocation_and_free(self):
        """Test allocation and freeing of buffers"""
        small_buffer_size = 200
        self.ring_buffer = SingleWriterShmRingBuffer(
            data_buffer_size=small_buffer_size, create=True
        )

        size = 80
        # Write some data; each cycle allocates then immediately frees, so
        # monotonic ids should come back in order 0, 1, 2, ...
        test_data = b"Repeated test data"
        for i in range(5):
            address, monotonic_id = self.ring_buffer.allocate_buf(size)
            with self.ring_buffer.access_buf(address) as (data_buf, metadata):
                data_buf[0:4] = (0).to_bytes(4, "little")  # 0 for not in-use
                data_buf[4 : len(test_data) + 4] = test_data
            print(self.ring_buffer.metadata)
            # The predicate always returns True, so every entry is freeable.
            freed_ids = self.ring_buffer.free_buf(lambda *args: True)
            print(f" Freed IDs: {freed_ids}")
            self.assertEqual(freed_ids[0], i)

    def test_clear_buffer(self):
        """Test clearing the buffer"""
        self.ring_buffer = SingleWriterShmRingBuffer(
            data_buffer_size=self.buffer_size, create=True
        )

        # Allocate some buffers
        for _ in range(3):
            self.ring_buffer.allocate_buf(100)

        # Clear the buffer
        self.ring_buffer.clear()

        # Check that metadata is empty and IDs reset
        self.assertEqual(len(self.ring_buffer.metadata), 0)
        self.assertEqual(self.ring_buffer.monotonic_id_start, 0)
        self.assertEqual(self.ring_buffer.monotonic_id_end, 0)
        self.assertEqual(self.ring_buffer.data_buffer_start, 0)
        self.assertEqual(self.ring_buffer.data_buffer_end, 0)

    def test_allocation_cycles(self):
        """Stress allocate/free cycles with wrap-around, mirroring the ring
        state in a local bitmap and asserting no overlapping allocations."""
        buffer_size = 100
        ring = SingleWriterShmRingBuffer(data_buffer_size=buffer_size, create=True)

        # tracking allocations for assertions
        allocated_bitmap = np.zeros(
            (buffer_size,), dtype=np.bool_
        )  # addr -> is_allocated
        allocation_map = dict()  # monotonic_id -> (addr, size)

        def count_allocated(bitmap) -> int:
            return np.sum(bitmap).item()

        def is_free_fn(a, b) -> bool:
            # Treat every entry as freeable for this test.
            return True

        def mark_allocated_with_assertion(id, addr, size):
            # Reduce the monotonically growing address into ring space.
            # NOTE(review): a slice past buffer_size silently truncates
            # (numpy), so a wrapped allocation is only partially checked.
            addr = addr % buffer_size
            self.assertEqual(count_allocated(allocated_bitmap[addr : addr + size]), 0)

            allocated_bitmap[addr : addr + size] = True
            allocation_map[id] = (addr, size)

        def mark_freed_with_assertion(id):
            self.assertTrue(id in allocation_map)

            addr, size = allocation_map.pop(id)
            addr = addr % buffer_size
            self.assertEqual(
                count_allocated(allocated_bitmap[addr : addr + size]), size
            )

            allocated_bitmap[addr : addr + size] = False

        def ring_free(free_size=None):
            freed_ids = ring.free_buf(is_free_fn, free_size)
            for freed_id in freed_ids:
                mark_freed_with_assertion(freed_id)

        def ring_allocate(allocate_size):
            # Each allocation also consumes MD_SIZE bytes of metadata.
            allocate_size_with_md = allocate_size + ring.MD_SIZE
            try:
                addr, monotonic_id = ring.allocate_buf(allocate_size)
                mark_allocated_with_assertion(monotonic_id, addr, allocate_size_with_md)
            except MemoryError:
                # free 2x size for enough space if wrapping happened
                ring_free(allocate_size_with_md * 2)

                # retry allocating
                addr, monotonic_id = ring.allocate_buf(allocate_size)
                mark_allocated_with_assertion(monotonic_id, addr, allocate_size_with_md)

        # 1. allocation & free cycles
        for _ in range(33):
            # will consume 2 + 8 = 10 bytes per allocation
            ring_allocate(2)

        # 2. free all allocations
        ring_free()

        # 3. try allocate the largest possible buffer
        ring_allocate(buffer_size - ring.MD_SIZE)
|
||||
|
||||
|
||||
def main():
    """Main function demonstrating usage and running tests"""
    print("=== SingleWriterShmRingBuffer Test Suite ===\n")

    # Run unit tests (exit=False so the demo below still runs).
    print("Running unit tests...")
    unittest.main(argv=[""], exit=False, verbosity=2)

    print("\n" + "=" * 50)
    print("=== Manual Demo ===\n")

    # Manual demonstration: one writer creates the buffer, a reader attaches
    # via the writer's handle.
    try:
        print("Creating ring buffer...")
        writer_buffer = SingleWriterShmRingBuffer(data_buffer_size=2048, create=True)
        reader_buffer = SingleWriterShmRingBuffer(*writer_buffer.handle())

        print(f"Buffer created with name: {writer_buffer.shared_memory.name}")

        # Allocate some buffers
        print("\nAllocating buffers...")
        address_array = []
        for i in range(3):
            size = 100 + i * 50
            try:
                # Free anything freeable first so allocation can't wedge.
                writer_buffer.free_buf(lambda *args: True)
                address, monotonic_id = writer_buffer.allocate_buf(size)
                address_array.append((address, size, monotonic_id))

                # Write some test data
                with writer_buffer.access_buf(address) as (data_buf, metadata):
                    test_message = f"Test message {i}".encode()
                    data_buf[0 : len(test_message)] = test_message

            except MemoryError as e:
                print(f" Failed to allocate {size} bytes: {e}")

        print("\nBuffer state:")
        print(f" Data buffer start: {writer_buffer.data_buffer_start}")
        print(f" Data buffer end: {writer_buffer.data_buffer_end}")
        print(f" Monotonic ID start: {writer_buffer.monotonic_id_start}")
        print(f" Monotonic ID end: {writer_buffer.monotonic_id_end}")
        print(f" Metadata entries: {len(writer_buffer.metadata)}")

        # Try to read back the data through the reader attachment.
        print("\nReading back data...")
        for address, size, monotonic_id in address_array:
            with reader_buffer.access_buf(address) as (data_buf, metadata):
                # Find null terminator or read first 50 chars
                data_bytes = bytes(data_buf[0:size])
                message = data_bytes.decode()
                print(f" ID {monotonic_id}: '{message}'")

    except Exception as e:
        # Demo is best-effort; report and continue so tests above still count.
        print(f"Demo error: {e}")
        traceback.print_exc()

    print("\n=== Demo Complete ===")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the unittest suite followed by the manual demo.
    main()
|
||||
327
tests/distributed/test_shm_storage.py
Normal file
327
tests/distributed/test_shm_storage.py
Normal file
@@ -0,0 +1,327 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import multiprocessing
|
||||
import random
|
||||
import time
|
||||
import traceback
|
||||
import unittest
|
||||
from multiprocessing import Lock
|
||||
|
||||
import torch
|
||||
|
||||
# Assuming these are imported from your module
|
||||
from vllm.distributed.device_communicators.shm_object_storage import (
|
||||
MsgpackSerde,
|
||||
SingleWriterShmObjectStorage,
|
||||
SingleWriterShmRingBuffer,
|
||||
)
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalFieldElem,
|
||||
MultiModalKwargsItem,
|
||||
MultiModalSharedField,
|
||||
)
|
||||
|
||||
|
||||
def _dummy_elem(modality: str, key: str, size: int):
    """Build a MultiModalFieldElem whose payload is an int8 tensor of `size`."""
    field = MultiModalSharedField(batch_size=1)
    payload = torch.empty((size,), dtype=torch.int8)
    return MultiModalFieldElem(
        modality=modality,
        key=key,
        data=payload,
        field=field,
    )
|
||||
|
||||
|
||||
def _dummy_item(modality: str, size_by_key: dict[str, int]):
    """Assemble a MultiModalKwargsItem from one dummy element per (key, size)."""
    elems = []
    for key, size in size_by_key.items():
        elems.append(_dummy_elem(modality, key, size))
    return MultiModalKwargsItem.from_elems(elems)
|
||||
|
||||
|
||||
class TestSingleWriterShmObjectStorage(unittest.TestCase):
    """Unit tests for SingleWriterShmObjectStorage over a shared-memory ring
    buffer: put/get round trips, duplicate keys, oversized objects, buffer
    overflow and cleanup, writes blocked by unread objects, invalid reads,
    and clear().
    """

    def setUp(self):
        """Set up test fixtures before each test method."""
        ring_buffer = SingleWriterShmRingBuffer(
            data_buffer_size=1024 * 100,  # 100 KB data buffer
            create=True,
        )
        self.storage = SingleWriterShmObjectStorage(
            max_object_size=1024 * 10,  # 10KB max object
            n_readers=2,  # each object is read n_readers times before it can be freed
            ring_buffer=ring_buffer,
            serde_class=MsgpackSerde,
            reader_lock=Lock(),
        )

    def tearDown(self):
        """Clean up after each test."""
        if self.storage:
            del self.storage

    def test_minimal_put_get_cycle(self):
        """Test basic put and get operations."""
        key = "test_key"
        value = _dummy_item("text", {"field1": 10, "field2": 20})

        # Put operation
        address, monotonic_id = self.storage.put(key, value)

        # Verify key is in index (both key->(address, id) and id->key maps)
        self.assertIn(key, self.storage.key_index)
        self.assertEqual(self.storage.key_index[key], (address, monotonic_id))
        self.assertEqual(self.storage.id_index[monotonic_id], key)

        # Get operation
        result = self.storage.get(address, monotonic_id)

        # Verify result round-trips through serialization unchanged
        self.assertEqual(result, value)

    def test_put_same_key_twice(self):
        """Test behavior when putting the same key multiple times."""
        key = "duplicate_key"
        value1 = "first value"
        value2 = "second value"

        # First put
        address1, id1 = self.storage.put(key, value1)
        retrieved1 = self.storage.get(address1, id1)
        self.assertEqual(retrieved1, value1)

        # should raise an error on second put
        with self.assertRaises(ValueError) as context:
            self.storage.put(key, value2)

        self.assertIn("already exists in the storage", str(context.exception))

    def test_large_object_rejection(self):
        """Test that objects exceeding max_object_size are rejected."""
        # Create an object larger than max_object_size
        large_data = "x" * (self.storage.max_object_size + 100)

        with self.assertRaises(ValueError) as context:
            self.storage.put("large_key", large_data)

        self.assertIn("exceeds max object size", str(context.exception))

    def test_buffer_overflow_and_cleanup(self):
        """Test behavior when buffer fills up and needs cleanup."""
        # Fill up the buffer with many small objects
        stored_items = []

        try:
            for i in range(1000):  # Try to store many items
                key = f"item_{i}"
                value = f"data_{i}" * 100  # Make it reasonably sized
                address, monotonic_id = self.storage.put(key, value)
                stored_items.append((key, value, address, monotonic_id))
        except MemoryError:
            print(f"Buffer filled after {len(stored_items)} items")

        # Verify that some items are still accessible.
        # Reading each item n_readers times marks it fully consumed,
        # which allows the writer to reclaim its space later.
        accessible_count = 0
        for key, original_value, address, monotonic_id in stored_items:
            for i in range(self.storage.n_readers):
                retrieved = self.storage.get(address, monotonic_id)
            if retrieved == original_value:
                accessible_count += 1

        self.assertEqual(accessible_count, len(stored_items))

        try:
            for i in range(len(stored_items), 1000):  # Try to store many items
                key = f"item_{i}"
                value = f"data_{i}" * 100  # Make it reasonably sized
                address, monotonic_id = self.storage.put(key, value)
                stored_items.append((key, value, address, monotonic_id))
        except MemoryError:
            print(f"Buffer filled after {len(stored_items)} items")

        # Verify that some items are still accessible; fully-read entries
        # may have been reclaimed by the second fill, so a ValueError here
        # is expected for some of the older items.
        for key, original_value, address, monotonic_id in stored_items:
            try:
                for i in range(self.storage.n_readers):
                    retrieved = self.storage.get(address, monotonic_id)
                if retrieved == original_value:
                    accessible_count += 1
            except ValueError as e:
                print(f"Error retrieving {key}: {e}")

        # some items from the first batch may still be accessible
        self.assertGreaterEqual(accessible_count, len(stored_items))

    def test_blocking_unread_object(self):
        """Test that a single unread object blocks reclamation of buffer space."""
        # Fill up the buffer with many small objects
        stored_items = []

        try:
            for i in range(1000):  # Try to store many items
                key = f"item_{i}"
                value = f"data_{i}" * 100  # Make it reasonably sized
                address, monotonic_id = self.storage.put(key, value)
                stored_items.append((key, value, address, monotonic_id))
        except MemoryError:
            print(f"Buffer filled after {len(stored_items)} items")

        # read all items except the first one
        # to simulate a blocking situation
        accessible_count = 0
        for key, original_value, address, monotonic_id in stored_items[1:]:
            for i in range(self.storage.n_readers):
                retrieved = self.storage.get(address, monotonic_id)
            if retrieved == original_value:
                accessible_count += 1

        self.assertEqual(accessible_count, len(stored_items) - 1)

        # The unread first item sits at the head of the ring, so the writer
        # cannot reclaim any space yet and this put should fail.
        try:
            key = f"item_{len(stored_items)}"
            value = f"data_{len(stored_items)}" * 100
            address, monotonic_id = self.storage.put(key, value)
        except MemoryError:
            print(f"Buffer filled after {len(stored_items)} items")

        # read the first item (n_readers times, so it becomes reclaimable)
        for i in range(self.storage.n_readers):
            key, original_value, address, monotonic_id = stored_items[0]
            retrieved = self.storage.get(address, monotonic_id)
            self.assertEqual(retrieved, original_value)

        # With the head released, further puts can now reclaim space.
        try:
            for i in range(len(stored_items), 1000):  # Try to store many items
                key = f"item_{i}"
                value = f"data_{i}" * 100  # Make it reasonably sized
                address, monotonic_id = self.storage.put(key, value)
                stored_items.append((key, value, address, monotonic_id))
        except MemoryError:
            print(f"Buffer filled after {len(stored_items)} items")

        # some items from the first batch may still be accessible
        self.assertGreaterEqual(len(stored_items), accessible_count + 10)

    def test_invalid_get_operations(self):
        """Test various invalid get operations."""
        # Test with non-existent address
        with self.assertRaises(ValueError):  # Could be various exceptions
            self.storage.get(99999, 1)

        # Store something first
        address, monotonic_id = self.storage.put("test", "value")

        # Test with wrong monotonic_id (stale handle to a reused slot)
        with self.assertRaises(ValueError) as context:
            self.storage.get(address, monotonic_id + 100)

        self.assertIn("has been modified or is invalid", str(context.exception))

    def test_clear_storage(self):
        """Test clearing the storage."""
        # Store some items
        for i in range(5):
            self.storage.put(f"item_{i}", f"value_{i}")

        # Clear the storage
        self.storage.clear()

        # Verify that all indices are empty
        self.assertEqual(len(self.storage.key_index), 0)
        self.assertEqual(len(self.storage.id_index), 0)
        self.assertEqual(len(self.storage.ring_buffer.metadata), 0)

        # Verify that new items can be added after clearing;
        # allocation restarts at address 0 with monotonic id 0.
        address, monotonic_id = self.storage.put("new_item", "new_value")
        self.assertIn("new_item", self.storage.key_index)
        self.assertEqual((address, monotonic_id), (0, 0))
|
||||
|
||||
|
||||
# Reader process function
|
||||
def reader_process(process_id, storage_handle, items_to_read):
    """Reader process that attaches to existing shared memory and verifies
    that every stored item can be read back intact."""
    reader_storage = SingleWriterShmObjectStorage.create_from_handle(storage_handle)

    print(f"Reader {process_id} started")

    errors = []

    for key, original_value, address, monotonic_id in items_to_read:
        # Jitter the read timing so readers interleave differently each run.
        time.sleep(random.random() / 100)
        try:
            retrieved_value = reader_storage.get(address, monotonic_id)
            # Verify data integrity against what the writer stored.
            assert retrieved_value == original_value
            print(f"Reader {process_id} retrieved {key}: {retrieved_value}")
        except Exception as e:
            errors.append((key, str(e), type(e).__name__))
|
||||
|
||||
|
||||
def run_multiprocess_example():
    """Run a minimal working example with real shared memory.

    A single writer stores a few plain-Python values, then n_readers
    child processes attach via the storage handle and read them back.
    """
    print("=== Minimal Object Storage Example ===")

    try:
        # Create storage instance
        ring_buffer = SingleWriterShmRingBuffer(
            data_buffer_size=1024 * 100,  # 100 KB data buffer
            create=True,
        )
        storage = SingleWriterShmObjectStorage(
            max_object_size=1024,
            n_readers=3,
            ring_buffer=ring_buffer,
            serde_class=MsgpackSerde,
            reader_lock=Lock(),
        )

        print(f"Created storage (writer: {storage.is_writer})")

        # Test basic data types
        test_data = [
            ("user_data", {"name": "Alice", "age": 30, "scores": [95, 87, 92]}),
            ("simple_string", "Hello, World!"),
            ("number", 42),
            ("list_data", [1, 2, 3, "four", 5.0]),
        ]

        stored_items = []

        # Store all data
        for key, value in test_data:
            print(f"Storing {key}: {value}")
            address, monotonic_id = storage.put(key, value)
            stored_items.append((key, value, address, monotonic_id))
            print(f"  -> Stored at address {address}, ID {monotonic_id}")

        print("\n--- Retrieving Data ---")
        processes = []
        handle = storage.handle()
        # initialize lock for reader processes (the writer's lock is not
        # inheritable through the handle, so give readers a fresh one)
        handle.reader_lock = Lock()
        for i in range(storage.n_readers):
            p = multiprocessing.Process(
                target=reader_process, args=(i, handle, stored_items)
            )
            processes.append(p)
            p.start()

        # Reap readers; kill any that hang so the demo always terminates.
        for p in processes:
            p.join(timeout=10)
            if p.is_alive():
                p.terminate()
                p.join()

    except Exception as e:
        print(f"Error in minimal example: {e}")
        traceback.print_exc()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the minimal example first
    run_multiprocess_example()
    print("\n" + "=" * 50 + "\n")

    # Run the test suite (exit=False keeps unittest from calling
    # sys.exit so anything after this line would still run)
    print("Running comprehensive test suite...")
    unittest.main(verbosity=2, exit=False)
|
||||
140
tests/distributed/test_symm_mem_allreduce.py
Normal file
140
tests/distributed/test_symm_mem_allreduce.py
Normal file
@@ -0,0 +1,140 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import queue
|
||||
import random
|
||||
import typing
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.multiprocessing as mp
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
|
||||
from vllm.distributed.device_communicators.cuda_communicator import CudaCommunicator
|
||||
from vllm.distributed.parallel_state import (
|
||||
get_tp_group,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel,
|
||||
)
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.system_utils import update_environment_variables
|
||||
|
||||
# Seed both RNGs so the randomly generated tensors are reproducible.
torch.manual_seed(42)
random.seed(44)

# Number of elements in each all-reduce test tensor.
test_size_elements = 1024 * 1024
|
||||
|
||||
|
||||
def symm_mem_allreduce_worker(local_rank: int, world_size: int, q: mp.Queue):
    """Per-GPU worker that validates symmetric-memory all-reduce against
    the reference torch.distributed all-reduce.

    Runs under mp.spawn, so skip conditions are reported through `q`
    instead of pytest.skip (which cannot cross process boundaries).
    """
    monkeypatch = pytest.MonkeyPatch()
    config = VllmConfig(parallel_config=ParallelConfig(tensor_parallel_size=world_size))

    with monkeypatch.context() as m, set_current_vllm_config(config):
        # Let each spawned rank see all GPUs and pick its own by index.
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        dtype = torch.bfloat16
        device = torch.device(f"cuda:{local_rank}")
        torch.cuda.set_device(device)
        torch.set_default_device(device)
        torch.set_default_dtype(dtype)
        # Fake a torchrun-style environment for distributed init.
        update_environment_variables(
            {
                "RANK": str(local_rank),
                "LOCAL_RANK": str(local_rank),
                "WORLD_SIZE": str(world_size),
                "MASTER_ADDR": "localhost",
                "MASTER_PORT": "12345",
            }
        )

        init_distributed_environment()
        initialize_model_parallel(tensor_model_parallel_size=world_size)

        cuda_communicator = typing.cast(
            CudaCommunicator, get_tp_group().device_communicator
        )
        symm_mem_comm = cuda_communicator.symm_mem_comm
        if symm_mem_comm is None or symm_mem_comm.disabled:
            # can't use skip under multiprocessing
            q.put("SymmMemCommunicator is not available or disabled.")
            return

        inp_direct_symm_mem = torch.randint(
            1, 23, (test_size_elements,), dtype=dtype, device=device
        )
        if not symm_mem_comm.should_use_symm_mem(inp_direct_symm_mem):
            # can't use skip under multiprocessing
            q.put("SymmMemCommunicator isn't used for this world and input size.")
            return

        # Clone the input first: all_reduce may modify its argument.
        original_inp_direct_symm_mem = inp_direct_symm_mem.clone()
        out_direct_symm_mem = symm_mem_comm.all_reduce(inp_direct_symm_mem)
        assert out_direct_symm_mem is not None

        # Reference result via the regular process-group all-reduce.
        group = get_tp_group().device_group
        dist.all_reduce(original_inp_direct_symm_mem, group=group)
        torch.testing.assert_close(
            out_direct_symm_mem, original_inp_direct_symm_mem, atol=2.5, rtol=0.1
        )

        # Test tensor_model_parallel_all_reduce which should use symm_mem
        inp_tensor_parallel = torch.randint(
            -23, 1, (test_size_elements,), dtype=dtype, device=device
        )
        original_inp_tensor_parallel = inp_tensor_parallel.clone()
        out_tensor_parallel = tensor_model_parallel_all_reduce(inp_tensor_parallel)
        dist.all_reduce(original_inp_tensor_parallel, group=group)
        torch.testing.assert_close(
            out_tensor_parallel, original_inp_tensor_parallel, atol=2.5, rtol=0.1
        )
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not current_platform.is_cuda(),
    reason="SymmMemAllreduce is only available for CUDA platforms.",
)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pipeline_parallel_size", [1])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
def test_symm_mem_allreduce(
    monkeypatch: pytest.MonkeyPatch, tp_size, pipeline_parallel_size
):
    """Spawn one worker per rank; skip if any worker reported a skip reason."""
    world_size = tp_size * pipeline_parallel_size
    if world_size > torch.cuda.device_count():
        pytest.skip("Not enough GPUs to run the test.")
    q = mp.get_context("spawn").Queue()
    # mp.spawn prepends the rank, so workers get (local_rank, world_size, q).
    mp.spawn(symm_mem_allreduce_worker, args=(world_size, q), nprocs=world_size)
    try:
        # Workers only enqueue a message when they had to skip.
        val = q.get(timeout=1)
    except queue.Empty:
        val = None
    finally:
        cleanup_dist_env_and_memory()
    if val is not None:
        pytest.skip(val)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not current_platform.is_cuda(),
    reason="SymmMemAllreduce is only available for CUDA platforms.",
)
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
def test_dp_with_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch):
    """Smoke-test DP=2 x TP=2 engine construction with symm-mem all-reduce."""
    world_size = 4
    if world_size > torch.cuda.device_count():
        pytest.skip("Not enough GPUs to run the test.")
    # Verify that the DataParallel runs without error
    engine_args = EngineArgs(
        model="distilbert/distilgpt2",
        enforce_eager=True,
        enable_prefix_caching=True,
        data_parallel_size=2,
        tensor_parallel_size=2,
        data_parallel_backend="mp",
    )
    # Constructing the engine is the test: it must not raise.
    LLMEngine.from_engine_args(engine_args)
|
||||
69
tests/distributed/test_torchrun_example.py
Normal file
69
tests/distributed/test_torchrun_example.py
Normal file
@@ -0,0 +1,69 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# unit test for `examples/offline_inference/torchrun_example.py`
|
||||
import os
|
||||
import random
|
||||
|
||||
import torch.distributed as dist
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed.parallel_state import get_world_group
|
||||
|
||||
# This file runs under torchrun, so module level is the "main" of each rank.
dist.init_process_group(backend="gloo")

# Create prompts
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# set different `gpu_memory_utilization` and `swap_space` for different ranks,
# to test if all ranks agree on the same kv cache configuration.
llm = LLM(
    model="facebook/opt-125m",
    tensor_parallel_size=2,
    pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)),
    distributed_executor_backend="external_launcher",
    gpu_memory_utilization=random.uniform(0.7, 0.9),
    swap_space=random.randint(1, 4),
    seed=0,
)

outputs = llm.generate(prompts, sampling_params)

# CPU (gloo) group used for object broadcasts in the checks below.
cpu_group = get_world_group().cpu_group

torch_rank = dist.get_rank(group=cpu_group)
|
||||
|
||||
|
||||
def test_consistent_across_ranks(obj):
    """Broadcast `obj` from rank 0 and assert every other rank holds an
    equal value."""
    if torch_rank != 0:
        received = [None]
        dist.broadcast_object_list(received, src=0, group=cpu_group)
        assert received[0] == obj
    else:
        dist.broadcast_object_list([obj], src=0, group=cpu_group)
|
||||
|
||||
|
||||
# All ranks must agree on the computed KV cache sizes despite the
# deliberately different gpu_memory_utilization/swap_space above.
test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)

# make sure we can access the model parameters from the calling process
# of the `LLM` instance.
params = list(
    llm.llm_engine.model_executor.driver_worker.worker.model_runner.model.parameters()
)
test_consistent_across_ranks(len(params))

# all ranks should have the same outputs
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    test_consistent_across_ranks(prompt)
    test_consistent_across_ranks(generated_text)
    print(f"Rank {torch_rank}, Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
76
tests/distributed/test_torchrun_example_moe.py
Normal file
76
tests/distributed/test_torchrun_example_moe.py
Normal file
@@ -0,0 +1,76 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# unit test for `examples/offline_inference/torchrun_example.py`
|
||||
import os
|
||||
import random
|
||||
|
||||
import torch.distributed as dist
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed.parallel_state import get_tp_group, get_world_group
|
||||
|
||||
# This file runs under torchrun, so module level is the "main" of each rank.
dist.init_process_group(backend="gloo")

# Create prompts
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
] * 10
dp_size = int(os.getenv("DP_SIZE", "1"))
dp_rank = int(os.getenv("DP_RANK", "0"))

if dp_size > 1:
    # distribute the prompts across the data parallel ranks
    prompts = [prompt for idx, prompt in enumerate(prompts) if idx % dp_size == dp_rank]

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# set different `gpu_memory_utilization` and `swap_space` for different ranks,
# to test if all ranks agree on the same kv cache configuration.
llm = LLM(
    model="microsoft/Phi-mini-MoE-instruct",
    tensor_parallel_size=int(os.getenv("TP_SIZE", "1")),
    pipeline_parallel_size=int(os.getenv("PP_SIZE", "1")),
    enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
    distributed_executor_backend="external_launcher",
    gpu_memory_utilization=random.uniform(0.7, 0.9),
    swap_space=random.randint(1, 4),
    seed=0,
)

outputs = llm.generate(prompts, sampling_params)

# With DP > 1 each DP rank has different prompts, so outputs are only
# comparable within a TP group; otherwise compare across the world.
group = get_world_group() if dp_size == 1 else get_tp_group()
cpu_group = group.cpu_group
group_rank = dist.get_rank(group=cpu_group)
|
||||
|
||||
|
||||
def test_consistent_across_ranks(obj):
    """Broadcast `obj` from the group's first rank and assert every other
    rank holds an equal value."""
    src = group.ranks[0]
    if group_rank != 0:
        received = [None]
        dist.broadcast_object_list(received, src=src, group=cpu_group)
        assert received[0] == obj
    else:
        dist.broadcast_object_list([obj], src=src, group=cpu_group)
|
||||
|
||||
|
||||
# All ranks must agree on the computed KV cache sizes despite the
# deliberately different gpu_memory_utilization/swap_space above.
test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)

# make sure we can access the model parameters from the calling process
# of the `LLM` instance.
params = list(
    llm.llm_engine.model_executor.driver_worker.worker.model_runner.model.parameters()
)
test_consistent_across_ranks(len(params))

# all ranks should have the same outputs
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    test_consistent_across_ranks(prompt)
    test_consistent_across_ranks(generated_text)
    print(f"Rank {group_rank}, Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
139
tests/distributed/test_utils.py
Normal file
139
tests/distributed/test_utils.py
Normal file
@@ -0,0 +1,139 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import socket
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
|
||||
from vllm.distributed.utils import StatelessProcessGroup
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
from vllm.utils.system_utils import update_environment_variables
|
||||
from vllm.utils.torch_utils import cuda_device_count_stateless
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
|
||||
|
||||
@ray.remote
class _CUDADeviceCountStatelessTestActor:
    """Ray actor that mutates CUDA_VISIBLE_DEVICES in its own process so the
    test can observe the effect on cuda_device_count_stateless()."""

    def get_count(self):
        # Must reflect the *current* CUDA_VISIBLE_DEVICES, not a cached value.
        return cuda_device_count_stateless()

    def set_cuda_visible_devices(self, cuda_visible_devices: str):
        update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})

    def get_cuda_visible_devices(self):
        return envs.CUDA_VISIBLE_DEVICES
|
||||
|
||||
|
||||
def test_cuda_device_count_stateless():
    """Test that cuda_device_count_stateless changes return value if
    CUDA_VISIBLE_DEVICES is changed."""
    # num_gpus=2 makes Ray set CUDA_VISIBLE_DEVICES to two devices for the actor.
    actor = _CUDADeviceCountStatelessTestActor.options(  # type: ignore
        num_gpus=2
    ).remote()
    assert len(sorted(ray.get(actor.get_cuda_visible_devices.remote()).split(","))) == 2
    assert ray.get(actor.get_count.remote()) == 2
    # Shrink visibility to one device, then to none; count must follow.
    ray.get(actor.set_cuda_visible_devices.remote("0"))
    assert ray.get(actor.get_count.remote()) == 1
    ray.get(actor.set_cuda_visible_devices.remote(""))
    assert ray.get(actor.get_count.remote()) == 0
|
||||
|
||||
|
||||
def cpu_worker(rank, WORLD_SIZE, port1, port2):
    """Exercise broadcast_obj over two overlapping StatelessProcessGroups.

    pg1 spans all WORLD_SIZE ranks; pg2 spans ranks 0-2 only.
    """
    pg1 = StatelessProcessGroup.create(
        host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
    )
    if rank <= 2:
        pg2 = StatelessProcessGroup.create(
            host="127.0.0.1", port=port2, rank=rank, world_size=3
        )
    data = torch.tensor([rank])
    # Every rank receives rank 2's tensor over pg1.
    data = pg1.broadcast_obj(data, src=2)
    assert data.item() == 2
    if rank <= 2:
        data = torch.tensor([rank + 1])
        # Within pg2, rank 2 broadcasts rank + 1 == 3.
        data = pg2.broadcast_obj(data, src=2)
        assert data.item() == 3
        pg2.barrier()
    pg1.barrier()
|
||||
|
||||
|
||||
def gpu_worker(rank, WORLD_SIZE, port1, port2):
    """Exercise PyNcclCommunicator all-reduce over two overlapping groups.

    pg1 spans all 4 ranks; after its all-reduce every rank holds
    0+1+2+3 == 6. Ranks 0-2 then all-reduce again over pg2 (6*3 == 18),
    while rank 3 keeps 6.
    """
    torch.cuda.set_device(rank)
    pg1 = StatelessProcessGroup.create(
        host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
    )
    pynccl1 = PyNcclCommunicator(pg1, device=rank)
    if rank <= 2:
        pg2 = StatelessProcessGroup.create(
            host="127.0.0.1", port=port2, rank=rank, world_size=3
        )
        pynccl2 = PyNcclCommunicator(pg2, device=rank)
    data = torch.tensor([rank]).cuda()
    pynccl1.all_reduce(data)
    # Barrier + synchronize before reusing the tensor in the second reduce.
    pg1.barrier()
    torch.cuda.synchronize()
    if rank <= 2:
        pynccl2.all_reduce(data)
        pg2.barrier()
        torch.cuda.synchronize()
    item = data[0].item()
    print(f"rank: {rank}, item: {item}")
    if rank == 3:
        assert item == 6
    else:
        assert item == 18
|
||||
|
||||
|
||||
def broadcast_worker(rank, WORLD_SIZE, port1, port2):
    """Rank 2 broadcasts a string object; all other ranks must receive it."""
    group = StatelessProcessGroup.create(
        host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
    )
    if rank != 2:
        received = group.broadcast_obj(None, src=2)
        assert received == "secret"
    else:
        group.broadcast_obj("secret", src=2)
    group.barrier()
|
||||
|
||||
|
||||
def allgather_worker(rank, WORLD_SIZE, port1, port2):
    """Each rank contributes its rank; the gather must yield [0..WORLD_SIZE)."""
    group = StatelessProcessGroup.create(
        host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
    )
    gathered = group.all_gather_obj(rank)
    assert gathered == list(range(WORLD_SIZE))
    group.barrier()
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="This test is flaky and prone to hang.")
@multi_gpu_test(num_gpus=4)
@pytest.mark.parametrize(
    "worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker]
)
def test_stateless_process_group(worker):
    """Fork WORLD_SIZE processes running `worker` and require clean exits."""
    port1 = get_open_port()
    # Keep port1 bound while asking for the second port so get_open_port()
    # cannot hand back the same number twice.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", port1))
        port2 = get_open_port()
    WORLD_SIZE = 4
    from multiprocessing import get_context

    # fork keeps the worker functions picklable-free and inherits imports.
    ctx = get_context("fork")
    processes = []
    for i in range(WORLD_SIZE):
        rank = i
        processes.append(
            ctx.Process(target=worker, args=(rank, WORLD_SIZE, port1, port2))
        )
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    # Any assertion failure in a worker surfaces as a nonzero exit code.
    for p in processes:
        assert not p.exitcode
    print("All processes finished.")
|
||||
Reference in New Issue
Block a user