[1/N][CI] Add multi node test (#3359)
### What this PR does / why we need it? This PR adds a multi-node test. As a first step, it adds a `deepseek-v3` dp+tp+ep test. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
68
tests/e2e/multi_node/config/utils.py
Normal file
68
tests/e2e/multi_node/config/utils.py
Normal file
@@ -0,0 +1,68 @@
|
||||
import os
|
||||
import socket
|
||||
import subprocess
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import psutil
|
||||
|
||||
|
||||
def get_leader_ip() -> str:
    """Resolve the host named by ``LWS_LEADER_ADDRESS`` to an IPv4 address.

    Returns:
        The address string produced by ``socket.gethostbyname`` for the
        value of the ``LWS_LEADER_ADDRESS`` environment variable.

    Raises:
        AssertionError: If ``LWS_LEADER_ADDRESS`` is unset or empty.
        OSError: Propagated from ``socket.gethostbyname`` when the name
            cannot be resolved.
    """
    leader_dns = os.getenv("LWS_LEADER_ADDRESS")
    # Raise explicitly instead of using an `assert` statement: asserts are
    # stripped under `python -O`, which would let None reach gethostbyname.
    # AssertionError is kept so callers see the same exception type as before.
    # An empty value is rejected as well, because gethostbyname("") resolves
    # to the wildcard address rather than failing.
    if not leader_dns:
        raise AssertionError("cannot find leader address")
    return socket.gethostbyname(leader_dns)
|
||||
|
||||
|
||||
def get_avaliable_port(start_port: int = 6000, end_port: int = 7000) -> int:
    """Return the first TCP port in ``range(start_port, end_port)`` that binds.

    Each candidate is probed by binding a fresh IPv4 stream socket to the
    empty host and that port. The probe socket is closed before returning,
    so the port is only known to have been free at probe time; it is not
    reserved for the caller.

    Args:
        start_port: First port to try (inclusive).
        end_port: Upper bound of the scan (exclusive).

    Returns:
        The first port number whose bind succeeded.

    Raises:
        RuntimeError: If no port in the range could be bound.
    """
    for candidate in range(start_port, end_port):
        probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            probe.bind(("", candidate))
        except OSError:
            # Port is taken or otherwise unusable; move on to the next one.
            pass
        else:
            return candidate
        finally:
            probe.close()
    raise RuntimeError("No available port found")
|
||||
|
||||
|
||||
def get_net_interface(ip: Optional[str] = None) -> Optional[Tuple[str, str]]:
    """Find the network interface that carries a given IPv4 address.

    Args:
        ip: Address to look up. When ``None``, the first address printed by
            ``hostname -I`` is used instead.

    Returns:
        An ``(ip, interface_name)`` tuple for the first interface reported by
        ``psutil.net_if_addrs()`` that has an ``AF_INET`` address equal to
        ``ip``. ``None`` if ``hostname -I`` printed no addresses or no
        interface matches.
    """
    if ip is None:
        raw_output = subprocess.check_output(["hostname", "-I"])
        candidates = raw_output.decode().strip().split()
        if not candidates:
            return None
        ip = candidates[0]

    for nic_name, nic_addrs in psutil.net_if_addrs().items():
        has_match = any(
            entry.family == socket.AF_INET and entry.address == ip
            for entry in nic_addrs
        )
        if has_match:
            return ip, nic_name
    return None
|
||||
|
||||
|
||||
def get_default_envs() -> dict[str, str]:
    """Build the default network and system environment variables.

    The IP and interface name come from ``get_net_interface()`` called with
    no arguments. The IP is stored under ``HCCL_IF_IP``, the interface name
    under the three ``*_SOCKET_IFNAME`` keys, and the remaining entries are
    fixed string constants.

    Returns:
        A new dict mapping environment variable names to string values.

    Raises:
        RuntimeError: If ``get_net_interface()`` returns ``None``.
    """
    located = get_net_interface()
    if located is None:
        raise RuntimeError("Failed to get default network IP and interface")
    host_ip, iface = located

    # Entries are added in the same order as the original literal.
    envs: dict[str, str] = {"HCCL_IF_IP": host_ip}
    for ifname_key in (
        "GLOO_SOCKET_IFNAME",
        "TP_SOCKET_IFNAME",
        "HCCL_SOCKET_IFNAME",
    ):
        envs[ifname_key] = iface
    envs.update(
        {
            "OMP_PROC_BIND": "false",
            "OMP_NUM_THREADS": "100",
            "VLLM_USE_V1": "1",
            "HCCL_BUFFSIZE": "1024",
            "VLLM_USE_MODELSCOPE": "true",
            "NUMEXPR_MAX_THREADS": "100",
        }
    )
    return envs
|
||||
|
||||
|
||||
def generate_ranktable():
    """Placeholder with no implementation yet; does nothing and returns None."""
    return None
|
||||
Reference in New Issue
Block a user