[Cherry-pick] [0.11.0] pd proxy support ipv6 and fix proxy (#4242)

### What this PR does / why we need it?
Add IPv6 support to the PD proxy. The mooncake connector now checks whether
an IPv6 address is used and, if so, notifies the user that IPv6 is not
currently supported.

---------

Signed-off-by: liziyu <liziyu16@huawei.com>
This commit is contained in:
liziyu
2025-11-18 16:33:00 +08:00
committed by GitHub
parent 378e92a2a2
commit ddf3e75800
4 changed files with 29 additions and 5 deletions

View File

@@ -88,6 +88,7 @@ import argparse
import asyncio
import functools
import heapq
import ipaddress
import os
import sys
import threading
@@ -116,6 +117,12 @@ class ServerState:
self.host = host
self.port = port
self.url = f'http://{host}:{port}/v1'
try:
ip = ipaddress.ip_address(self.host)
if isinstance(ip, ipaddress.IPv6Address):
self.url = f'http://[{host}]:{port}/v1'
except Exception:
pass
self.client = httpx.AsyncClient(timeout=None,
base_url=self.url,
limits=httpx.Limits(
@@ -356,6 +363,9 @@ async def send_request_to_service(client: httpx.AsyncClient,
req_data = req_data.copy()
req_data["stream"] = False
req_data["max_tokens"] = 1
req_data["min_tokens"] = 1
if "max_completion_tokens" in req_data:
req_data["max_completion_tokens"] = 1
if "stream_options" in req_data:
del req_data["stream_options"]
headers = {

View File

@@ -88,6 +88,7 @@ import argparse
import asyncio
import functools
import heapq
import ipaddress
import json
import os
import sys
@@ -118,6 +119,12 @@ class ServerState:
self.host = host
self.port = port
self.url = f'http://{host}:{port}/v1'
try:
ip = ipaddress.ip_address(self.host)
if isinstance(ip, ipaddress.IPv6Address):
self.url = f'http://[{host}]:{port}/v1'
except Exception:
pass
self.client = httpx.AsyncClient(timeout=None,
base_url=self.url,
limits=httpx.Limits(
@@ -366,6 +373,8 @@ async def send_request_to_service(client: httpx.AsyncClient,
req_data["stream"] = False
req_data["max_tokens"] = 1
req_data["min_tokens"] = 1
if "max_completion_tokens" in req_data:
req_data["max_completion_tokens"] = 1
if "stream_options" in req_data:
del req_data["stream_options"]
headers = {

View File

@@ -1,3 +1,4 @@
import ipaddress
import threading
from typing import Optional
@@ -8,6 +9,15 @@ _global_te_lock = threading.Lock()
def get_global_te(hostname: str, device_name: Optional[str]):
try:
ip = ipaddress.ip_address(hostname)
if isinstance(ip, ipaddress.IPv6Address):
raise RuntimeError(
"The backend of mooncake's Ascend Direct Xfer Library currently does not support IPv6."
)
except ValueError:
pass
global _global_te
if _global_te is None:
with _global_te_lock:

View File

@@ -162,11 +162,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
# Whether to enable msMonitor tool to monitor the performance of vllm-ascend.
"MSMONITOR_USE_DAEMON":
lambda: bool(int(os.getenv("MSMONITOR_USE_DAEMON", '0'))),
# Timeout (in seconds) for delayed KVCache block release. In the prefill
# node, if a request is marked for delayed KV block release and the blocks
# are not freed within this timeout, they will be forcibly released.
"VLLM_ASCEND_KVCACHE_DELAY_FREE_TIMEOUT":
lambda: int(os.getenv("VLLM_ASCEND_KVCACHE_DELAY_FREE_TIMEOUT", 250)),
"VLLM_ASCEND_ENABLE_MLAPO":
lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLAPO", '0'))),
# Whether to enable transpose weight and cast format to FRACTAL_NZ.