[main][Bugfix] Remove the ZMQ communication setup on the D node (#4926)
In the PD separation scenario, the D node does not need to perform get
operations, and therefore does not need to create ZeroMQ (ZMQ)
communication.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
This commit is contained in:
@@ -55,7 +55,7 @@ class AscendStoreConnector(KVConnectorBase_V1):
|
|||||||
)
|
)
|
||||||
|
|
||||||
assert self.connector_worker is not None
|
assert self.connector_worker is not None
|
||||||
if vllm_config.parallel_config.rank == 0:
|
if vllm_config.parallel_config.rank == 0 and self.kv_role != "kv_consumer":
|
||||||
self.lookup_server = LookupKeyServer(self.connector_worker,
|
self.lookup_server = LookupKeyServer(self.connector_worker,
|
||||||
vllm_config,
|
vllm_config,
|
||||||
self.use_layerwise)
|
self.use_layerwise)
|
||||||
|
|||||||
@@ -20,13 +20,14 @@ from vllm_ascend.distributed.kvpool.config_data import (
|
|||||||
class KVPoolScheduler:
|
class KVPoolScheduler:
|
||||||
|
|
||||||
def __init__(self, vllm_config: "VllmConfig", use_layerwise):
|
def __init__(self, vllm_config: "VllmConfig", use_layerwise):
|
||||||
self.client = LookupKeyClient(vllm_config)
|
|
||||||
self.use_layerwise = use_layerwise
|
self.use_layerwise = use_layerwise
|
||||||
self.kv_role = vllm_config.kv_transfer_config.kv_role
|
self.kv_role = vllm_config.kv_transfer_config.kv_role
|
||||||
self.consumer_is_to_load = vllm_config.kv_transfer_config.kv_connector_extra_config.get(
|
self.consumer_is_to_load = vllm_config.kv_transfer_config.kv_connector_extra_config.get(
|
||||||
"consumer_is_to_load", False)
|
"consumer_is_to_load", False)
|
||||||
self.load_async = vllm_config.kv_transfer_config.kv_connector_extra_config.get(
|
self.load_async = vllm_config.kv_transfer_config.kv_connector_extra_config.get(
|
||||||
"load_async", False)
|
"load_async", False)
|
||||||
|
self.client = LookupKeyClient(
|
||||||
|
vllm_config) if self.kv_role != "kv_consumer" else None
|
||||||
# request_id -> (vllm cached tokens, kvpool cached tokens)
|
# request_id -> (vllm cached tokens, kvpool cached tokens)
|
||||||
self.load_specs: dict[str, LoadSpec] = {}
|
self.load_specs: dict[str, LoadSpec] = {}
|
||||||
self.pcp_size = getattr(vllm_config.parallel_config,
|
self.pcp_size = getattr(vllm_config.parallel_config,
|
||||||
@@ -74,8 +75,8 @@ class KVPoolScheduler:
|
|||||||
else:
|
else:
|
||||||
token_len = len(request.prompt_token_ids)
|
token_len = len(request.prompt_token_ids)
|
||||||
|
|
||||||
num_external_hit_tokens = self.client.lookup(token_len,
|
num_external_hit_tokens = self.client.lookup( # type: ignore[union-attr]
|
||||||
request.block_hashes)
|
token_len, request.block_hashes)
|
||||||
|
|
||||||
if num_external_hit_tokens == request.num_tokens:
|
if num_external_hit_tokens == request.num_tokens:
|
||||||
num_external_hit_tokens -= 1
|
num_external_hit_tokens -= 1
|
||||||
|
|||||||
Reference in New Issue
Block a user