[bugfix_v0.11.0]cancel tokenize for layerwise_proxy (#3913)

### What this PR does / why we need it?
Remove the tokenization step from layerwise_proxy (the proxy no longer loads a tokenizer).
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
Verified by CI.

Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
This commit is contained in:
wangxiaoteng888
2025-10-30 23:55:04 +08:00
committed by GitHub
parent af7a56550b
commit 38afd2c9cb
2 changed files with 1 additions and 8 deletions

View File

@@ -98,7 +98,6 @@ from typing import List
import httpx
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from transformers import AutoTokenizer
from vllm.logger import init_logger
logger = init_logger(__name__)
@@ -153,8 +152,6 @@ class ProxyState:
heapq.heapify(self.decoder_heap)
self.req_id_future = {}
self.req_data_dict = {}
self.tokenizer = AutoTokenizer.from_pretrained(
global_args.tokenizer_dir)
def _update_prefiller_priority(self, server_idx: int):
"""Update the priority of a prefiller server in the heap."""
@@ -281,10 +278,6 @@ def parse_args():
nargs="+",
default=["localhost"])
parser.add_argument("--decoder-ports", type=int, nargs="+", default=[8002])
parser.add_argument("--tokenizer-dir",
type=str,
default="/mnt/weight/Qwen3-235B-A22B-W8A8",
help="Maximum number of retries for HTTP requests")
parser.add_argument("--max-retries",
type=int,
default=3,

View File

@@ -656,7 +656,7 @@ class MooncakeLayerwiseConnectorWorker:
self.device_id = device_ids[self.tp_rank] # type: ignore
if vllm_config.kv_transfer_config.get_from_extra_config(
'use_ascend_direct', True):
'use_ascend_direct', False):
hostname = self.side_channel_host
else:
hostname = f"{self.side_channel_host}:0:npu_{self.device_id}"