diff --git a/examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py b/examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py
index ea0c5be..67c34ee 100644
--- a/examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py
+++ b/examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py
@@ -98,7 +98,6 @@ from typing import List
 import httpx
 from fastapi import FastAPI, Request
 from fastapi.responses import StreamingResponse
-from transformers import AutoTokenizer
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -153,8 +152,6 @@ class ProxyState:
         heapq.heapify(self.decoder_heap)
         self.req_id_future = {}
         self.req_data_dict = {}
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            global_args.tokenizer_dir)
 
     def _update_prefiller_priority(self, server_idx: int):
         """Update the priority of a prefiller server in the heap."""
@@ -281,10 +278,6 @@ def parse_args():
                         nargs="+",
                         default=["localhost"])
     parser.add_argument("--decoder-ports", type=int, nargs="+", default=[8002])
-    parser.add_argument("--tokenizer-dir",
-                        type=str,
-                        default="/mnt/weight/Qwen3-235B-A22B-W8A8",
-                        help="Maximum number of retries for HTTP requests")
     parser.add_argument("--max-retries",
                         type=int,
                         default=3,
diff --git a/vllm_ascend/distributed/mooncake_layerwise_connector.py b/vllm_ascend/distributed/mooncake_layerwise_connector.py
index e500f53..7f7c464 100644
--- a/vllm_ascend/distributed/mooncake_layerwise_connector.py
+++ b/vllm_ascend/distributed/mooncake_layerwise_connector.py
@@ -656,7 +656,7 @@ class MooncakeLayerwiseConnectorWorker:
         self.device_id = device_ids[self.tp_rank]  # type: ignore
 
         if vllm_config.kv_transfer_config.get_from_extra_config(
-                'use_ascend_direct', True):
+                'use_ascend_direct', False):
             hostname = self.side_channel_host
         else:
             hostname = f"{self.side_channel_host}:0:npu_{self.device_id}"