From 2dac18afeaca01776ffe3b406f67ab7f1b4351f7 Mon Sep 17 00:00:00 2001 From: DreamerLeader <88812830+DreamerLeader@users.noreply.github.com> Date: Wed, 4 Feb 2026 16:35:41 +0800 Subject: [PATCH] [Bugfix]Fix of Pooling Code and Update of Pooling Usage Guide (#6126) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? Fix of Pooling Code and Update of Pooling Usage Guide ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? PR: [[Bugfix]Fixed precision issues caused by pooled request pooling](https://github.com/vllm-project/vllm-ascend/pull/6049) (https://github.com/vllm-project/vllm-ascend/pull/6049), ready for review - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/d68209402ddab3f54a09bc1f4de9a9495a283b60 --------- Signed-off-by: 房建伟 Signed-off-by: fangjianwei Signed-off-by: DreamerLeader <88812830+DreamerLeader@users.noreply.github.com> Co-authored-by: 房建伟 Co-authored-by: fangjianwei --- docs/source/user_guide/feature_guide/kv_pool.md | 8 +++++++- .../kv_transfer/kv_pool/ascend_store/pool_scheduler.py | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/source/user_guide/feature_guide/kv_pool.md b/docs/source/user_guide/feature_guide/kv_pool.md index 63f3d26f..fe2c674c 100644 --- a/docs/source/user_guide/feature_guide/kv_pool.md +++ b/docs/source/user_guide/feature_guide/kv_pool.md @@ -305,7 +305,7 @@ Long question: curl -s http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ "model": "/xxxxx/Qwen2.5-7B-Instruct", "prompt": "Given the accelerating impacts of climate change—including rising sea levels, increasing frequency of extreme weather events, loss of biodiversity, and adverse effects on agriculture and human health—there is an urgent need for a robust, globally coordinated response. 
However, international efforts are complicated by a range of factors: economic disparities between high-income and low-income countries, differing levels of industrialization, varying access to clean energy technologies, and divergent political systems that influence climate policy implementation. In this context, how can global agreements like the Paris Accord be redesigned or strengthened to not only encourage but effectively enforce emission reduction targets? Furthermore, what mechanisms can be introduced to promote fair and transparent technology transfer, provide adequate financial support for climate adaptation in vulnerable regions, and hold nations accountable without exacerbating existing geopolitical tensions or disproportionately burdening those with historically lower emissions?", "max_completion_tokens": 256, "temperature":0.0 }' ``` -### Colocation Scenario +### PD-Mixed Inference #### 1.Run Mixed Department Script @@ -363,3 +363,9 @@ Long question: ```shell curl -s http://localhost:8100/v1/completions -H "Content-Type: application/json" -d '{ "model": "/xxxxx/Qwen2.5-7B-Instruct", "prompt": "Given the accelerating impacts of climate change—including rising sea levels, increasing frequency of extreme weather events, loss of biodiversity, and adverse effects on agriculture and human health—there is an urgent need for a robust, globally coordinated response. However, international efforts are complicated by a range of factors: economic disparities between high-income and low-income countries, differing levels of industrialization, varying access to clean energy technologies, and divergent political systems that influence climate policy implementation. In this context, how can global agreements like the Paris Accord be redesigned or strengthened to not only encourage but effectively enforce emission reduction targets? 
Furthermore, what mechanisms can be introduced to promote fair and transparent technology transfer, provide adequate financial support for climate adaptation in vulnerable regions, and hold nations accountable without exacerbating existing geopolitical tensions or disproportionately burdening those with historically lower emissions?", "max_completion_tokens": 256, "temperature":0.0 }' ``` + +Note: For MooncakeStore, it is recommended to perform a warm-up phase before running actual performance benchmarks. + +This is because HCCL one-sided communication connections are created lazily after the instance is launched when Device-to-Device communication is involved. Currently, full-mesh connections between all devices are required. Establishing these connections introduces a one-time latency overhead and persistent device memory consumption (4 MB of device memory per connection). + +**For warm-up, it is recommended to issue requests with an input sequence length of 8K and an output sequence length of 1, with the total number of requests being 2–3× the number of devices (cards/dies).** diff --git a/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/pool_scheduler.py b/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/pool_scheduler.py index 51e7db70..8b802038 100644 --- a/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/pool_scheduler.py +++ b/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/pool_scheduler.py @@ -159,6 +159,7 @@ class KVPoolScheduler: self._request_trackers.pop(finished_req_id, None) self._unfinished_requests.pop(finished_req_id, None) self._unfinished_request_ids.discard(finished_req_id) + self._preempted_req_ids.discard(finished_req_id) for req_id in scheduler_output.preempted_req_ids: self._preempted_req_ids.update(scheduler_output.preempted_req_ids)