[Bugfix] Fix multi-instance serving OOM on single card (#7427)
### What this PR does / why we need it?
Fix https://github.com/vllm-project/vllm-ascend/issues/7308.
Subtract `init_non_torch_memory` (which may already be occupied by the first
instance on the card) from the total `non_torch_memory` when calculating
`available_kv_cache_memory`. In other words, directly use
`non_torch_memory_increase` (already contained in `non_kv_cache_memory`) to
calculate `available_kv_cache_memory`.
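A minimal sketch of the accounting change (variable names mirror the worker code in the diff at the end of this description):

```python
# Old: the memory freed by empty_cache() is subtracted on top of
# non_kv_cache_memory. On a shared card this "cleared" figure also counts
# memory held by the first instance, which can drive the result negative.
available_kv_cache_memory = (
    requested_memory - non_kv_cache_memory - non_torch_memory_cleared_by_empty_cache
)

# New: non_kv_cache_memory already contains non_torch_memory_increase,
# so no extra subtraction is needed.
available_kv_cache_memory = requested_memory - non_kv_cache_memory
```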
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Launch two vllm-ascend instances sequentially on a single card.
```bash
# Launch first instance
vllm serve /root/.cache/modelscope/hub/models/Qwen/Qwen3-0.6B \
--port 8100 \
--host 0.0.0.0 \
--additional-config='{"enable_cpu_binding":true}' \
--gpu-memory-utilization 0.3 \
--max-num-seqs 1 \
--max-model-len 2048 \
--max-num-batched-tokens 2048 \
--no-enable-prefix-caching \
--enforce-eager
# Launch second instance
vllm serve /root/.cache/modelscope/hub/models/Qwen/Qwen3-0.6B \
--port 8101 \
--host 0.0.0.0 \
--additional-config='{"enable_cpu_binding":true}' \
--gpu-memory-utilization 0.3 \
--max-num-seqs 1 \
--max-model-len 2048 \
--max-num-batched-tokens 2048 \
--no-enable-prefix-caching \
--enforce-eager
```
**Before this PR:**
```bash
# First instance:
------------------------------------------------------------------
requested_memory: 18.287109375 GiB
non_kv_cache_memory: 1.2340388298034668 GiB
init_non_torch_memory: 0.3616676330566406 GiB
non_torch_memory_before_empty_cache: 0.3896217346191406 GiB
non_torch_memory_increase: 0.0279541015625 GiB
non_torch_memory_cleared_by_empty_cache: 0.3616676330566406 GiB
------------------------------------------------------------------
# Second instance:
------------------------------------------------------------------
requested_memory: 18.287109375 GiB
non_kv_cache_memory: 1.2336344718933105 GiB
init_non_torch_memory: 18.37220001220703 GiB
non_torch_memory_before_empty_cache: 18.399906158447266 GiB
non_torch_memory_increase: 0.02754974365234375 GiB
non_torch_memory_cleared_by_empty_cache: 18.372356414794922 GiB
------------------------------------------------------------------
# available_kv_cache_memory = requested_memory - non_kv_cache_memory - non_torch_memory_cleared_by_empty_cache
Available KV cache memory: -1.32 GiB
```
**After this PR:**
```bash
# First instance:
------------------------------------------------------------------
requested_memory: 18.287109375 GiB
non_kv_cache_memory: 1.2340540885925293 GiB
init_non_torch_memory: 0.36182403564453125 GiB
non_torch_memory_before_empty_cache: 0.38979339599609375 GiB
non_torch_memory_increase: 0.0279693603515625 GiB
non_torch_memory_cleared_by_empty_cache: 0.0 GiB
------------------------------------------------------------------
# Second instance:
------------------------------------------------------------------
requested_memory: 18.287109375 GiB
non_kv_cache_memory: 1.233344554901123 GiB
init_non_torch_memory: 18.74309539794922 GiB
non_torch_memory_before_empty_cache: 18.770355224609375 GiB
non_torch_memory_increase: 0.02725982666015625 GiB
non_torch_memory_cleared_by_empty_cache: 0.0 GiB
------------------------------------------------------------------
# available_kv_cache_memory = requested_memory - non_kv_cache_memory
Available KV cache memory: 17.05 GiB
```
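As a quick sanity check (not part of the patch), plugging the second instance's logged values into both formulas reproduces the numbers above:

```python
requested_memory = 18.287109375  # GiB, identical in both runs

# Before this PR (second-instance log): the "cleared" term includes the
# ~18 GiB still held by the first instance, so the result goes negative.
print(requested_memory - 1.2336344718933105 - 18.372356414794922)  # ≈ -1.32 GiB

# After this PR (second-instance log): only non_kv_cache_memory is subtracted.
print(requested_memory - 1.233344554901123)  # ≈ 17.05 GiB
```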
- vLLM version: v0.17.0
- vLLM main: 4497431df6
---------
Signed-off-by: shen-shanshan <467638484@qq.com>
Signed-off-by: Shanshan Shen <87969357+shen-shanshan@users.noreply.github.com>
```diff
@@ -341,13 +341,6 @@ class NPUWorker(WorkerBase):
             weights_memory=int(self.model_runner.model_memory_usage),
         ) as profile_result:
             self.model_runner.profile_run()
-        free_memory, total_memory = torch.npu.mem_get_info()
-        torch_memory = torch.npu.memory_reserved()
-        non_torch_memory_before_empty_cache = total_memory - free_memory - torch_memory
-
-        self.non_torch_memory = profile_result.non_torch_increase
-        self.peak_activation_memory = profile_result.torch_peak_increase
-        non_torch_memory_cleared_by_empty_cache = non_torch_memory_before_empty_cache - self.non_torch_memory
 
         free_gpu_memory = profile_result.after_profile.free_memory
         assert self.init_snapshot.free_memory > free_gpu_memory, (
@@ -359,16 +352,12 @@ class NPUWorker(WorkerBase):
             "To fix this, ensure consistent GPU memory allocation or "
             "isolate vLLM in its own container."
         )
-        self.available_kv_cache_memory_bytes = (
-            self.requested_memory - profile_result.non_kv_cache_memory - non_torch_memory_cleared_by_empty_cache
-        )
+        self.available_kv_cache_memory_bytes = self.requested_memory - profile_result.non_kv_cache_memory
 
         logger.debug(profile_result)
         logger.info_once(
-            "Available KV cache memory: %.2f GiB",
-            GiB(self.available_kv_cache_memory_bytes),
-            scope="local",
+            "Available KV cache memory: %.2f GiB", GiB(self.available_kv_cache_memory_bytes), scope="local"
         )
 
         return int(self.available_kv_cache_memory_bytes)
 
     def execute_model(
```