# xc-llm-ascend/tests/ut/worker/test_worker_multi_instance.py

#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from unittest.mock import MagicMock, patch

from vllm.utils.mem_constants import GiB_bytes

from tests.ut.base import TestBase


class TestDetermineAvailableMemoryMultiInstance(TestBase):
    """Tests for determine_available_memory() focusing on the multi-instance
    OOM regression (PR #7427).

    Every test builds an ``NPUWorker`` whose ``__init__`` is bypassed and
    replaces ``vllm_ascend.worker.worker.memory_profiling`` with a mock, so
    the profiling outcome is chosen by the test itself. The assertions pin
    the relation ``available = requested_memory - non_kv_cache_memory``.

    The worker module's ``logger`` is patched for each test; the injected
    ``mock_logger`` argument is required by the decorator but otherwise
    unused.
    """

    # ------------------------------------------------------------------ #
    # Helpers
    # ------------------------------------------------------------------ #

    def _make_worker(
        self,
        requested_memory: int,
        init_free_memory: int,
        init_total_memory: int,
        model_memory_usage: int | None = None,
    ):
        """Return a minimally-configured NPUWorker with memory state set.

        Args:
            requested_memory: Value stored in ``worker.requested_memory``.
            init_free_memory: ``free_memory`` of the mocked init snapshot.
            init_total_memory: ``total_memory`` of the mocked init snapshot.
            model_memory_usage: Value for
                ``worker.model_runner.model_memory_usage``; defaults to
                0.5 GiB when omitted.

        Returns:
            An ``NPUWorker`` instance created without running its real
            ``__init__``, with a ``MagicMock`` model runner attached.
        """
        # Imported lazily so the module can be collected before the worker
        # module is importable/patched.
        from vllm_ascend.worker.worker import NPUWorker

        if model_memory_usage is None:
            model_memory_usage = int(0.5 * GiB_bytes)  # Qwen3-0.6B ~0.5 GiB

        # Skip the real constructor; only the attributes set below exist.
        with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None):
            worker = NPUWorker()

        worker.model_runner = MagicMock()
        worker.model_runner.model_memory_usage = model_memory_usage

        mock_snapshot = MagicMock()
        mock_snapshot.free_memory = init_free_memory
        mock_snapshot.total_memory = init_total_memory
        worker.init_snapshot = mock_snapshot

        worker.requested_memory = requested_memory
        return worker

    @staticmethod
    def _make_profile_result(free_memory_after: int, non_kv_cache_memory: int):
        """Return a mock profile_result compatible with memory_profiling output.

        Only ``after_profile.free_memory`` and ``non_kv_cache_memory`` are set
        explicitly; every other attribute is an auto-created ``MagicMock``.
        """
        profile_result = MagicMock()
        profile_result.after_profile.free_memory = free_memory_after
        profile_result.non_kv_cache_memory = non_kv_cache_memory
        return profile_result

    @staticmethod
    def _patch_memory_profiling(profile_result):
        """Return a patcher for `memory_profiling` that yields *profile_result*.

        The replacement is a callable returning a context manager whose
        ``__enter__`` gives back *profile_result* and whose ``__exit__``
        returns ``False`` so exceptions raised inside the ``with`` body are
        not suppressed.
        """
        mock_ctx = MagicMock()
        mock_ctx.__enter__ = MagicMock(return_value=profile_result)
        mock_ctx.__exit__ = MagicMock(return_value=False)
        mock_profiling = MagicMock(return_value=mock_ctx)
        return patch("vllm_ascend.worker.worker.memory_profiling", mock_profiling)

    def _profile_and_determine(
        self,
        worker,
        free_memory_after: int,
        non_kv_cache_memory: int,
    ):
        """Run worker.determine_available_memory() under mocked profiling.

        Builds the mocked profile result from the two given values, patches
        ``memory_profiling`` for the duration of the call and returns whatever
        ``determine_available_memory()`` returns. Exceptions raised by the
        worker propagate to the caller unchanged.
        """
        profile_result = self._make_profile_result(
            free_memory_after=free_memory_after,
            non_kv_cache_memory=non_kv_cache_memory,
        )
        with self._patch_memory_profiling(profile_result):
            return worker.determine_available_memory()

    # ------------------------------------------------------------------ #
    # Tests
    # ------------------------------------------------------------------ #

    @patch("vllm_ascend.worker.worker.logger")
    def test_single_instance_positive_kv_cache(self, mock_logger):
        """Baseline: single instance on an empty card yields positive KV cache."""
        total = int(64 * GiB_bytes)
        gpu_util = 0.9
        requested_memory = int(total * gpu_util)   # 57.6 GiB
        init_free = int(62 * GiB_bytes)            # almost all free
        non_kv_cache = int(0.5 * GiB_bytes)        # Qwen3-0.6B weights

        worker = self._make_worker(requested_memory, init_free, total)
        result = self._profile_and_determine(
            worker,
            free_memory_after=init_free - non_kv_cache,
            non_kv_cache_memory=non_kv_cache,
        )

        expected = requested_memory - non_kv_cache
        self.assertEqual(result, expected)
        self.assertGreater(result, 0)

    @patch("vllm_ascend.worker.worker.logger")
    def test_second_instance_on_same_card_positive_kv_cache(self, mock_logger):
        """
        Regression test for PR #7427.

        Scenario (64 GiB Ascend 910B card, two Qwen3-0.6B instances,
        gpu_memory_utilization=0.4):

          - Card total:        64 GiB
          - Instance 1:        requested_memory = 64 * 0.4 = 25.6 GiB (in use)
          - Instance 2 start:  init_snapshot.free_memory ≈ 38.4 GiB
          - Instance 2:        requested_memory = 25.6 GiB
          - Profiling (fixed): non_kv_cache_memory = 0.5 GiB (weights)
          - available = 25.6 - 0.5 = 25.1 GiB  → must be > 0

        Before the fix, non_kv_cache_memory was inflated to include first
        instance memory (~25.6 GiB), yielding available ≈ -1.32 GiB (OOM).
        """
        total = int(64 * GiB_bytes)
        gpu_util = 0.4
        requested_memory = int(total * gpu_util)          # 25.6 GiB

        # First instance already occupies its full requested_memory slice
        first_instance_used = requested_memory            # 25.6 GiB
        init_free = total - first_instance_used           # ~38.4 GiB

        # After the fix: profiling correctly reports only the second
        # instance's own model weights, not the first instance's memory.
        non_kv_cache = int(0.5 * GiB_bytes)              # Qwen3-0.6B weights

        worker = self._make_worker(requested_memory, init_free, total)
        result = self._profile_and_determine(
            worker,
            free_memory_after=init_free - non_kv_cache,
            non_kv_cache_memory=non_kv_cache,
        )

        self.assertGreater(
            result, 0,
            "Second instance must have positive KV cache memory. "
            "A non-positive value means the multi-instance OOM bug "
            "(PR #7427) has regressed.",
        )
        expected = requested_memory - non_kv_cache
        self.assertEqual(result, expected)
        # Verify model_runner.profile_run() was called during profiling
        worker.model_runner.profile_run.assert_called_once()

    @patch("vllm_ascend.worker.worker.logger")
    def test_second_instance_buggy_non_kv_cache_gives_negative(self, mock_logger):
        """
        Documents the *pre-fix* buggy behaviour that PR #7427 addresses.

        When non_kv_cache_memory is erroneously inflated to include memory
        already held by the first instance (~25.6 GiB extra), the formula
            available = requested_memory - non_kv_cache_memory
        yields a negative value, confirming why the fix was necessary.

        This test is intentionally asserting the *negative* outcome to
        document the regressed state; it is NOT testing the fix itself.
        """
        total = int(64 * GiB_bytes)
        gpu_util = 0.4
        requested_memory = int(total * gpu_util)   # 25.6 GiB

        first_instance_used = requested_memory     # 25.6 GiB
        init_free = total - first_instance_used    # ~38.4 GiB

        # Buggy: non_kv_cache_memory = first-instance memory + second-instance weights
        buggy_non_kv_cache = int((25.6 + 0.5) * GiB_bytes)  # ~26.1 GiB

        worker = self._make_worker(requested_memory, init_free, total)
        result = self._profile_and_determine(
            worker,
            # free_memory decreased only by the actual new allocation (weights)
            free_memory_after=init_free - int(0.5 * GiB_bytes),
            non_kv_cache_memory=buggy_non_kv_cache,
        )

        # Pre-fix: 25.6 GiB - 26.1 GiB = -0.5 GiB  (negative → OOM)
        self.assertLess(
            result, 0,
            "With the pre-fix (buggy) non_kv_cache_memory the result must be "
            "negative; this documents the OOM regression that PR #7427 fixed.",
        )

    @patch("vllm_ascend.worker.worker.logger")
    def test_assert_raises_when_free_memory_increases_after_profile(self, mock_logger):
        """
        determine_available_memory() must raise AssertionError when free memory
        after profiling is greater than before (external process released memory
        during profiling, invalidating the measurement).
        """
        total = int(64 * GiB_bytes)
        requested_memory = int(total * 0.9)
        init_free = int(60 * GiB_bytes)

        worker = self._make_worker(requested_memory, init_free, total)

        # Abnormal: free memory increased after profiling
        with self.assertRaises(AssertionError) as ctx:
            self._profile_and_determine(
                worker,
                free_memory_after=init_free + int(1 * GiB_bytes),  # went UP
                non_kv_cache_memory=int(0.5 * GiB_bytes),
            )

        self.assertIn("Error in memory profiling", str(ctx.exception))

    @patch("vllm_ascend.worker.worker.logger")
    def test_second_instance_tight_memory_still_positive(self, mock_logger):
        """
        Edge case: card is almost full when the instance under test starts.

        Memory already held by other processes leaves only slightly more than
        requested_memory free. Even with very little free memory left, as long
        as requested_memory > non_kv_cache_memory (i.e. there is room for at
        least some KV blocks), the result must be positive.
        """
        total = int(32 * GiB_bytes)       # smaller card (e.g. 910B1)
        gpu_util = 0.3
        requested_memory = int(total * gpu_util)   # 9.6 GiB

        # Everything except this instance's requested slice plus a small
        # headroom is already in use, so the card really is nearly full.
        headroom = int(0.1 * GiB_bytes)
        init_free = requested_memory + headroom    # ~9.7 GiB of 32 GiB

        non_kv_cache = int(0.5 * GiB_bytes)  # Qwen3-0.6B

        worker = self._make_worker(requested_memory, init_free, total)
        result = self._profile_and_determine(
            worker,
            free_memory_after=init_free - non_kv_cache,
            non_kv_cache_memory=non_kv_cache,
        )

        self.assertGreater(result, 0)
        self.assertEqual(result, requested_memory - non_kv_cache)