xc-llm-ascend/tests/ut/test_utils.py

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

import math
import os
from threading import Lock
from unittest import mock

import pytest
import torch
from vllm.config import (CompilationConfig, ModelConfig, ParallelConfig,
                         VllmConfig)

from tests.ut.base import TestBase
from vllm_ascend import utils
from vllm_ascend.utils import REGISTERED_ASCEND_OPS


class TestUtils(TestBase):

    def setUp(self):
        import importlib

        from vllm_ascend import platform
        importlib.reload(platform)

    def test_nd_to_nz_2d(self):
        # can be divided by 16
        input_tensor = torch.randn(32, 64)
        output = utils.nd_to_nz_2d(input_tensor)
        self.assertEqual(output.shape[0], 1)
        self.assertEqual(output.shape[1], 64 // 16)
        self.assertEqual(output.shape[2], 32)
        self.assertEqual(output.shape[3], 16)

        # cannot be divided by 16
        input_tensor = torch.randn(30, 62)
        output = utils.nd_to_nz_2d(input_tensor)
        self.assertEqual(output.shape[0], 1)
        self.assertEqual(output.shape[1], math.ceil(62 / 16))
        self.assertEqual(output.shape[2], 32)
        self.assertEqual(output.shape[3], 16)

        # pad to 16
        input_tensor = torch.randn(8, 12)
        output = utils.nd_to_nz_2d(input_tensor)
        self.assertEqual(output.shape[0], 1)
        self.assertEqual(output.shape[1], 1)  # 12->16, 16//16=1
        self.assertEqual(output.shape[2], 16)  # 8->16
        self.assertEqual(output.shape[3], 16)

        # check if the output is contiguous
        input_tensor = torch.randn(32, 64)
        output = utils.nd_to_nz_2d(input_tensor)
        self.assertTrue(output.is_contiguous())

        # check if the output values are preserved
        input_tensor = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
        output = utils.nd_to_nz_2d(input_tensor)
        expected = torch.tensor(
            [[[[1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
               [5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]])
        self.assertTrue(torch.allclose(output, expected))

    def test_aligned_16(self):
        # align to 16
        input_tensor = torch.randn(15, 64)
        output_tensor = utils.aligned_16(input_tensor)
        self.assertEqual(output_tensor.shape[0], 16)

        # align to 16
        input_tensor = torch.randn(16, 64)
        output_tensor = utils.aligned_16(input_tensor)
        self.assertEqual(output_tensor.shape[0], 16)
        self.assertTrue(torch.equal(input_tensor, output_tensor))

        # align to 32
        input_tensor = torch.randn(17, 64)
        output_tensor = utils.aligned_16(input_tensor)
        self.assertEqual(output_tensor.shape[0], 32)

    @pytest.mark.skip(
        "Skip as register_kernels has NPU SocName checking in CANN 8.5.0.")
    def test_enable_custom_op(self):
        result = utils.enable_custom_op()
        self.assertTrue(result)

        utils._CUSTOM_OP_ENABLED = None

        with mock.patch('builtins.__import__') as mock_import_module:
            mock_import_module.side_effect = ImportError("import error")
            self.assertFalse(utils.enable_custom_op())

    def test_find_hccl_library(self):
        with mock.patch.dict(os.environ,
                             {"HCCL_SO_PATH": "/path/to/hccl/libhccl.so"}):
            self.assertEqual(utils.find_hccl_library(),
                             "/path/to/hccl/libhccl.so")
        with mock.patch("torch.version.cann", None):
            self.assertRaises(ValueError, utils.find_hccl_library)
        with mock.patch("torch.version.cann", "Ascend910"):
            self.assertEqual(utils.find_hccl_library(), "libhccl.so")

    def test_current_stream(self):
        with mock.patch("torch.npu.current_stream") as mock_current_stream:
            self.assertEqual(utils.current_stream(), mock_current_stream())

    def test_vllm_version_is(self):
        with mock.patch.dict(os.environ, {"VLLM_VERSION": "1.0.0"}):
            with mock.patch("vllm.__version__", "1.0.0"):
                self.assertTrue(utils.vllm_version_is.__wrapped__("1.0.0"))
                self.assertFalse(utils.vllm_version_is.__wrapped__("2.0.0"))
            with mock.patch("vllm.__version__", "2.0.0"):
                self.assertTrue(utils.vllm_version_is.__wrapped__("1.0.0"))
                self.assertFalse(utils.vllm_version_is.__wrapped__("2.0.0"))
        with mock.patch("vllm.__version__", "1.0.0"):
            self.assertTrue(utils.vllm_version_is.__wrapped__("1.0.0"))
            self.assertFalse(utils.vllm_version_is.__wrapped__("2.0.0"))
        with mock.patch("vllm.__version__", "2.0.0"):
            self.assertTrue(utils.vllm_version_is.__wrapped__("2.0.0"))
            self.assertFalse(utils.vllm_version_is.__wrapped__("1.0.0"))
        # Test caching takes effect
        utils.vllm_version_is.cache_clear()
        utils.vllm_version_is("1.0.0")
        misses = utils.vllm_version_is.cache_info().misses
        hits = utils.vllm_version_is.cache_info().hits
        self.assertEqual(misses, 1)
        self.assertEqual(hits, 0)
        utils.vllm_version_is("1.0.0")
        hits = utils.vllm_version_is.cache_info().hits
        self.assertEqual(hits, 1)

    def test_get_max_hidden_layers(self):
        from transformers import PretrainedConfig

        class SimpleConfig(PretrainedConfig):

            def __init__(self, num_hidden_layers=12):
                self.num_hidden_layers = num_hidden_layers

            def to_dict(self):
                return {"num_hidden_layers": self.num_hidden_layers}

        self.assertEqual(utils.get_max_hidden_layers(SimpleConfig()), 12)
        self.assertEqual(utils.get_max_hidden_layers(SimpleConfig(24)), 24)

        class NestedConfig(PretrainedConfig):

            def to_dict(self):
                return {
                    "model": {
                        "encoder": {
                            "num_hidden_layers": 8
                        },
                        "decoder": {
                            "num_hidden_layers": 12
                        }
                    },
                    "other_setting": True
                }

        self.assertEqual(utils.get_max_hidden_layers(NestedConfig()), 12)

        class MultiValueConfig(PretrainedConfig):

            def to_dict(self):
                return {
                    "num_hidden_layers": 6,
                    "submodule": {
                        "num_hidden_layers": 18,
                        "subsub": {
                            "num_hidden_layers": 9
                        }
                    }
                }

        self.assertEqual(utils.get_max_hidden_layers(MultiValueConfig()), 18)

        class NoLayerConfig(PretrainedConfig):

            def to_dict(self):
                return {"attention_heads": 8}

        with self.assertRaises(ValueError) as context:
            utils.get_max_hidden_layers(NoLayerConfig())
        self.assertIn("num_hidden_layers", str(context.exception))

    def test_update_aclgraph_sizes(self):
        test_compilation_config = CompilationConfig(
            cudagraph_capture_sizes=[i for i in range(150)])
        model_path = os.path.join(os.path.dirname(__file__), "fake_weight")
        test_model_config = ModelConfig(model=model_path, enforce_eager=True)
        test_parallel_config = ParallelConfig()
        test_vllm_config = VllmConfig(
            model_config=test_model_config,
            compilation_config=test_compilation_config,
            parallel_config=test_parallel_config)
        utils.update_aclgraph_sizes(test_vllm_config)
        os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
        utils.update_aclgraph_sizes(test_vllm_config)
        del os.environ['HCCL_OP_EXPANSION_MODE']

        self.assertEqual(
            0,
            len(test_vllm_config.compilation_config.cudagraph_capture_sizes))

    @mock.patch("vllm.model_executor.custom_op.CustomOp")
    @mock.patch("vllm_ascend.ops.activation.AscendQuickGELU")
    @mock.patch("vllm_ascend.ops.activation.AscendSiluAndMul")
    @mock.patch("vllm_ascend.ops.layernorm.AscendRMSNorm")
    def test_register_ascend_customop(self, mock_ascend_rmsnorm,
                                      mock_ascend_silu_and_mul,
                                      mock_ascend_quick_gelu, mock_customop):
        utils._ASCEND_CUSTOMOP_IS_REIGISTERED = False

        # ascend custom op is not registered
        utils.register_ascend_customop()
        self.assertEqual(mock_customop.register_oot.call_count,
                         len(REGISTERED_ASCEND_OPS))
        self.assertTrue(utils._ASCEND_CUSTOMOP_IS_REIGISTERED)

        # ascend custom op is already registered
        utils.register_ascend_customop()
        self.assertEqual(mock_customop.register_oot.call_count,
                         len(REGISTERED_ASCEND_OPS))

    @mock.patch("torch_npu.npu_format_cast")
    def test_maybe_trans_nz(self, mock_npu_format_cast):
        from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ
        mock_npu_format_cast.side_effect = lambda weight, fmt: weight

        def assert_nz_cast(weight):
            mock_npu_format_cast.assert_called_once()
            args, kwargs = mock_npu_format_cast.call_args
            self.assertIs(args[0], weight)
            self.assertEqual(args[1], ACL_FORMAT_FRACTAL_NZ)
            self.assertEqual(kwargs, {})

        # Test case 1: non-310P, NZ is disabled
        with (
            mock.patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"}),
            mock.patch("vllm_ascend.utils.is_310p", return_value=False),
        ):
            weight = torch.randn(32, 64, dtype=torch.float16)
            result = utils.maybe_trans_nz(weight)
            self.assertIs(result, weight)
            mock_npu_format_cast.assert_not_called()

        # Test case 2: 310P always converts non-fp32 weights, even when NZ=0
        mock_npu_format_cast.reset_mock()
        with (
            mock.patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"}),
            mock.patch("vllm_ascend.utils.is_310p", return_value=True),
        ):
            weight = torch.randn(32, 64, dtype=torch.float16)
            result = utils.maybe_trans_nz(weight)
            self.assertIs(result, weight)
            assert_nz_cast(weight)

        # Test case 3: fp32 never converts, including on 310P
        mock_npu_format_cast.reset_mock()
        with (
            mock.patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "1"}),
            mock.patch("vllm_ascend.utils.is_310p", return_value=True),
        ):
            weight = torch.randn(32, 64, dtype=torch.float32)
            result = utils.maybe_trans_nz(weight)
            self.assertIs(result, weight)
            mock_npu_format_cast.assert_not_called()

        # Test case 4: non-310P fp16 converts only when NZ=2
        mock_npu_format_cast.reset_mock()
        with (
            mock.patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "1"}),
            mock.patch("vllm_ascend.utils.is_310p", return_value=False),
        ):
            weight = torch.randn(32, 64, dtype=torch.float16)
            result = utils.maybe_trans_nz(weight)
            self.assertIs(result, weight)
            mock_npu_format_cast.assert_not_called()

        # Test case 5: non-310P fp16 converts when NZ=2
        mock_npu_format_cast.reset_mock()
        with (
            mock.patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "2"}),
            mock.patch("vllm_ascend.utils.is_310p", return_value=False),
        ):
            weight = torch.randn(32, 64, dtype=torch.float16)
            result = utils.maybe_trans_nz(weight)
            self.assertIs(result, weight)
            assert_nz_cast(weight)

        # Test case 6: non-310P bf16 converts when NZ=2
        mock_npu_format_cast.reset_mock()
        with (
            mock.patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "2"}),
            mock.patch("vllm_ascend.utils.is_310p", return_value=False),
        ):
            weight = torch.randn(32, 64, dtype=torch.bfloat16)
            result = utils.maybe_trans_nz(weight)
            self.assertIs(result, weight)
            assert_nz_cast(weight)

        # Test case 7: non-310P quantized weights still convert by default
        mock_npu_format_cast.reset_mock()
        with (
            mock.patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "1"}),
            mock.patch("vllm_ascend.utils.is_310p", return_value=False),
        ):
            weight = torch.zeros(32, 64, dtype=torch.int8)
            result = utils.maybe_trans_nz(weight)
            self.assertIs(result, weight)
            assert_nz_cast(weight)