xc-llm-ascend/tests/ut/torchair/test_utils.py

import os
from concurrent.futures import ThreadPoolExecutor
from unittest import mock
from unittest.mock import MagicMock, patch

import torch

from tests.ut.base import TestBase
from vllm_ascend.quantization.quantizer import SUPPORT_ASCEND_QUANTIZER_TYPE
from vllm_ascend.torchair import utils


class TestTorchairUtils(TestBase):
    """Unit tests for the helpers exposed by ``vllm_ascend.torchair.utils``.

    The tests exercise work-dir resolution, the on-disk torchair /
    kv-cache-bytes cache files, torchair model registration, weight format
    conversion and torchair quantizer registration.
    """

    # Format id handed to ``converting_weight_acl_format`` as the target.
    _ACL_FORMAT_FRACTAL_NZ = 29

    @staticmethod
    def _make_model_with_fused_moe():
        """Build a mocked model whose only module is a mocked fused-MoE layer.

        Returns:
            A ``(model, fused_moe)`` tuple. ``model.modules()`` returns a
            list containing just ``fused_moe``; its ``w13_weight.data`` and
            ``w2_weight.data`` are real random tensors.
        """
        fused_moe = MagicMock()
        fused_moe.w13_weight = MagicMock()
        fused_moe.w2_weight = MagicMock()
        fused_moe.w13_weight.data = torch.randn(128, 256)
        fused_moe.w2_weight.data = torch.randn(256, 128)
        model = MagicMock()
        model.modules.return_value = [fused_moe]
        return model, fused_moe

    def test_get_torchair_current_work_dir(self):
        """The work dir is the cache dir, optionally joined with a sub path."""
        cache_dir = utils.TORCHAIR_CACHE_DIR
        self.assertEqual(cache_dir, utils._get_torchair_current_work_dir())
        self.assertEqual(os.path.join(cache_dir, "test"),
                         utils._get_torchair_current_work_dir("test"))

    def test_torchair_cache_dir(self):
        """Write, read back and delete the kv-cache-bytes entry of rank 0."""
        utils.write_kv_cache_bytes_to_file(0, 100)
        self.assertTrue(utils.check_torchair_cache_exist(),
                        "Create torchair cache dir failed")
        self.assertTrue(utils.check_kv_cache_bytes_cache_exist(),
                        "Create kv cache bytes cache dir failed")
        self.assertEqual(100, utils.read_kv_cache_bytes_from_file(0))

        utils.delete_torchair_cache_file()
        self.assertFalse(utils.check_torchair_cache_exist(),
                         "Delete torchair cache dir failed")
        self.assertFalse(utils.check_kv_cache_bytes_cache_exist(),
                         "Delete kv cache bytes cache dir failed")

    def test_torchair_cache_dir_multiple_ranks(self):
        """Concurrent writes from several ranks are all readable afterwards."""
        ranks = [0, 1, 2, 3]
        values = [100, 200, 300, 400]

        with ThreadPoolExecutor() as executor:
            # ``Executor.map`` re-raises a worker's exception only when the
            # corresponding result is retrieved, so drain the iterator;
            # otherwise a failing write would pass through here silently.
            list(
                executor.map(utils.write_kv_cache_bytes_to_file, ranks,
                             values))

        for rank, expected in zip(ranks, values):
            self.assertEqual(expected,
                             utils.read_kv_cache_bytes_from_file(rank))

        utils.delete_torchair_cache_file()
        self.assertFalse(utils.check_torchair_cache_exist(),
                         "Delete torchair cache dir failed")
        self.assertFalse(utils.check_kv_cache_bytes_cache_exist(),
                         "Delete kv cache bytes cache dir failed")

    @patch('vllm.ModelRegistry')
    def test_register_torchair_model(self, mock_model_registry):
        """Every torchair model is registered, in order, with its path."""
        expected_registrations = [
            ("DeepSeekMTPModel",
             "vllm_ascend.torchair.models.torchair_deepseek_mtp:TorchairDeepSeekMTP"
             ),
            ("DeepseekV2ForCausalLM",
             "vllm_ascend.torchair.models.torchair_deepseek_v2:TorchairDeepseekV2ForCausalLM"
             ),
            ("DeepseekV3ForCausalLM",
             "vllm_ascend.torchair.models.torchair_deepseek_v3:TorchairDeepseekV3ForCausalLM"
             ),
            ("Qwen2ForCausalLM",
             "vllm_ascend.torchair.models.qwen2:CustomQwen2ForCausalLM"),
            ("Qwen3ForCausalLM",
             "vllm_ascend.torchair.models.qwen3_moe:CustomQwen3MoeForCausalLM"),
        ]

        utils.register_torchair_model()

        register_model = mock_model_registry.register_model
        # Checking the count first guarantees the zip below drops nothing.
        self.assertEqual(register_model.call_count,
                         len(expected_registrations))

        for recorded_call, (expected_name, expected_path) in zip(
                register_model.call_args_list, expected_registrations):
            args, _ = recorded_call
            self.assertEqual(args[0], expected_name)
            self.assertEqual(args[1], expected_path)

    # NOTE(review): FusedMoE is replaced by the ``MagicMock`` class itself,
    # presumably so that a type check against FusedMoE accepts the mocked
    # layer -- confirm against ``utils.converting_weight_acl_format``.
    # Patches are applied bottom-up, so the first injected argument belongs to
    # ``npu_format_cast`` and the second to ``get_npu_format`` (the ``new=``
    # patch injects nothing).
    @mock.patch('torch_npu.get_npu_format')
    @mock.patch('torch_npu.npu_format_cast')
    @mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',
                new=mock.MagicMock)
    def test_converting_weight_acl_format(self, mock_npu_cast,
                                          mock_get_format):
        """Weights in another format are replaced by the cast result."""
        mock_get_format.return_value = 1
        mock_npu_cast.return_value = 1
        model, fused_moe = self._make_model_with_fused_moe()

        utils.converting_weight_acl_format(model,
                                           self._ACL_FORMAT_FRACTAL_NZ)

        self.assertEqual(fused_moe.w13_weight.data, 1)

    @mock.patch('torch_npu.get_npu_format')
    @mock.patch('torch_npu.npu_format_cast')
    @mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',
                new=mock.MagicMock)
    def test_converting_weight_acl_format_format_true(self, mock_npu_cast,
                                                      mock_get_format):
        """No cast is issued when the weights already report the target."""
        mock_get_format.return_value = self._ACL_FORMAT_FRACTAL_NZ
        mock_npu_cast.return_value = 1
        model, _ = self._make_model_with_fused_moe()

        utils.converting_weight_acl_format(model,
                                           self._ACL_FORMAT_FRACTAL_NZ)

        mock_npu_cast.assert_not_called()

    def test_torchair_quant_method_register(self):
        """Registration replaces both dynamic quantizer entries."""
        # ``patch.dict`` restores the shared registry when the block exits,
        # so the replacement performed below does not leak into other tests.
        with patch.dict(SUPPORT_ASCEND_QUANTIZER_TYPE):
            original_w8a8 = SUPPORT_ASCEND_QUANTIZER_TYPE["W8A8_DYNAMIC"]
            original_w4a8 = SUPPORT_ASCEND_QUANTIZER_TYPE["W4A8_DYNAMIC"]

            utils.torchair_quant_method_register()

            self.assertNotEqual(
                original_w8a8,
                SUPPORT_ASCEND_QUANTIZER_TYPE["W8A8_DYNAMIC"])
            self.assertNotEqual(
                original_w4a8,
                SUPPORT_ASCEND_QUANTIZER_TYPE["W4A8_DYNAMIC"])
[2/4][Refactor] Refactor torchair utils (#1892) There is a lot torchair specified logic in common code. It results hard code maintenance. We will create a new torchair module to launch torchair related logic there. I plan to add 4 PR. 1. Refactor worker 2. Refactor utils (this PR) - simple change that move all torchair related util function to torchair module 3. Refactor model_runner 4. Refactor attention - vLLM version: v0.9.2 - vLLM main: https://github.com/vllm-project/vllm/commit/8188196a1c8af26134d8e366ebe564c18fb95379 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-07-21 19:43:30 +08:00			`import os`
[1/N][refactor] torchair deepseek modeling refactor (#2384) ### What this PR does / why we need it? Move torchair related model arch into torchair moduel to make the code clear. Next step we'll remove all torchair related code outside of torchair moduel. ### Does this PR introduce _any_ user-facing change? No. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/08d5f7113a024818b2867782c2539794b7aa162b Signed-off-by: linfeng-yuan <1102311262@qq.com> 2025-08-18 15:00:37 +08:00			`from concurrent.futures import ThreadPoolExecutor`
[Refactor] cleanup converting_weight_acl_format_format (#2482) move maybe_converting_weight_acl_format_format to torchair module, it's only used with 310p+torchair - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/49ab23b3ccc2da9274c739d55f9b19206078c7a9 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-08-25 19:48:55 +08:00			`from unittest import mock`
[1/N][refactor] torchair deepseek modeling refactor (#2384) ### What this PR does / why we need it? Move torchair related model arch into torchair moduel to make the code clear. Next step we'll remove all torchair related code outside of torchair moduel. ### Does this PR introduce _any_ user-facing change? No. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/08d5f7113a024818b2867782c2539794b7aa162b Signed-off-by: linfeng-yuan <1102311262@qq.com> 2025-08-18 15:00:37 +08:00			`from unittest.mock import MagicMock, patch`
[2/4][Refactor] Refactor torchair utils (#1892) There is a lot torchair specified logic in common code. It results hard code maintenance. We will create a new torchair module to launch torchair related logic there. I plan to add 4 PR. 1. Refactor worker 2. Refactor utils (this PR) - simple change that move all torchair related util function to torchair module 3. Refactor model_runner 4. Refactor attention - vLLM version: v0.9.2 - vLLM main: https://github.com/vllm-project/vllm/commit/8188196a1c8af26134d8e366ebe564c18fb95379 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-07-21 19:43:30 +08:00
[Refactor] cleanup converting_weight_acl_format_format (#2482) move maybe_converting_weight_acl_format_format to torchair module, it's only used with 310p+torchair - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/49ab23b3ccc2da9274c739d55f9b19206078c7a9 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-08-25 19:48:55 +08:00			`import torch`

[2/4][Refactor] Refactor torchair utils (#1892) There is a lot torchair specified logic in common code. It results hard code maintenance. We will create a new torchair module to launch torchair related logic there. I plan to add 4 PR. 1. Refactor worker 2. Refactor utils (this PR) - simple change that move all torchair related util function to torchair module 3. Refactor model_runner 4. Refactor attention - vLLM version: v0.9.2 - vLLM main: https://github.com/vllm-project/vllm/commit/8188196a1c8af26134d8e366ebe564c18fb95379 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-07-21 19:43:30 +08:00			`from tests.ut.base import TestBase`
[3/N][refactor] refactoer quantization (#2504) ### What this PR does / why we need it? Move torchair related qunatization section into torchair dir to make the code clear. Next step we'll remove all torchair related code outside of torchair quantization. ### Does this PR introduce _any_ user-facing change? NO ### How was this patch tested? vLLM version: main vLLM main: https://github.com/vllm-project/vllm/commit/ab9f2cfd1942f7ddfee658ce86ea96b4789862af - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/959783fb996d0d15598f45ca12ffcbee4b681424 Signed-off-by: hust17yixuan <303660421@qq.com> 2025-08-27 10:45:50 +08:00			`from vllm_ascend.quantization.quantizer import SUPPORT_ASCEND_QUANTIZER_TYPE`
[2/4][Refactor] Refactor torchair utils (#1892) There is a lot torchair specified logic in common code. It results hard code maintenance. We will create a new torchair module to launch torchair related logic there. I plan to add 4 PR. 1. Refactor worker 2. Refactor utils (this PR) - simple change that move all torchair related util function to torchair module 3. Refactor model_runner 4. Refactor attention - vLLM version: v0.9.2 - vLLM main: https://github.com/vllm-project/vllm/commit/8188196a1c8af26134d8e366ebe564c18fb95379 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-07-21 19:43:30 +08:00			`from vllm_ascend.torchair import utils`


			`class TestTorchairUtils(TestBase):`

			`def test_get_torchair_current_work_dir(self):`
			`cache_dir = utils.TORCHAIR_CACHE_DIR`
			`work_dir = utils._get_torchair_current_work_dir()`
			`self.assertEqual(cache_dir, work_dir)`
			`work_dir = utils._get_torchair_current_work_dir("test")`
			`self.assertEqual(os.path.join(cache_dir, "test"), work_dir)`

			`def test_torchair_cache_dir(self):`
			`utils.write_kv_cache_bytes_to_file(0, 100)`
			`self.assertTrue(utils.check_torchair_cache_exist(),`
			`"Create torchair cache dir failed")`
			`self.assertTrue(utils.check_kv_cache_bytes_cache_exist(),`
			`"Create kv cache bytes cache dir failed")`
			`kv_cache_bytes = utils.read_kv_cache_bytes_from_file(0)`
			`self.assertEqual(100, kv_cache_bytes)`
			`utils.delete_torchair_cache_file()`
			`self.assertFalse(utils.check_torchair_cache_exist(),`
			`"Delete torchair cache dir failed")`
			`self.assertFalse(utils.check_kv_cache_bytes_cache_exist(),`
			`"Delete kv cache bytes cache dir failed")`
[1/N][refactor] torchair deepseek modeling refactor (#2384) ### What this PR does / why we need it? Move torchair related model arch into torchair moduel to make the code clear. Next step we'll remove all torchair related code outside of torchair moduel. ### Does this PR introduce _any_ user-facing change? No. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/08d5f7113a024818b2867782c2539794b7aa162b Signed-off-by: linfeng-yuan <1102311262@qq.com> 2025-08-18 15:00:37 +08:00
			`def test_torchair_cache_dir_multiple_ranks(self):`
			`ranks = [0, 1, 2, 3]`
			`values = [100, 200, 300, 400]`

			`with ThreadPoolExecutor() as executor:`
			`executor.map(utils.write_kv_cache_bytes_to_file, ranks, values)`
			`for rank, expected in zip(ranks, values):`
			`self.assertEqual(expected,`
			`utils.read_kv_cache_bytes_from_file(rank))`
			`utils.delete_torchair_cache_file()`

			`self.assertFalse(utils.check_torchair_cache_exist(),`
			`"Delete torchair cache dir failed")`
			`self.assertFalse(utils.check_kv_cache_bytes_cache_exist(),`
			`"Delete kv cache bytes cache dir failed")`

			`@patch('vllm.ModelRegistry')`
			`def test_register_torchair_model(self, mock_model_registry):`
			`mock_registry = MagicMock()`
			`mock_model_registry.return_value = mock_registry`
			`utils.register_torchair_model()`

[CI] Fix UT (#2452) Make UT CI happy - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/d983769c41db224e0897fac2e9aefc5f57ad1122 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Co-authored-by: MengqingCao <cmq0113@163.com> 2025-08-20 16:26:07 +08:00			`self.assertEqual(mock_model_registry.register_model.call_count, 5)`
[1/N][refactor] torchair deepseek modeling refactor (#2384) ### What this PR does / why we need it? Move torchair related model arch into torchair moduel to make the code clear. Next step we'll remove all torchair related code outside of torchair moduel. ### Does this PR introduce _any_ user-facing change? No. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/08d5f7113a024818b2867782c2539794b7aa162b Signed-off-by: linfeng-yuan <1102311262@qq.com> 2025-08-18 15:00:37 +08:00			`call_args_list = mock_model_registry.register_model.call_args_list`

			`expected_registrations = [`
			`("DeepSeekMTPModel",`
			`"vllm_ascend.torchair.models.torchair_deepseek_mtp:TorchairDeepSeekMTP"`
			`),`
			`("DeepseekV2ForCausalLM",`
			`"vllm_ascend.torchair.models.torchair_deepseek_v2:TorchairDeepseekV2ForCausalLM"`
			`),`
			`("DeepseekV3ForCausalLM",`
			`"vllm_ascend.torchair.models.torchair_deepseek_v3:TorchairDeepseekV3ForCausalLM"`
[CI] Fix UT (#2452) Make UT CI happy - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/d983769c41db224e0897fac2e9aefc5f57ad1122 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Co-authored-by: MengqingCao <cmq0113@163.com> 2025-08-20 16:26:07 +08:00			`),`
			`("Qwen2ForCausalLM",`
			`"vllm_ascend.torchair.models.qwen2:CustomQwen2ForCausalLM"),`
			`("Qwen3ForCausalLM",`
			`"vllm_ascend.torchair.models.qwen3_moe:CustomQwen3MoeForCausalLM")`
[1/N][refactor] torchair deepseek modeling refactor (#2384) ### What this PR does / why we need it? Move torchair related model arch into torchair moduel to make the code clear. Next step we'll remove all torchair related code outside of torchair moduel. ### Does this PR introduce _any_ user-facing change? No. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/08d5f7113a024818b2867782c2539794b7aa162b Signed-off-by: linfeng-yuan <1102311262@qq.com> 2025-08-18 15:00:37 +08:00			`]`

			`for i, (expected_name,`
			`expected_path) in enumerate(expected_registrations):`
			`args, kwargs = call_args_list[i]`
			`self.assertEqual(args[0], expected_name)`
			`self.assertEqual(args[1], expected_path)`
[Refactor] cleanup converting_weight_acl_format_format (#2482) move maybe_converting_weight_acl_format_format to torchair module, it's only used with 310p+torchair - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/49ab23b3ccc2da9274c739d55f9b19206078c7a9 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-08-25 19:48:55 +08:00
			`@mock.patch('torch_npu.get_npu_format')`
			`@mock.patch('torch_npu.npu_format_cast')`
			`@mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',`
			`new=mock.MagicMock)`
			`def test_converting_weight_acl_format(self, mock_npu_cast,`
			`mock_get_format):`
			`ACL_FORMAT_FRACTAL_NZ = 29`
			`mock_get_format.return_value = 1`
			`mock_npu_cast.return_value = 1`

			`fused_moe = mock.MagicMock()`
			`fused_moe.w13_weight = mock.MagicMock()`
			`fused_moe.w2_weight = mock.MagicMock()`
			`fused_moe.w13_weight.data = torch.randn(128, 256)`
			`fused_moe.w2_weight.data = torch.randn(256, 128)`
			`model = mock.MagicMock()`
			`model.modules.return_value = [fused_moe]`

			`utils.converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)`
			`self.assertEqual(fused_moe.w13_weight.data, 1)`

			`@mock.patch('torch_npu.get_npu_format')`
			`@mock.patch('torch_npu.npu_format_cast')`
			`@mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE',`
			`new=mock.MagicMock)`
			`def test_converting_weight_acl_format_format_true(self, mock_npu_cast,`
			`mock_get_format):`
			`ACL_FORMAT_FRACTAL_NZ = 29`
			`mock_get_format.return_value = ACL_FORMAT_FRACTAL_NZ`
			`mock_npu_cast.return_value = 1`

			`fused_moe = mock.MagicMock()`
			`fused_moe.w13_weight = mock.MagicMock()`
			`fused_moe.w2_weight = mock.MagicMock()`
			`fused_moe.w13_weight.data = torch.randn(128, 256)`
			`fused_moe.w2_weight.data = torch.randn(256, 128)`
			`model = mock.MagicMock()`
			`model.modules.return_value = [fused_moe]`

			`utils.converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ)`
			`mock_npu_cast.assert_not_called()`
[3/N][refactor] refactoer quantization (#2504) ### What this PR does / why we need it? Move torchair related qunatization section into torchair dir to make the code clear. Next step we'll remove all torchair related code outside of torchair quantization. ### Does this PR introduce _any_ user-facing change? NO ### How was this patch tested? vLLM version: main vLLM main: https://github.com/vllm-project/vllm/commit/ab9f2cfd1942f7ddfee658ce86ea96b4789862af - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/959783fb996d0d15598f45ca12ffcbee4b681424 Signed-off-by: hust17yixuan <303660421@qq.com> 2025-08-27 10:45:50 +08:00
			`def test_torchair_quant_method_register(self):`

			`TorchairW8A8DYNAMICQuantizer = SUPPORT_ASCEND_QUANTIZER_TYPE[`
			`"W8A8_DYNAMIC"]`
			`TorchairW4A8DYNAMICQuantizer = SUPPORT_ASCEND_QUANTIZER_TYPE[`
			`"W4A8_DYNAMIC"]`
			`utils.torchair_quant_method_register()`
			`self.assertNotEqual(TorchairW8A8DYNAMICQuantizer,`
			`SUPPORT_ASCEND_QUANTIZER_TYPE["W8A8_DYNAMIC"])`
			`self.assertNotEqual(TorchairW4A8DYNAMICQuantizer,`
			`SUPPORT_ASCEND_QUANTIZER_TYPE["W4A8_DYNAMIC"])`