diff --git a/tests/e2e/singlecard/test_aclgraph_accuracy.py b/tests/e2e/singlecard/test_aclgraph_accuracy.py index 0e37a6d8..948338f1 100644 --- a/tests/e2e/singlecard/test_aclgraph_accuracy.py +++ b/tests/e2e/singlecard/test_aclgraph_accuracy.py @@ -36,10 +36,10 @@ CASE_DS_ACLGRAPH = LLMTestCase( quantization="ascend", prompts=PROMPTS_SHORT, golden_answers=[ - '\nI am a 20 year old student from the UK. I am currently studying for a degree in English Literature and Creative Writing. I have a passion', + '\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2', ' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the', ' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art', - ' here.\nThe future of AI is here.\nThe future of AI is here.\nThe future of AI is here.\nThe future of AI is' + ' here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of' ], ) diff --git a/tests/ut/ops/test_token_dispatcher.py b/tests/ut/ops/test_token_dispatcher.py index d27da5cf..ff9e3cc1 100644 --- a/tests/ut/ops/test_token_dispatcher.py +++ b/tests/ut/ops/test_token_dispatcher.py @@ -164,11 +164,11 @@ class TestTokenDispatcherWithAllGather(TestBase): self.dispatcher = TokenDispatcherWithAllGather(**kwargs) # Mock NPU functions - self.patcher_npu_moe_init_routing_v2 = patch( - 'torch_npu.npu_moe_init_routing_v2') - self.mock_npu_moe_init_routing_v2 = self.patcher_npu_moe_init_routing_v2.start( + self.patcher_npu_moe_init_routing_custom = patch( + 'torch.ops._C_ascend.npu_moe_init_routing_custom') + self.mock_npu_moe_init_routing_custom = self.patcher_npu_moe_init_routing_custom.start( ) - self.mock_npu_moe_init_routing_v2.return_value = ( + self.mock_npu_moe_init_routing_custom.return_value = ( torch.randn(6, 128), # sorted_hidden_states torch.tensor([0, 1, 2, 3, 4, 5]), # expanded_row_idx torch.tensor([0, 1, 0, 1, 0, 1]), # expanded_expert_idx @@ -180,7 +180,7 @@ class TestTokenDispatcherWithAllGather(TestBase): self.mock_npu_moe_token_unpermute.return_value = torch.randn(6, 128) def tearDown(self): - self.patcher_npu_moe_init_routing_v2.stop() + self.patcher_npu_moe_init_routing_custom.stop() self.patcher_npu_moe_token_unpermute.stop() def test_token_dispatch_without_expert_map(self): @@ -192,8 +192,8 @@ class TestTokenDispatcherWithAllGather(TestBase): topk_ids, None) # Verify npu_moe_init_routing is called - self.mock_npu_moe_init_routing_v2.assert_called_once() - args, kwargs = self.mock_npu_moe_init_routing_v2.call_args + self.mock_npu_moe_init_routing_custom.assert_called_once() + args, kwargs = self.mock_npu_moe_init_routing_custom.call_args self.assertEqual(results.group_list_type, 1) @@ -207,8 +207,8 @@ class TestTokenDispatcherWithAllGather(TestBase): topk_ids, None) # Verify npu_moe_init_routing is called - self.mock_npu_moe_init_routing_v2.assert_called_once() - args, kwargs = self.mock_npu_moe_init_routing_v2.call_args + self.mock_npu_moe_init_routing_custom.assert_called_once() + args, kwargs = self.mock_npu_moe_init_routing_custom.call_args self.assertEqual(results.group_list_type, 1) @@ -366,11 +366,11 @@ class TestTokenDispatcherWithAll2AllV(TestBase): self.mock_npu_dynamic_quant.return_value = (torch.randn(16, 16), torch.randn(16)) - # Mock torch_npu.npu_moe_init_routing_v2 - patcher11 = patch('torch_npu.npu_moe_init_routing_v2') - self.mock_npu_moe_init_routing_v2 = patcher11.start() + # Mock torch.ops._C_ascend.npu_moe_init_routing_custom + patcher11 = patch('torch.ops._C_ascend.npu_moe_init_routing_custom') + self.mock_npu_moe_init_routing_custom = patcher11.start() self.addCleanup(patcher11.stop) - self.mock_npu_moe_init_routing_v2.return_value = (torch.randn( + self.mock_npu_moe_init_routing_custom.return_value = (torch.randn( 16, 16), torch.arange(16), None, torch.randn(16)) # Mock torch.repeat_interleave diff --git a/vllm_ascend/ops/fused_moe/token_dispatcher.py b/vllm_ascend/ops/fused_moe/token_dispatcher.py index b40a0583..d90f4c71 100644 --- a/vllm_ascend/ops/fused_moe/token_dispatcher.py +++ b/vllm_ascend/ops/fused_moe/token_dispatcher.py @@ -354,7 +354,7 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher): global_num_experts = self.num_experts_local sorted_hidden_states, expanded_row_idx, expert_tokens, pertoken_scale = ( - torch_npu.npu_moe_init_routing_v2( + torch.ops._C_ascend.npu_moe_init_routing_custom( hidden_states, topk_ids, scale=pertoken_scale,