[OP] Enable custom op aclnnMoeInitRoutingCustom (#5332)
### What this PR does / why we need it?
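This PR enables the custom op `aclnnMoeInitRoutingCustom`, introduced in PR #5251, by replacing calls to `torch_npu.npu_moe_init_routing_v2` with `torch.ops._C_ascend.npu_moe_init_routing_custom`.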
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
- vLLM version: release/v0.13.0
- vLLM main: bc0a5a0c08
---------
Signed-off-by: QianChenxi <chenxi.qian.cq@outlook.com>
Signed-off-by: zzzzwwjj <1183291235@qq.com>
Co-authored-by: zzzzwwjj <1183291235@qq.com>
This commit is contained in:
@@ -36,10 +36,10 @@ CASE_DS_ACLGRAPH = LLMTestCase(
|
|||||||
quantization="ascend",
|
quantization="ascend",
|
||||||
prompts=PROMPTS_SHORT,
|
prompts=PROMPTS_SHORT,
|
||||||
golden_answers=[
|
golden_answers=[
|
||||||
'\nI am a 20 year old student from the UK. I am currently studying for a degree in English Literature and Creative Writing. I have a passion',
|
'\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2',
|
||||||
' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the',
|
' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the',
|
||||||
' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art',
|
' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art',
|
||||||
' here.\nThe future of AI is here.\nThe future of AI is here.\nThe future of AI is here.\nThe future of AI is'
|
' here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of'
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -164,11 +164,11 @@ class TestTokenDispatcherWithAllGather(TestBase):
|
|||||||
self.dispatcher = TokenDispatcherWithAllGather(**kwargs)
|
self.dispatcher = TokenDispatcherWithAllGather(**kwargs)
|
||||||
|
|
||||||
# Mock NPU functions
|
# Mock NPU functions
|
||||||
self.patcher_npu_moe_init_routing_v2 = patch(
|
self.patcher_npu_moe_init_routing_custom = patch(
|
||||||
'torch_npu.npu_moe_init_routing_v2')
|
'torch.ops._C_ascend.npu_moe_init_routing_custom')
|
||||||
self.mock_npu_moe_init_routing_v2 = self.patcher_npu_moe_init_routing_v2.start(
|
self.mock_npu_moe_init_routing_custom = self.patcher_npu_moe_init_routing_custom.start(
|
||||||
)
|
)
|
||||||
self.mock_npu_moe_init_routing_v2.return_value = (
|
self.mock_npu_moe_init_routing_custom.return_value = (
|
||||||
torch.randn(6, 128), # sorted_hidden_states
|
torch.randn(6, 128), # sorted_hidden_states
|
||||||
torch.tensor([0, 1, 2, 3, 4, 5]), # expanded_row_idx
|
torch.tensor([0, 1, 2, 3, 4, 5]), # expanded_row_idx
|
||||||
torch.tensor([0, 1, 0, 1, 0, 1]), # expanded_expert_idx
|
torch.tensor([0, 1, 0, 1, 0, 1]), # expanded_expert_idx
|
||||||
@@ -180,7 +180,7 @@ class TestTokenDispatcherWithAllGather(TestBase):
|
|||||||
self.mock_npu_moe_token_unpermute.return_value = torch.randn(6, 128)
|
self.mock_npu_moe_token_unpermute.return_value = torch.randn(6, 128)
|
||||||
|
|
||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
self.patcher_npu_moe_init_routing_v2.stop()
|
self.patcher_npu_moe_init_routing_custom.stop()
|
||||||
self.patcher_npu_moe_token_unpermute.stop()
|
self.patcher_npu_moe_token_unpermute.stop()
|
||||||
|
|
||||||
def test_token_dispatch_without_expert_map(self):
|
def test_token_dispatch_without_expert_map(self):
|
||||||
@@ -192,8 +192,8 @@ class TestTokenDispatcherWithAllGather(TestBase):
|
|||||||
topk_ids, None)
|
topk_ids, None)
|
||||||
|
|
||||||
# Verify npu_moe_init_routing is called
|
# Verify npu_moe_init_routing_custom is called
|
||||||
self.mock_npu_moe_init_routing_v2.assert_called_once()
|
self.mock_npu_moe_init_routing_custom.assert_called_once()
|
||||||
args, kwargs = self.mock_npu_moe_init_routing_v2.call_args
|
args, kwargs = self.mock_npu_moe_init_routing_custom.call_args
|
||||||
|
|
||||||
self.assertEqual(results.group_list_type, 1)
|
self.assertEqual(results.group_list_type, 1)
|
||||||
|
|
||||||
@@ -207,8 +207,8 @@ class TestTokenDispatcherWithAllGather(TestBase):
|
|||||||
topk_ids, None)
|
topk_ids, None)
|
||||||
|
|
||||||
# Verify npu_moe_init_routing is called
|
# Verify npu_moe_init_routing_custom is called
|
||||||
self.mock_npu_moe_init_routing_v2.assert_called_once()
|
self.mock_npu_moe_init_routing_custom.assert_called_once()
|
||||||
args, kwargs = self.mock_npu_moe_init_routing_v2.call_args
|
args, kwargs = self.mock_npu_moe_init_routing_custom.call_args
|
||||||
|
|
||||||
self.assertEqual(results.group_list_type, 1)
|
self.assertEqual(results.group_list_type, 1)
|
||||||
|
|
||||||
@@ -366,11 +366,11 @@ class TestTokenDispatcherWithAll2AllV(TestBase):
|
|||||||
self.mock_npu_dynamic_quant.return_value = (torch.randn(16, 16),
|
self.mock_npu_dynamic_quant.return_value = (torch.randn(16, 16),
|
||||||
torch.randn(16))
|
torch.randn(16))
|
||||||
|
|
||||||
# Mock torch_npu.npu_moe_init_routing_v2
|
# Mock torch.ops._C_ascend.npu_moe_init_routing_custom
|
||||||
patcher11 = patch('torch_npu.npu_moe_init_routing_v2')
|
patcher11 = patch('torch.ops._C_ascend.npu_moe_init_routing_custom')
|
||||||
self.mock_npu_moe_init_routing_v2 = patcher11.start()
|
self.mock_npu_moe_init_routing_custom = patcher11.start()
|
||||||
self.addCleanup(patcher11.stop)
|
self.addCleanup(patcher11.stop)
|
||||||
self.mock_npu_moe_init_routing_v2.return_value = (torch.randn(
|
self.mock_npu_moe_init_routing_custom.return_value = (torch.randn(
|
||||||
16, 16), torch.arange(16), None, torch.randn(16))
|
16, 16), torch.arange(16), None, torch.randn(16))
|
||||||
|
|
||||||
# Mock torch.repeat_interleave
|
# Mock torch.repeat_interleave
|
||||||
|
|||||||
@@ -354,7 +354,7 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher):
|
|||||||
global_num_experts = self.num_experts_local
|
global_num_experts = self.num_experts_local
|
||||||
|
|
||||||
sorted_hidden_states, expanded_row_idx, expert_tokens, pertoken_scale = (
|
sorted_hidden_states, expanded_row_idx, expert_tokens, pertoken_scale = (
|
||||||
torch_npu.npu_moe_init_routing_v2(
|
torch.ops._C_ascend.npu_moe_init_routing_custom(
|
||||||
hidden_states,
|
hidden_states,
|
||||||
topk_ids,
|
topk_ids,
|
||||||
scale=pertoken_scale,
|
scale=pertoken_scale,
|
||||||
|
|||||||
Reference in New Issue
Block a user