[0.11.0] Cherry-pick PTA upgrade change (#3940)
This PR cherry-picks two commits from main to upgrade torch-npu to the 2.7.1 official release. --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -7,7 +7,6 @@ from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
|
||||
from tests.ut.base import PytestBase
|
||||
from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
|
||||
from vllm_ascend.utils import version_check
|
||||
|
||||
|
||||
def mock_rms_norm(x, weight, eps):
|
||||
@@ -18,15 +17,6 @@ def mock_add_rms_norm(x, residual, weight, eps):
|
||||
return 2 * x, None, 2 * residual
|
||||
|
||||
|
||||
def mock_add_rms_norm_quant(x, residual, weight, quant_scale, quant_offset,
|
||||
epsilon):
|
||||
x_out = 2 * x
|
||||
residual_out = 2 * residual
|
||||
x_out_quant = x_out.to(torch.int8)
|
||||
residual_out_quant = residual_out.to(torch.int8)
|
||||
return x_out_quant, None, residual_out_quant
|
||||
|
||||
|
||||
def mock_add_rms_norm_quant_with_bias(x, residual, weight, quant_scale,
|
||||
quant_offset, beta, epsilon):
|
||||
x_out = 2 * x
|
||||
@@ -43,10 +33,8 @@ class TestAscendRMSNorm(PytestBase):
|
||||
mocker.patch("torch_npu.npu_rms_norm", side_effect=mock_rms_norm)
|
||||
mocker.patch("torch_npu.npu_add_rms_norm",
|
||||
side_effect=mock_add_rms_norm)
|
||||
torch_npu_check = version_check()
|
||||
arnq_side_effect = mock_add_rms_norm_quant_with_bias if torch_npu_check else mock_add_rms_norm_quant
|
||||
mocker.patch("torch_npu.npu_add_rms_norm_quant",
|
||||
side_effect=arnq_side_effect)
|
||||
side_effect=mock_add_rms_norm_quant_with_bias)
|
||||
mocker.patch("torch.ops.vllm.maybe_wait_prefetch_done",
|
||||
side_effect=lambda x: None)
|
||||
|
||||
@@ -82,8 +70,7 @@ class TestAscendRMSNorm(PytestBase):
|
||||
|
||||
mock_model_instance = mocker.MagicMock()
|
||||
mock_forward_context.model_instance = mock_model_instance
|
||||
torch_npu_check = version_check()
|
||||
num_hidden_layers = 3 if torch_npu_check else 2
|
||||
num_hidden_layers = 3
|
||||
mock_model_instance.model.layers = [
|
||||
mocker.MagicMock() for _ in range(num_hidden_layers)
|
||||
]
|
||||
@@ -136,37 +123,34 @@ class TestAscendRMSNorm(PytestBase):
|
||||
assert mock_forward_context.fusion_linear == "gate_up_dense"
|
||||
assert mock_forward_context.layer_idx == 1
|
||||
|
||||
if torch_npu_check:
|
||||
mock_forward_context.fusion_linear = "gate_moe"
|
||||
mock_forward_context.fusion_linear = "gate_moe"
|
||||
x_out, residual_out = layer.forward_oot(x, residual)
|
||||
|
||||
assert mock_get_forward_context.call_count == 5
|
||||
fusion_linear_expected = "qkv_moe"
|
||||
assert mock_forward_context.fusion_linear == fusion_linear_expected
|
||||
assert mock_forward_context.layer_idx == 2
|
||||
|
||||
x_out, residual_out = layer.forward_oot(x, residual)
|
||||
|
||||
assert mock_get_forward_context.call_count == 6
|
||||
fusion_linear_expected = "qkv_moe" if torch_npu_check else "qkv_dense"
|
||||
fusion_linear_expected = "gate_moe"
|
||||
assert mock_forward_context.fusion_linear == fusion_linear_expected
|
||||
assert mock_forward_context.layer_idx == 2
|
||||
|
||||
# last layer returned directly
|
||||
x_out, residual_out = layer.forward_oot(x, residual)
|
||||
|
||||
assert mock_get_forward_context.call_count == 7
|
||||
fusion_linear_expected = "gate_moe" if torch_npu_check else "qkv_dense"
|
||||
assert mock_forward_context.fusion_linear == fusion_linear_expected
|
||||
assert mock_forward_context.layer_idx == 2
|
||||
assert mock_forward_context.fusion_linear == "qkv_moe"
|
||||
assert mock_forward_context.layer_idx == 3
|
||||
|
||||
if not torch_npu_check:
|
||||
return
|
||||
# last layer returned directly
|
||||
x_out, residual_out = layer.forward_oot(x, residual)
|
||||
|
||||
assert mock_get_forward_context.call_count == 8
|
||||
assert mock_forward_context.fusion_linear == "qkv_moe"
|
||||
assert mock_forward_context.layer_idx == 3
|
||||
|
||||
x_out, residual_out = layer.forward_oot(x, residual)
|
||||
|
||||
assert mock_get_forward_context.call_count == 9
|
||||
assert mock_forward_context.fusion_linear == "qkv_moe"
|
||||
assert mock_forward_context.layer_idx == 3
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
@@ -23,9 +23,9 @@ class TestAscendW8A8FusedMoEMethod(TestBase):
|
||||
@patch("torch_npu.npu_swiglu")
|
||||
@patch("torch_npu.npu_dynamic_quant")
|
||||
@patch("torch_npu.npu_moe_finalize_routing")
|
||||
@patch("torch_npu.npu_moe_init_routing")
|
||||
@patch("torch_npu.npu_moe_init_routing_quant")
|
||||
def test_torchair_fused_experts_with_all2all(
|
||||
self, mock_moe_init_routing, mock_moe_finalize_routing,
|
||||
self, mock_npu_moe_init_routing_quant, mock_moe_finalize_routing,
|
||||
mock_dynamic_quant, mock_swiglu, mock_grouped_matmul,
|
||||
mock_moe_re_routing, mock_all_to_all_single):
|
||||
|
||||
@@ -38,11 +38,10 @@ class TestAscendW8A8FusedMoEMethod(TestBase):
|
||||
placeholder_ones = torch.ones(self.num_tokens, dtype=torch.int32)
|
||||
mock_all_to_all_single.side_effect = lambda output, input, *args, **kwargs: output.copy_(
|
||||
input)
|
||||
mock_moe_init_routing.return_value = (
|
||||
placeholder_int8,
|
||||
placeholder_ones,
|
||||
placeholder_ones,
|
||||
)
|
||||
mock_npu_moe_init_routing_quant.return_value = (
|
||||
placeholder_int8, placeholder_ones, placeholder_ones,
|
||||
torch.bincount(placeholder_ones, minlength=len(expert_map)),
|
||||
torch.randn(self.num_tokens))
|
||||
mock_moe_re_routing.return_value = (placeholder_int8, self.placeholder,
|
||||
torch.randint(0,
|
||||
100,
|
||||
|
||||
Reference in New Issue
Block a user