init v0.11.0rc0
This commit is contained in:
@@ -1,29 +0,0 @@
|
||||
from vllm_ascend.quantization.quantizer import VLLMAscendQuantizer
|
||||
from vllm_ascend.torchair.quantization.torchair_w4a8_dynamic import (
|
||||
TorchairAscendW4A8DynamicFusedMoEMethod,
|
||||
TorchairAscendW4A8DynamicLinearMethod)
|
||||
from vllm_ascend.torchair.quantization.torchair_w8a8_dynamic import (
|
||||
TorchairAscendW8A8DynamicFusedMoEMethod,
|
||||
TorchairAscendW8A8DynamicLinearMethod)
|
||||
|
||||
|
||||
class TorchairW8A8DYNAMICQuantizer(VLLMAscendQuantizer):
    """Quantizer that hands out the torchair W8A8 dynamic quantization methods.

    Both builders are static and take no arguments; each call constructs a
    fresh method object.
    """

    @staticmethod
    def build_linear_method():
        """Return a new ``TorchairAscendW8A8DynamicLinearMethod`` instance."""
        return TorchairAscendW8A8DynamicLinearMethod()

    @staticmethod
    def build_moe_method():
        """Return a new ``TorchairAscendW8A8DynamicFusedMoEMethod`` instance."""
        return TorchairAscendW8A8DynamicFusedMoEMethod()
|
||||
|
||||
|
||||
class TorchairW4A8DYNAMICQuantizer(VLLMAscendQuantizer):
    """Quantizer that hands out the torchair W4A8 dynamic quantization methods.

    Both builders are static and take no arguments; each call constructs a
    fresh method object.
    """

    @staticmethod
    def build_linear_method():
        """Return a new ``TorchairAscendW4A8DynamicLinearMethod`` instance."""
        return TorchairAscendW4A8DynamicLinearMethod()

    @staticmethod
    def build_moe_method():
        """Return a new ``TorchairAscendW4A8DynamicFusedMoEMethod`` instance."""
        return TorchairAscendW4A8DynamicFusedMoEMethod()
|
||||
@@ -139,6 +139,8 @@ class TorchairAscendW4A8DynamicFusedMoEMethod:
|
||||
vllm_config = get_current_vllm_config()
|
||||
self.group_size = vllm_config.quant_config.quant_description.get(
|
||||
"group_size", 256)
|
||||
# NOTE: the weights are quantized from bf16 to int4 through a per-channel quantization process
|
||||
self.is_per_channel_weight = self.group_size == 0
|
||||
quant_version = vllm_config.quant_config.quant_description.get(
|
||||
"version", "0")
|
||||
# NOTE: in the new quantized weight format, two int4 values are packed into one int8
|
||||
@@ -188,44 +190,45 @@ class TorchairAscendW4A8DynamicFusedMoEMethod:
|
||||
num_experts,
|
||||
2 * intermediate_size_per_partition,
|
||||
1,
|
||||
dtype=params_dtype)
|
||||
dtype=torch.float32)
|
||||
|
||||
param_dict["w13_weight_offset"] = torch.empty(
|
||||
num_experts,
|
||||
2 * intermediate_size_per_partition,
|
||||
1,
|
||||
dtype=params_dtype)
|
||||
|
||||
param_dict["w13_weight_scale_second"] = torch.empty(
|
||||
num_experts,
|
||||
2 * intermediate_size_per_partition,
|
||||
hidden_sizes // self.group_size,
|
||||
dtype=params_dtype)
|
||||
|
||||
param_dict["w13_weight_offset_second"] = torch.empty(
|
||||
num_experts,
|
||||
2 * intermediate_size_per_partition,
|
||||
hidden_sizes // self.group_size,
|
||||
dtype=params_dtype)
|
||||
dtype=torch.float32)
|
||||
|
||||
param_dict["w2_weight_scale"] = torch.empty(num_experts,
|
||||
hidden_sizes,
|
||||
1,
|
||||
dtype=params_dtype)
|
||||
dtype=torch.float32)
|
||||
param_dict["w2_weight_offset"] = torch.empty(num_experts,
|
||||
hidden_sizes,
|
||||
1,
|
||||
dtype=params_dtype)
|
||||
param_dict["w2_weight_scale_second"] = torch.empty(
|
||||
num_experts,
|
||||
hidden_sizes,
|
||||
intermediate_size_per_partition // self.group_size,
|
||||
dtype=params_dtype)
|
||||
param_dict["w2_weight_offset_second"] = torch.empty(
|
||||
num_experts,
|
||||
hidden_sizes,
|
||||
intermediate_size_per_partition // self.group_size,
|
||||
dtype=params_dtype)
|
||||
dtype=torch.float32)
|
||||
|
||||
if not self.is_per_channel_weight:
|
||||
param_dict["w13_weight_scale_second"] = torch.empty(
|
||||
num_experts,
|
||||
2 * intermediate_size_per_partition,
|
||||
hidden_sizes // self.group_size,
|
||||
dtype=torch.float32)
|
||||
param_dict["w13_weight_offset_second"] = torch.empty(
|
||||
num_experts,
|
||||
2 * intermediate_size_per_partition,
|
||||
hidden_sizes // self.group_size,
|
||||
dtype=torch.float32)
|
||||
|
||||
param_dict["w2_weight_scale_second"] = torch.empty(
|
||||
num_experts,
|
||||
hidden_sizes,
|
||||
intermediate_size_per_partition // self.group_size,
|
||||
dtype=torch.float32)
|
||||
param_dict["w2_weight_offset_second"] = torch.empty(
|
||||
num_experts,
|
||||
hidden_sizes,
|
||||
intermediate_size_per_partition // self.group_size,
|
||||
dtype=torch.float32)
|
||||
|
||||
if self.new_quant_version:
|
||||
param_dict["w13_scale_bias"] = torch.empty(
|
||||
@@ -318,8 +321,8 @@ class TorchairAscendW4A8DynamicFusedMoEMethod:
|
||||
hidden_states=x,
|
||||
w1=layer.w13_weight,
|
||||
w2=layer.w2_weight,
|
||||
w1_scale=layer.w13_weight_scale_second,
|
||||
w2_scale=layer.w2_weight_scale_second,
|
||||
w1_scale=layer.w13_weight_scale,
|
||||
w2_scale=layer.w2_weight_scale,
|
||||
w1_scale_bias=layer.w13_scale_bias,
|
||||
w2_scale_bias=layer.w2_scale_bias,
|
||||
topk_weights=topk_weights,
|
||||
@@ -343,8 +346,8 @@ class TorchairAscendW4A8DynamicFusedMoEMethod:
|
||||
hidden_states=x,
|
||||
w1=layer.w13_weight,
|
||||
w2=layer.w2_weight,
|
||||
w1_scale=layer.w13_weight_scale_second,
|
||||
w2_scale=layer.w2_weight_scale_second,
|
||||
w1_scale=layer.w13_weight_scale,
|
||||
w2_scale=layer.w2_weight_scale,
|
||||
w1_scale_bias=layer.w13_scale_bias,
|
||||
w2_scale_bias=layer.w2_scale_bias,
|
||||
topk_weights=topk_weights,
|
||||
@@ -357,6 +360,14 @@ class TorchairAscendW4A8DynamicFusedMoEMethod:
|
||||
)
|
||||
|
||||
def process_scale(self, weight: torch.Tensor, scale, per_group_scale):
|
||||
scale = scale.transpose(1, 2).contiguous()
|
||||
if self.is_per_channel_weight:
|
||||
scale_np = scale.cpu().numpy()
|
||||
scale_np.dtype = np.uint32
|
||||
scale_uint64_tensor = torch.from_numpy(scale_np.astype(
|
||||
np.int64)).npu()
|
||||
return scale_uint64_tensor, None
|
||||
per_group_scale = per_group_scale.transpose(1, 2).contiguous()
|
||||
group_num, k, n = weight.shape
|
||||
# new-version weights have their n dimension halved by packing, so n needs to be restored
|
||||
if self.new_quant_version:
|
||||
@@ -399,13 +410,10 @@ class TorchairAscendW4A8DynamicFusedMoEMethod:
|
||||
|
||||
def pack_to_int32(self, weight: torch.Tensor):
|
||||
if self.new_quant_version:
|
||||
group_num, k, n = weight.shape
|
||||
assert n % 4 == 0, "the last dim of weight needs to be divided by 4"
|
||||
packed_n = n // 4
|
||||
# pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4
|
||||
packed_weight = torch.from_numpy(
|
||||
np.frombuffer(weight.cpu().numpy().tobytes(), dtype=np.int32))
|
||||
return packed_weight.reshape(group_num, k, packed_n).npu()
|
||||
assert weight.shape[
|
||||
-1] % 4 == 0, "the last dim of weight needs to be divided by 4"
|
||||
return weight.view(torch.int32).contiguous()
|
||||
else:
|
||||
return torch_npu.npu_quantize(weight.to(torch.float32),
|
||||
torch.tensor([1.]).npu(), None,
|
||||
@@ -417,21 +425,22 @@ class TorchairAscendW4A8DynamicFusedMoEMethod:
|
||||
1, 2).contiguous()
|
||||
layer.w2_weight.data = layer.w2_weight.data.transpose(
|
||||
1, 2).contiguous()
|
||||
layer.w13_weight_scale.data = layer.w13_weight_scale.data.transpose(
|
||||
1, 2).contiguous()
|
||||
layer.w2_weight_scale.data = layer.w2_weight_scale.data.transpose(
|
||||
1, 2).contiguous()
|
||||
layer.w13_weight_scale_second.data = layer.w13_weight_scale_second.data.transpose(
|
||||
1, 2).contiguous()
|
||||
layer.w2_weight_scale_second.data = layer.w2_weight_scale_second.data.transpose(
|
||||
1, 2).contiguous()
|
||||
|
||||
layer.w13_weight_scale_second.data, w13_bias = self.process_scale(
|
||||
w13_weight_scale_second = layer.w13_weight_scale_second.data if hasattr(
|
||||
layer, "w13_weight_scale_second") else None
|
||||
w2_weight_scale_second = layer.w2_weight_scale_second.data if hasattr(
|
||||
layer, "w2_weight_scale_second") else None
|
||||
layer.w13_weight_scale.data, w13_bias = self.process_scale(
|
||||
layer.w13_weight, layer.w13_weight_scale.data,
|
||||
layer.w13_weight_scale_second.data)
|
||||
layer.w2_weight_scale_second.data, w2_bias = self.process_scale(
|
||||
w13_weight_scale_second)
|
||||
layer.w2_weight_scale.data, w2_bias = self.process_scale(
|
||||
layer.w2_weight, layer.w2_weight_scale.data,
|
||||
layer.w2_weight_scale_second.data)
|
||||
w2_weight_scale_second)
|
||||
if hasattr(layer, "w13_weight_scale_second"):
|
||||
# scale_second is no longer used, release this part of the memory
|
||||
del layer.w13_weight_scale_second
|
||||
del layer.w2_weight_scale_second
|
||||
del layer.w13_weight_offset_second
|
||||
del layer.w2_weight_offset_second
|
||||
|
||||
self.update_bias(layer, w13_bias, w2_bias)
|
||||
|
||||
|
||||
@@ -23,7 +23,6 @@ import torch_npu
|
||||
from vllm.distributed import GroupCoordinator, get_ep_group
|
||||
from vllm.forward_context import get_forward_context
|
||||
|
||||
import vllm_ascend.envs as envs_ascend
|
||||
from vllm_ascend.ascend_config import get_ascend_config
|
||||
from vllm_ascend.ascend_forward_context import FusedMoEState
|
||||
from vllm_ascend.distributed.parallel_state import get_mc2_group
|
||||
@@ -417,6 +416,7 @@ def torchair_fused_experts_with_all2all(
|
||||
num_experts = w1.shape[0]
|
||||
|
||||
if expert_map is not None:
|
||||
assert ep_group is not None, "ep_group must be provided when expert_map is given"
|
||||
global_num_experts = len(expert_map) + global_redundant_expert_num
|
||||
if hasattr(torch_npu, "npu_moe_init_routing_quant"):
|
||||
quantized_tokens, expanded_row_idx, global_expert_tokens, _, token_scales = torch_npu.npu_moe_init_routing_quant(
|
||||
@@ -436,8 +436,9 @@ def torchair_fused_experts_with_all2all(
|
||||
|
||||
gather_sizes = global_expert_tokens.new_empty(
|
||||
global_expert_tokens.shape[0])
|
||||
dist.all_to_all_single(gather_sizes, global_expert_tokens)
|
||||
|
||||
dist.all_to_all_single(gather_sizes,
|
||||
global_expert_tokens,
|
||||
group=ep_group.device_group)
|
||||
token_counts_combined = torch.stack(
|
||||
[gather_sizes, global_expert_tokens], dim=0)
|
||||
token_counts_combined = token_counts_combined.view(
|
||||
@@ -452,10 +453,16 @@ def torchair_fused_experts_with_all2all(
|
||||
gather_size_list = token_counts_combined_cpu[1]
|
||||
scatter_size_list = token_counts_combined_cpu[0]
|
||||
|
||||
dist.all_to_all_single(gathered_tokens, quantized_tokens,
|
||||
scatter_size_list, gather_size_list)
|
||||
dist.all_to_all_single(dynamic_scale, token_scales, scatter_size_list,
|
||||
gather_size_list)
|
||||
dist.all_to_all_single(gathered_tokens,
|
||||
quantized_tokens,
|
||||
scatter_size_list,
|
||||
gather_size_list,
|
||||
group=ep_group.device_group)
|
||||
dist.all_to_all_single(dynamic_scale,
|
||||
token_scales,
|
||||
scatter_size_list,
|
||||
gather_size_list,
|
||||
group=ep_group.device_group)
|
||||
|
||||
hidden_states, dynamic_scale, inverse_indices, expert_tokens = torch_npu.npu_moe_re_routing(
|
||||
gathered_tokens,
|
||||
@@ -503,9 +510,11 @@ def torchair_fused_experts_with_all2all(
|
||||
index=inverse_indices.to(torch.float32).argsort().to(torch.int32))
|
||||
|
||||
hidden_states = reordered_outputs.new_empty(*quantized_tokens.shape)
|
||||
dist.all_to_all_single(hidden_states, reordered_outputs,
|
||||
gather_size_list, scatter_size_list)
|
||||
|
||||
dist.all_to_all_single(hidden_states,
|
||||
reordered_outputs,
|
||||
gather_size_list,
|
||||
scatter_size_list,
|
||||
group=ep_group.device_group)
|
||||
final_hidden_states = torch_npu.npu_moe_finalize_routing(
|
||||
hidden_states,
|
||||
skip1=None,
|
||||
@@ -824,6 +833,7 @@ class TorchairAscendW8A8DynamicFusedMoEMethod:
|
||||
|
||||
ascend_config = get_ascend_config()
|
||||
self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
|
||||
self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
|
||||
|
||||
try:
|
||||
device_group = get_mc2_group().device_group
|
||||
@@ -937,6 +947,8 @@ class TorchairAscendW8A8DynamicFusedMoEMethod:
|
||||
)
|
||||
|
||||
fused_moe_state = get_forward_context().fused_moe_state
|
||||
if self.enable_shared_expert_dp and fused_moe_state == FusedMoEState.MC2:
|
||||
fused_moe_state = FusedMoEState.All2All
|
||||
shared_gate_up, shared_dequant_scale = None, None
|
||||
if shared_experts is not None and fused_moe_state == FusedMoEState.MC2:
|
||||
with npu_stream_switch("moe_secondary", 0):
|
||||
@@ -1021,8 +1033,7 @@ class TorchairAscendW8A8DynamicFusedMoEMethod:
|
||||
1, 2).contiguous()
|
||||
layer.w2_weight.data = layer.w2_weight.data.transpose(
|
||||
1, 2).contiguous()
|
||||
if envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP:
|
||||
torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ)
|
||||
torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ)
|
||||
layer.w13_weight_scale.data = layer.w13_weight_scale.data.view(
|
||||
layer.w13_weight_scale.data.shape[0], -1)
|
||||
layer.w13_weight_scale_fp32 = layer.w13_weight_scale.data.to(
|
||||
|
||||
Reference in New Issue
Block a user