add pkgs

2025-08-06 15:49:14 +08:00
parent e80b916c52
commit bf00e72fb2
111 changed files with 21880 additions and 1 deletions
--- a/examples/qwen/smoothquant.py
+++ b/examples/qwen/smoothquant.py
@@ -0,0 +1,209 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+'''
+Utilities for SmoothQuant models
+'''
+
+import functools
+import os
+import sys
+from collections import defaultdict
+
+import numpy as np
+import torch
+import torch.nn as nn
+from tqdm import tqdm
+from transformers.pytorch_utils import Conv1D
+
+project_dir = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(project_dir)
+from utils.utils import make_context
+
+
+@torch.no_grad()
+def apply_smoothing(scales,
+                    gemm_weights,
+                    rmsnorm_weights=None,
+                    dtype=torch.float32,
+                    rmsnorm_1p=False):
+    if not isinstance(gemm_weights, list):
+        gemm_weights = [gemm_weights]
+
+    if rmsnorm_weights is not None:
+        assert rmsnorm_weights.numel() == scales.numel()
+        rmsnorm_weights.div_(scales).to(dtype)
+    if rmsnorm_1p:
+        rmsnorm_weights += (1 / scales) - 1
+
+    for gemm in gemm_weights:
+        gemm.mul_(scales.view(1, -1)).to(dtype)
+
+
+@torch.no_grad()
+def smooth_gemm(gemm_weights,
+                act_scales,
+                rmsnorm_weights=None,
+                alpha=0.5,
+                weight_scales=None):
+    if not isinstance(gemm_weights, list):
+        gemm_weights = [gemm_weights]
+    orig_dtype = gemm_weights[0].dtype
+
+    for gemm in gemm_weights:
+        # gemm_weights are expected to be transposed
+        assert gemm.shape[1] == act_scales.numel()
+
+    if weight_scales is None:
+        weight_scales = torch.cat(
+            [gemm.abs().max(dim=0, keepdim=True)[0] for gemm in gemm_weights],
+            dim=0)
+        weight_scales = weight_scales.max(dim=0)[0]
+    weight_scales.to(float).clamp(min=1e-5)
+    scales = (act_scales.to(gemm_weights[0].device).to(float).pow(alpha) /
+              weight_scales.pow(1 - alpha)).clamp(min=1e-5)
+
+    apply_smoothing(scales, gemm_weights, rmsnorm_weights, orig_dtype)
+
+    return scales
+
+
+@torch.no_grad()
+def smooth_gemm_mlp(w1_weights,
+                    w2_weights,
+                    act_scales,
+                    rmsnorm_weights=None,
+                    alpha=0.5,
+                    weight_scales=None):
+    gemm_weights = []
+    if not isinstance(w1_weights, list):
+        w1_weights = [w1_weights]
+    if not isinstance(w2_weights, list):
+        w2_weights = [w2_weights]
+
+    for i in range(len(w1_weights)):
+        gemm_weight = torch.cat([w1_weights[i], w2_weights[i]], dim=0)
+        gemm_weights.append(gemm_weight)
+
+    orig_dtype = gemm_weights[0].dtype
+
+    for gemm in gemm_weights:
+        # gemm_weights are expected to be transposed
+        assert gemm.shape[1] == act_scales.numel()
+
+    if weight_scales is None:
+        weight_scales = torch.cat(
+            [gemm.abs().max(dim=0, keepdim=True)[0] for gemm in gemm_weights],
+            dim=0)
+        weight_scales = weight_scales.max(dim=0)[0]
+    weight_scales.to(float).clamp(min=1e-5)
+    scales = (act_scales.to(gemm_weights[0].device).to(float).pow(alpha) /
+              weight_scales.pow(1 - alpha)).clamp(min=1e-5)
+
+    apply_smoothing(scales, w1_weights + w2_weights, rmsnorm_weights,
+                    orig_dtype)
+
+    return scales
+
+
+@torch.no_grad()
+def smooth_ln_fcs(ln, fcs, act_scales, alpha=0.5):
+    if not isinstance(fcs, list):
+        fcs = [fcs]
+    for fc in fcs:
+        assert isinstance(fc, nn.Linear)
+        assert ln.weight.numel() == fc.in_features == act_scales.numel()
+
+    device, dtype = fcs[0].weight.device, fcs[0].weight.dtype
+    act_scales = act_scales.to(device=device, dtype=dtype)
+    weight_scales = torch.cat(
+        [fc.weight.abs().max(dim=0, keepdim=True)[0] for fc in fcs], dim=0)
+    weight_scales = weight_scales.max(dim=0)[0].clamp(min=1e-5)
+
+    scales = (act_scales.pow(alpha) /
+              weight_scales.pow(1 - alpha)).clamp(min=1e-5).to(device).to(dtype)
+
+    if ln is not None:
+        ln.weight.div_(scales)
+        ln.bias.div_(scales)
+
+    for fc in fcs:
+        fc.weight.mul_(scales.view(1, -1))
+    return scales
+
+
+@torch.no_grad()
+def capture_activation_range(
+    model,
+    tokenizer,
+    dataset,
+    system_prompt,
+    chat_format,
+    max_input_len,
+    num_samples=512,
+):
+    model.eval()
+    device = next(model.parameters()).device
+    act_scales = defaultdict(lambda: {"x": None, "y": None, "w": None})
+
+    def stat_tensor(name, tensor, act_scales, key):
+        hidden_dim = tensor.shape[-1]
+        tensor = tensor.view(-1, hidden_dim).abs().detach()
+        comming_max = torch.max(tensor, dim=0)[0].float()
+
+        if act_scales[name][key] is None:
+            act_scales[name][key] = comming_max
+        else:
+            act_scales[name][key] = torch.max(act_scales[name][key],
+                                              comming_max)
+
+    def stat_input_hook(m, x, y, name):
+        if isinstance(x, tuple):
+            x = x[0]
+        stat_tensor(name, x, act_scales, "x")
+        stat_tensor(name, y, act_scales, "y")
+
+        if act_scales[name]["w"] is None:
+            act_scales[name]["w"] = m.weight.abs().clip(1e-8,
+                                                        None).max(dim=1)[0]
+
+    hooks = []
+    for name, m in model.named_modules():
+        if isinstance(m, nn.Linear) or isinstance(m, Conv1D):
+            hooks.append(
+                m.register_forward_hook(
+                    functools.partial(stat_input_hook, name=name)))
+    num_samples = min(num_samples, len(dataset))
+    for i in tqdm(range(num_samples), desc="calibrating model"):
+        line = dataset[i]["article"]
+        line = line + ' TL;DR: '
+        line = line.strip()
+        line = line.replace(" n't", "n't")
+        # use make_content to generate prompt
+        _, input_id_list = make_context(tokenizer=tokenizer,
+                                        query=line,
+                                        history=[],
+                                        system=system_prompt,
+                                        chat_format=chat_format,
+                                        max_input_length=max_input_len)
+        line_encoded = torch.from_numpy(np.array(
+            input_id_list, dtype=np.int32)).type(torch.int32).unsqueeze(0)
+        line_encoded = line_encoded.to(device)
+        model(line_encoded)
+
+    for h in hooks:
+        h.remove()
+
+    return act_scales