[FEAT] Support GGUF format (#2215)

Co-authored-by: Yang Zheng(SW)(Alex) <you@example.com>
This commit is contained in:
Yang Zheng
2024-11-30 16:44:48 +08:00
committed by GitHub
parent 0d6a49bd7d
commit 883c955489
39 changed files with 180 additions and 89 deletions

View File

@@ -557,6 +557,29 @@ def monkey_patch_vllm_all_gather(reverse: bool = False):
setattr(GroupCoordinator, "all_gather", all_gather)
def monkey_patch_vllm_gguf_config():
    """Patch vLLM's ``GGUFConfig`` quant-method dispatch for sglang.

    Replaces ``GGUFConfig.get_quant_method`` with a version that, in addition
    to handing ``LinearBase`` layers a ``GGUFLinearMethod``, routes sglang's
    own ``VocabParallelEmbedding`` layers to ``GGUFEmbeddingMethod`` (vLLM's
    stock dispatch only knows about vLLM's embedding class).
    """
    # Imports are deferred so this module can load without vLLM installed;
    # they only run when the patch is actually applied.
    from vllm.model_executor.layers.linear import LinearBase
    from vllm.model_executor.layers.quantization.gguf import (
        GGUFConfig,
        GGUFEmbeddingMethod,
        GGUFLinearMethod,
    )

    from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding

    def _get_quant_method_embedding_aware(
        self, layer: torch.nn.Module, prefix: str
    ) -> Optional["QuantizeMethodBase"]:
        # Same contract as the method being replaced: return a quant method
        # for supported layer types, None otherwise.
        if isinstance(layer, LinearBase):
            return GGUFLinearMethod(self)
        if isinstance(layer, VocabParallelEmbedding):
            # Dispatch on sglang's VocabParallelEmbedding instead of vLLM's.
            return GGUFEmbeddingMethod(self)
        return None

    setattr(GGUFConfig, "get_quant_method", _get_quant_method_embedding_aware)
def maybe_set_triton_cache_manager() -> None:
"""Set environment variable to tell Triton to use a
custom cache manager"""