Revert "[FEAT] Support GGUF format" (#2285)

This commit is contained in:
Lianmin Zheng
2024-11-30 19:03:26 -08:00
committed by GitHub
parent d622851dc9
commit 7e4c6dd8da
39 changed files with 89 additions and 180 deletions

View File

@@ -557,29 +557,6 @@ def monkey_patch_vllm_all_gather(reverse: bool = False):
setattr(GroupCoordinator, "all_gather", all_gather)
def monkey_patch_vllm_gguf_config():
from vllm.model_executor.layers.linear import LinearBase
from vllm.model_executor.layers.quantization.gguf import (
GGUFConfig,
GGUFEmbeddingMethod,
GGUFLinearMethod,
)
from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
def get_quant_method_with_embedding_replaced(
self, layer: torch.nn.Module, prefix: str
) -> Optional["QuantizeMethodBase"]:
if isinstance(layer, LinearBase):
return GGUFLinearMethod(self)
elif isinstance(layer, VocabParallelEmbedding):
# patch to own VocabParallelEmbedding
return GGUFEmbeddingMethod(self)
return None
setattr(GGUFConfig, "get_quant_method", get_quant_method_with_embedding_replaced)
def maybe_set_triton_cache_manager() -> None:
"""Set environment variable to tell Triton to use a
custom cache manager"""