[QUANT] Add GPTQModel Dynamic Quantization + lm_head Quantization (#3790)

Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai>
Co-authored-by: ZX-ModelCloud <zx@modelcloud.ai>
This commit is contained in:
Qubitium-ModelCloud
2025-03-05 17:11:00 +08:00
committed by GitHub
parent 583d6af71b
commit 56a724eba3
56 changed files with 1988 additions and 282 deletions

View File

@@ -56,6 +56,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.utils import set_default_torch_dtype
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.models.qwen2 import Qwen2Config, Qwen2ForCausalLM
from sglang.srt.utils import add_prefix
RawImageType = Union[Image.Image, torch.Tensor]
@@ -158,14 +159,14 @@ class Idefics2VisionMLP(nn.Module):
config.intermediate_size,
bias=True,
quant_config=quant_config,
prefix=f"{prefix}.fc1",
prefix=add_prefix("fc1", prefix),
)
self.fc2 = RowParallelLinear(
config.intermediate_size,
config.hidden_size,
bias=True,
quant_config=quant_config,
prefix=f"{prefix}.fc2",
prefix=add_prefix("fc2", prefix),
)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -199,10 +200,14 @@ class Idefics2EncoderLayer(nn.Module):
use_context_forward=False,
use_full_precision_softmax=True,
flatten_batch=False,
prefix=f"{prefix}.self_attn",
prefix=add_prefix("self_attn", prefix),
)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = Idefics2VisionMLP(config, quant_config=quant_config)
self.mlp = Idefics2VisionMLP(
config,
quant_config=quant_config,
prefix=add_prefix("mlp", prefix),
)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
def forward(
@@ -242,6 +247,7 @@ class Idefics2Encoder(nn.Module):
self,
config: PretrainedConfig,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
) -> None:
super().__init__()
@@ -251,8 +257,9 @@ class Idefics2Encoder(nn.Module):
Idefics2EncoderLayer(
config,
quant_config=quant_config,
prefix=add_prefix(f"layers.{i}", prefix),
)
for _ in range(config.num_hidden_layers)
for i in range(config.num_hidden_layers)
]
)
@@ -379,13 +386,18 @@ class Idefics2VisionTransformer(nn.Module):
self,
config: PretrainedConfig,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
) -> None:
super().__init__()
embed_dim = config.hidden_size
self.config = config
self.embeddings = Idefics2VisionEmbeddings(config)
self.encoder = Idefics2Encoder(config=config, quant_config=quant_config)
self.encoder = Idefics2Encoder(
config=config,
quant_config=quant_config,
prefix=add_prefix("encoder", prefix),
)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
def get_input_embeddings(self):
@@ -503,7 +515,7 @@ class BaseResampler(nn.Module):
embed_dim,
bias=False,
quant_config=quant_config,
prefix=f"{prefix}.kv_proj",
prefix=add_prefix("kv_proj", prefix),
)
else:
# Maintain the same return value with ReplicatedLinear.forward
@@ -660,6 +672,7 @@ class MiniCPMVBaseModel(nn.Module):
*,
config: PretrainedConfig,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
):
super().__init__()
# All MiniCPM-V models disable `tie_word_embeddings` but
@@ -669,8 +682,12 @@ class MiniCPMVBaseModel(nn.Module):
self.config = config
self.version = get_version_by_config(self.config)
self.llm = self.init_llm(config=config, quant_config=quant_config)
self.vpm = self.init_vision_module(config, quant_config)
self.llm = self.init_llm(
config=config, quant_config=quant_config, prefix=add_prefix("llm", prefix)
)
self.vpm = self.init_vision_module(
config, quant_config, add_prefix("vpm", prefix)
)
self.vision_dim = (
self.vpm.embed_dim
if self.version == (2, 0)
@@ -679,7 +696,10 @@ class MiniCPMVBaseModel(nn.Module):
self.embed_dim = self.config.hidden_size
self.resampler = self.init_resampler(
self.embed_dim, self.vision_dim, quant_config=quant_config
self.embed_dim,
self.vision_dim,
quant_config=quant_config,
prefix=add_prefix("resampler", prefix),
)
self.logits_processor = LogitsProcessor(config)
@@ -937,6 +957,7 @@ class MiniCPMVBaseModel(nn.Module):
self,
config: Qwen2Config,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
) -> nn.Module:
raise NotImplementedError
@@ -944,6 +965,7 @@ class MiniCPMVBaseModel(nn.Module):
self,
config: PretrainedConfig,
quant_config: Optional[QuantizationConfig],
prefix: str = "",
) -> nn.Module:
raise NotImplementedError
@@ -952,6 +974,7 @@ class MiniCPMVBaseModel(nn.Module):
embed_dim: int,
vision_dim: int,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
) -> nn.Module:
raise NotImplementedError
@@ -1011,24 +1034,27 @@ class MiniCPMV2_6(MiniCPMVBaseModel):
self,
config: PretrainedConfig,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
):
super().__init__(config=config, quant_config=quant_config)
super().__init__(config=config, quant_config=quant_config, prefix=prefix)
assert self.version == (2, 6)
def init_llm(
self,
config: Qwen2Config,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
) -> nn.Module:
return Qwen2ForCausalLM(config=config, quant_config=quant_config)
return Qwen2ForCausalLM(config=config, quant_config=quant_config, prefix=prefix)
def init_vision_module(
self,
config: PretrainedConfig,
quant_config: Optional[QuantizationConfig],
prefix: str = "",
) -> nn.Module:
model = Idefics2VisionTransformer(
config=config.vision_config, quant_config=quant_config
config=config.vision_config, quant_config=quant_config, prefix=prefix
)
if self.config.drop_vision_last_layer:
model.encoder.layers = model.encoder.layers[:-1]
@@ -1042,6 +1068,7 @@ class MiniCPMV2_6(MiniCPMVBaseModel):
embed_dim: int,
vision_dim: int,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
) -> nn.Module:
with set_default_torch_dtype(torch.float16):
# The resampler in 2.6 remains consistent with the one in 2.5.
@@ -1051,6 +1078,7 @@ class MiniCPMV2_6(MiniCPMVBaseModel):
num_heads=embed_dim // 128,
kv_dim=vision_dim,
quant_config=quant_config,
prefix=prefix,
)
return resampler.to(device="cuda", dtype=torch.get_default_dtype())
@@ -1207,6 +1235,7 @@ class MiniCPMV:
self,
config: PretrainedConfig,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
) -> None:
super().__init__()
@@ -1221,7 +1250,9 @@ class MiniCPMV:
raise ValueError("Currently, MiniCPMV only supports versions 2.6")
try:
minicpmv = instance_class(config=config, quant_config=quant_config)
minicpmv = instance_class(
config=config, quant_config=quant_config, prefix=prefix
)
self.minicpmv = minicpmv
except Exception as e:
print(f"Failed to instantiate MiniCPMV: {e}")