[QUANT] Add GPTQModel Dynamic Quantization + lm_head Quantization (#3790)
Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai> Co-authored-by: ZX-ModelCloud <zx@modelcloud.ai>
This commit is contained in:
committed by
GitHub
parent
583d6af71b
commit
56a724eba3
@@ -56,6 +56,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
||||
from sglang.srt.model_loader.utils import set_default_torch_dtype
|
||||
from sglang.srt.model_loader.weight_utils import default_weight_loader
|
||||
from sglang.srt.models.qwen2 import Qwen2Config, Qwen2ForCausalLM
|
||||
from sglang.srt.utils import add_prefix
|
||||
|
||||
RawImageType = Union[Image.Image, torch.Tensor]
|
||||
|
||||
@@ -158,14 +159,14 @@ class Idefics2VisionMLP(nn.Module):
|
||||
config.intermediate_size,
|
||||
bias=True,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.fc1",
|
||||
prefix=add_prefix("fc1", prefix),
|
||||
)
|
||||
self.fc2 = RowParallelLinear(
|
||||
config.intermediate_size,
|
||||
config.hidden_size,
|
||||
bias=True,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.fc2",
|
||||
prefix=add_prefix("fc2", prefix),
|
||||
)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
@@ -199,10 +200,14 @@ class Idefics2EncoderLayer(nn.Module):
|
||||
use_context_forward=False,
|
||||
use_full_precision_softmax=True,
|
||||
flatten_batch=False,
|
||||
prefix=f"{prefix}.self_attn",
|
||||
prefix=add_prefix("self_attn", prefix),
|
||||
)
|
||||
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
|
||||
self.mlp = Idefics2VisionMLP(config, quant_config=quant_config)
|
||||
self.mlp = Idefics2VisionMLP(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
prefix=add_prefix("mlp", prefix),
|
||||
)
|
||||
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
|
||||
|
||||
def forward(
|
||||
@@ -242,6 +247,7 @@ class Idefics2Encoder(nn.Module):
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
@@ -251,8 +257,9 @@ class Idefics2Encoder(nn.Module):
|
||||
Idefics2EncoderLayer(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
prefix=add_prefix(f"layers.{i}", prefix),
|
||||
)
|
||||
for _ in range(config.num_hidden_layers)
|
||||
for i in range(config.num_hidden_layers)
|
||||
]
|
||||
)
|
||||
|
||||
@@ -379,13 +386,18 @@ class Idefics2VisionTransformer(nn.Module):
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
embed_dim = config.hidden_size
|
||||
self.config = config
|
||||
self.embeddings = Idefics2VisionEmbeddings(config)
|
||||
self.encoder = Idefics2Encoder(config=config, quant_config=quant_config)
|
||||
self.encoder = Idefics2Encoder(
|
||||
config=config,
|
||||
quant_config=quant_config,
|
||||
prefix=add_prefix("encoder", prefix),
|
||||
)
|
||||
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
|
||||
|
||||
def get_input_embeddings(self):
|
||||
@@ -503,7 +515,7 @@ class BaseResampler(nn.Module):
|
||||
embed_dim,
|
||||
bias=False,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.kv_proj",
|
||||
prefix=add_prefix("kv_proj", prefix),
|
||||
)
|
||||
else:
|
||||
# Maintain the same return value with ReplicatedLinear.forward
|
||||
@@ -660,6 +672,7 @@ class MiniCPMVBaseModel(nn.Module):
|
||||
*,
|
||||
config: PretrainedConfig,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
# All MiniCPM-V models disable `tie_word_embeddings` but
|
||||
@@ -669,8 +682,12 @@ class MiniCPMVBaseModel(nn.Module):
|
||||
self.config = config
|
||||
|
||||
self.version = get_version_by_config(self.config)
|
||||
self.llm = self.init_llm(config=config, quant_config=quant_config)
|
||||
self.vpm = self.init_vision_module(config, quant_config)
|
||||
self.llm = self.init_llm(
|
||||
config=config, quant_config=quant_config, prefix=add_prefix("llm", prefix)
|
||||
)
|
||||
self.vpm = self.init_vision_module(
|
||||
config, quant_config, add_prefix("vpm", prefix)
|
||||
)
|
||||
self.vision_dim = (
|
||||
self.vpm.embed_dim
|
||||
if self.version == (2, 0)
|
||||
@@ -679,7 +696,10 @@ class MiniCPMVBaseModel(nn.Module):
|
||||
self.embed_dim = self.config.hidden_size
|
||||
|
||||
self.resampler = self.init_resampler(
|
||||
self.embed_dim, self.vision_dim, quant_config=quant_config
|
||||
self.embed_dim,
|
||||
self.vision_dim,
|
||||
quant_config=quant_config,
|
||||
prefix=add_prefix("resampler", prefix),
|
||||
)
|
||||
|
||||
self.logits_processor = LogitsProcessor(config)
|
||||
@@ -937,6 +957,7 @@ class MiniCPMVBaseModel(nn.Module):
|
||||
self,
|
||||
config: Qwen2Config,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = "",
|
||||
) -> nn.Module:
|
||||
raise NotImplementedError
|
||||
|
||||
@@ -944,6 +965,7 @@ class MiniCPMVBaseModel(nn.Module):
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
quant_config: Optional[QuantizationConfig],
|
||||
prefix: str = "",
|
||||
) -> nn.Module:
|
||||
raise NotImplementedError
|
||||
|
||||
@@ -952,6 +974,7 @@ class MiniCPMVBaseModel(nn.Module):
|
||||
embed_dim: int,
|
||||
vision_dim: int,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = "",
|
||||
) -> nn.Module:
|
||||
raise NotImplementedError
|
||||
|
||||
@@ -1011,24 +1034,27 @@ class MiniCPMV2_6(MiniCPMVBaseModel):
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__(config=config, quant_config=quant_config)
|
||||
super().__init__(config=config, quant_config=quant_config, prefix=prefix)
|
||||
assert self.version == (2, 6)
|
||||
|
||||
def init_llm(
|
||||
self,
|
||||
config: Qwen2Config,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = "",
|
||||
) -> nn.Module:
|
||||
return Qwen2ForCausalLM(config=config, quant_config=quant_config)
|
||||
return Qwen2ForCausalLM(config=config, quant_config=quant_config, prefix=prefix)
|
||||
|
||||
def init_vision_module(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
quant_config: Optional[QuantizationConfig],
|
||||
prefix: str = "",
|
||||
) -> nn.Module:
|
||||
model = Idefics2VisionTransformer(
|
||||
config=config.vision_config, quant_config=quant_config
|
||||
config=config.vision_config, quant_config=quant_config, prefix=prefix
|
||||
)
|
||||
if self.config.drop_vision_last_layer:
|
||||
model.encoder.layers = model.encoder.layers[:-1]
|
||||
@@ -1042,6 +1068,7 @@ class MiniCPMV2_6(MiniCPMVBaseModel):
|
||||
embed_dim: int,
|
||||
vision_dim: int,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = "",
|
||||
) -> nn.Module:
|
||||
with set_default_torch_dtype(torch.float16):
|
||||
# The resampler in 2.6 remains consistent with the one in 2.5.
|
||||
@@ -1051,6 +1078,7 @@ class MiniCPMV2_6(MiniCPMVBaseModel):
|
||||
num_heads=embed_dim // 128,
|
||||
kv_dim=vision_dim,
|
||||
quant_config=quant_config,
|
||||
prefix=prefix,
|
||||
)
|
||||
|
||||
return resampler.to(device="cuda", dtype=torch.get_default_dtype())
|
||||
@@ -1207,6 +1235,7 @@ class MiniCPMV:
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
@@ -1221,7 +1250,9 @@ class MiniCPMV:
|
||||
raise ValueError("Currently, MiniCPMV only supports versions 2.6")
|
||||
|
||||
try:
|
||||
minicpmv = instance_class(config=config, quant_config=quant_config)
|
||||
minicpmv = instance_class(
|
||||
config=config, quant_config=quant_config, prefix=prefix
|
||||
)
|
||||
self.minicpmv = minicpmv
|
||||
except Exception as e:
|
||||
print(f"Failed to instantiate MiniCPMV: {e}")
|
||||
|
||||
Reference in New Issue
Block a user