Fix model loader for more quantization formats (#2448)
This commit is contained in:
@@ -294,6 +294,28 @@ class LlamaModel(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class LlamaForCausalLM(nn.Module):
|
class LlamaForCausalLM(nn.Module):
|
||||||
|
|
||||||
|
# bitsandbytes-specific attributes
|
||||||
|
default_bitsandbytes_target_modules = [
|
||||||
|
".gate_proj.",
|
||||||
|
".down_proj.",
|
||||||
|
".up_proj.",
|
||||||
|
".q_proj.",
|
||||||
|
".k_proj.",
|
||||||
|
".v_proj.",
|
||||||
|
".o_proj.",
|
||||||
|
]
|
||||||
|
# in TP, these weights are partitioned along the column dimension (dim=-1)
|
||||||
|
column_parallel_weights_modules = [".down_proj.", ".o_proj."]
|
||||||
|
bitsandbytes_stacked_params_mapping = {
|
||||||
|
# shard_name: (weight_name, index)
|
||||||
|
"q_proj": ("qkv_proj", 0),
|
||||||
|
"k_proj": ("qkv_proj", 1),
|
||||||
|
"v_proj": ("qkv_proj", 2),
|
||||||
|
"gate_proj": ("gate_up_proj", 0),
|
||||||
|
"up_proj": ("gate_up_proj", 1),
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
config: LlamaConfig,
|
config: LlamaConfig,
|
||||||
|
|||||||
@@ -267,6 +267,26 @@ class Qwen2Model(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class Qwen2ForCausalLM(nn.Module):
|
class Qwen2ForCausalLM(nn.Module):
|
||||||
|
|
||||||
|
# bitsandbytes-specific attributes
|
||||||
|
default_bitsandbytes_target_modules = [
|
||||||
|
".gate_proj.",
|
||||||
|
".down_proj.",
|
||||||
|
".up_proj.",
|
||||||
|
".q_proj.",
|
||||||
|
".k_proj.",
|
||||||
|
".v_proj.",
|
||||||
|
".o_proj.",
|
||||||
|
]
|
||||||
|
bitsandbytes_stacked_params_mapping = {
|
||||||
|
# shard_name: (weight_name, index)
|
||||||
|
"q_proj": ("qkv_proj", 0),
|
||||||
|
"k_proj": ("qkv_proj", 1),
|
||||||
|
"v_proj": ("qkv_proj", 2),
|
||||||
|
"gate_proj": ("gate_up_proj", 0),
|
||||||
|
"up_proj": ("gate_up_proj", 1),
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
config: Qwen2Config,
|
config: Qwen2Config,
|
||||||
|
|||||||
@@ -283,7 +283,15 @@ class ServerArgs:
|
|||||||
"--load-format",
|
"--load-format",
|
||||||
type=str,
|
type=str,
|
||||||
default=ServerArgs.load_format,
|
default=ServerArgs.load_format,
|
||||||
choices=["auto", "pt", "safetensors", "npcache", "dummy", "gguf"],
|
choices=[
|
||||||
|
"auto",
|
||||||
|
"pt",
|
||||||
|
"safetensors",
|
||||||
|
"npcache",
|
||||||
|
"dummy",
|
||||||
|
"gguf",
|
||||||
|
"bitsandbytes",
|
||||||
|
],
|
||||||
help="The format of the model weights to load. "
|
help="The format of the model weights to load. "
|
||||||
'"auto" will try to load the weights in the safetensors format '
|
'"auto" will try to load the weights in the safetensors format '
|
||||||
"and fall back to the pytorch bin format if safetensors format "
|
"and fall back to the pytorch bin format if safetensors format "
|
||||||
@@ -294,7 +302,9 @@ class ServerArgs:
|
|||||||
"a numpy cache to speed up the loading. "
|
"a numpy cache to speed up the loading. "
|
||||||
'"dummy" will initialize the weights with random values, '
|
'"dummy" will initialize the weights with random values, '
|
||||||
"which is mainly for profiling."
|
"which is mainly for profiling."
|
||||||
'"gguf" will load the weights in the gguf format. ',
|
'"gguf" will load the weights in the gguf format. '
|
||||||
|
'"bitsandbytes" will load the weights using bitsandbytes '
|
||||||
|
"quantization.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--trust-remote-code",
|
"--trust-remote-code",
|
||||||
|
|||||||
Reference in New Issue
Block a user