diff --git a/examples/frontend_language/usage/llava_video/srt_example_llava_v.py b/examples/frontend_language/usage/llava_video/srt_example_llava_v.py
index 266d4c2ba..bc56d4210 100644
--- a/examples/frontend_language/usage/llava_video/srt_example_llava_v.py
+++ b/examples/frontend_language/usage/llava_video/srt_example_llava_v.py
@@ -208,7 +208,7 @@ if __name__ == "__main__":
         model_override_args["image_token_index"] = 64002
     if args.num_frames == 32:
-        model_override_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
+        model_override_args["rope_scaling"] = {"factor": 2.0, "rope_type": "linear"}
         model_override_args["max_sequence_length"] = 4096 * 2
         model_override_args["tokenizer_model_max_length"] = 4096 * 2
     elif args.num_frames < 32:
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 83b73b56f..df6236162 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -26,7 +26,7 @@ runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hu
     "outlines>=0.0.44", "modelscope"]
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
-srt = ["sglang[runtime_common]", "torch", "vllm==0.5.5"]
+srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]
 srt_xpu = ["sglang[runtime_common]"]

 openai = ["openai>=1.0", "tiktoken"]
diff --git a/python/sglang/launch_server_llavavid.py b/python/sglang/launch_server_llavavid.py
index 6816dcc11..138c2127e 100644
--- a/python/sglang/launch_server_llavavid.py
+++ b/python/sglang/launch_server_llavavid.py
@@ -14,7 +14,7 @@ if __name__ == "__main__":
     model_override_args["num_frames"] = 16
     model_override_args["model_type"] = "llavavid"
     if model_override_args["num_frames"] == 32:
-        model_override_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
+        model_override_args["rope_scaling"] = {"factor": 2.0, "rope_type": "linear"}
         model_override_args["max_sequence_length"] = 4096 * 2
         model_override_args["tokenizer_model_max_length"] = 4096 * 2
         model_override_args["model_max_length"] = 4096 * 2
diff --git a/python/sglang/srt/layers/linear.py b/python/sglang/srt/layers/linear.py
index e61dccd6a..095164e1a 100644
--- a/python/sglang/srt/layers/linear.py
+++ b/python/sglang/srt/layers/linear.py
@@ -20,8 +20,10 @@ from vllm.distributed import (
 from vllm.model_executor.layers.linear import LinearBase
 from vllm.model_executor.parameter import (
     BasevLLMParameter,
+    PackedColumnParameter,
     PackedvLLMParameter,
     PerTensorScaleParameter,
+    RowvLLMParameter,
 )

 from sglang.srt.layers.quantization.base_config import (
@@ -39,6 +41,7 @@ WEIGHT_LOADER_V2_SUPPORTED = [
     "GPTQMarlinLinearMethod",
     "Fp8LinearMethod",
     "MarlinLinearMethod",
+    "GPTQLinearMethod",
 ]


@@ -50,7 +53,7 @@ def adjust_marlin_shard(param, shard_size, shard_offset):
     return shard_size * marlin_tile_size, shard_offset * marlin_tile_size


-def adjust_bitsandbytes_shard(
+def adjust_bitsandbytes_4bit_shard(
     param: Parameter, qkv_offsets: Dict[str, Tuple[int, int]], loaded_shard_id: str
 ) -> Tuple[int, int]:
     """Adjust the quantization offsets and sizes for BitsAndBytes sharding."""
@@ -207,7 +210,6 @@ class ReplicatedLinear(LinearBase):
             self.output_size,
             self.params_dtype,
             weight_loader=self.weight_loader,
-            prefix=prefix,
         )

         if bias:
@@ -315,7 +317,6 @@ class ColumnParallelLinear(LinearBase):
                 if self.quant_method.__class__.__name__ in WEIGHT_LOADER_V2_SUPPORTED
                 else self.weight_loader
             ),
-            prefix=prefix,
         )
         if bias:
             self.bias = Parameter(
@@ -345,8 +346,12 @@ class ColumnParallelLinear(LinearBase):
         if is_gguf_weight and isinstance(param, UninitializedParameter):
             param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype)

+        use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+
         param_data = param.data
-        if output_dim is not None:
+        # bitsandbytes loads the weights of the specific portion
+        # no need to narrow here
+        if output_dim is not None and not use_bitsandbytes_4bit:
             shard_size = param_data.shape[output_dim]
             start_idx = tp_rank * shard_size
             loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
@@ -454,17 +459,22 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
             return

-        if is_gguf_weight and isinstance(param, UninitializedParameter):
-            from gguf.constants import GGML_QUANT_SIZES
+        if is_gguf_weight:
+            tp_size = get_tensor_model_parallel_world_size()
+            tp_rank = get_tensor_model_parallel_rank()

-            ori_shape = param.tensor_shape
-            weight_types = self.qweight_type.shard_weight_type.values()
-            row_size = []
-            for weight_type in weight_types:
-                block_size, type_size = GGML_QUANT_SIZES[weight_type]
-                row_size.append(ori_shape[1] // block_size * type_size)
-            q_shape = (ori_shape[0], max(row_size))
-            param.materialize(q_shape, dtype=loaded_weight.dtype)
+            output_dim = getattr(param, "output_dim", None)
+            shard_size = loaded_weight.size(output_dim) // tp_size
+            start_idx = tp_rank * shard_size
+
+            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+
+            param.shard_id.append(loaded_shard_id)
+            param.shard_id_map[loaded_shard_id] = len(param.data_container)
+            param.data_container.append(loaded_weight)
+            if len(param.data_container) == 2:
+                self.qweight = param.materialize_nested()
+            return

         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
@@ -526,26 +536,17 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                     param, shard_size, shard_offset
                 )

-            use_bitsandbytes = getattr(param, "use_bitsandbytes", False)
-            if use_bitsandbytes:
+            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+            if use_bitsandbytes_4bit:
                 shard_size = loaded_weight.shape[output_dim]
                 shard_offset = loaded_weight.shape[output_dim] * loaded_shard_id

-            if is_gguf_weight:
-                tp_size = get_tensor_model_parallel_world_size()
-                output_dim = getattr(param, "output_dim", None)
-                shard_shape = list(loaded_weight.shape)
-                shard_shape[output_dim] = shard_shape[output_dim] // tp_size
-                param.shard_id.append(loaded_shard_id)
-                param.shard_size[loaded_shard_id] = shard_shape
-
-                input_dim = getattr(param, "input_dim", None)
-                input_size = loaded_weight.shape[input_dim]
-                param_data = param_data.narrow(input_dim, 0, input_size)
-
             param_data = param_data.narrow(output_dim, shard_offset, shard_size)
             start_idx = tp_rank * shard_size
-            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+            # bitsandbytes loads the weights of the specific portion
+            # no need to narrow here
+            if not use_bitsandbytes_4bit:
+                loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
         # Special case for AQLM codebooks.
         elif is_metadata:
             # metadata indicates fixed size concatenated along dim 0
@@ -595,7 +596,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             # If quantized, we need to adjust the offset and size to account
             # for the packing.
             if (
-                isinstance(param, PackedvLLMParameter)
+                isinstance(param, (PackedColumnParameter, PackedvLLMParameter))
                 and param.packed_dim == param.output_dim
             ):
                 shard_size, shard_offset = param.adjust_shard_indexes_for_packing(
@@ -617,7 +618,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             if isinstance(param, PerTensorScaleParameter):
                 param.load_merged_column_weight(loaded_weight=loaded_weight, shard_id=0)
                 return
-            elif type(param) is BasevLLMParameter:
+            elif type(param) in (RowvLLMParameter, BasevLLMParameter):
                 param.load_merged_column_weight(loaded_weight=loaded_weight)
                 return
             self._load_fused_module_from_checkpoint(param, loaded_weight)
@@ -760,7 +761,7 @@ class QKVParallelLinear(ColumnParallelLinear):
             # If quantized, we need to adjust the offset and size to account
             # for the packing.
             if (
-                isinstance(param, PackedvLLMParameter)
+                isinstance(param, (PackedColumnParameter, PackedvLLMParameter))
                 and param.packed_dim == param.output_dim
             ):
                 shard_size, shard_offset = param.adjust_shard_indexes_for_packing(
@@ -780,10 +781,10 @@ class QKVParallelLinear(ColumnParallelLinear):
     ):
         if loaded_shard_id is None:  # special case for certain models
             if isinstance(param, PerTensorScaleParameter):
-                param.load_merged_column_weight(loaded_weight=loaded_weight, shard_id=0)
+                param.load_qkv_weight(loaded_weight=loaded_weight, shard_id=0)
                 return
-            elif type(param) is BasevLLMParameter:
-                param.load_merged_column_weight(loaded_weight=loaded_weight)
+            elif type(param) in (RowvLLMParameter, BasevLLMParameter):
+                param.load_qkv_weight(loaded_weight=loaded_weight)
                 return
             self._load_fused_module_from_checkpoint(param, loaded_weight)
             return
@@ -818,17 +819,22 @@ class QKVParallelLinear(ColumnParallelLinear):
             param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
             return

-        if is_gguf_weight and isinstance(param, UninitializedParameter):
-            from gguf.constants import GGML_QUANT_SIZES
+        if is_gguf_weight:
+            tp_size = get_tensor_model_parallel_world_size()
+            tp_rank = get_tensor_model_parallel_rank()

-            ori_shape = param.tensor_shape
-            weight_types = self.qweight_type.shard_weight_type.values()
-            row_size = []
-            for weight_type in weight_types:
-                block_size, type_size = GGML_QUANT_SIZES[weight_type]
-                row_size.append(ori_shape[1] // block_size * type_size)
-            q_shape = (ori_shape[0], max(row_size))
-            param.materialize(q_shape, dtype=loaded_weight.dtype)
+            output_dim = getattr(param, "output_dim", None)
+            shard_size = loaded_weight.size(output_dim) // tp_size
+            start_idx = tp_rank * shard_size
+
+            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+
+            param.shard_id.append(loaded_shard_id)
+            param.shard_id_map[loaded_shard_id] = len(param.data_container)
+            param.data_container.append(loaded_weight)
+            if len(param.data_container) == 3:
+                self.qweight = param.materialize_nested()
+            return

         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
@@ -863,6 +869,8 @@ class QKVParallelLinear(ColumnParallelLinear):
                     self.total_num_kv_heads * self.head_size,
                 ),
             ]
+            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+            packed_dim = getattr(param, "packed_dim", None)
             for shard_id, shard_offset, shard_size in shard_offsets:
                 # Special case for Quantized Weights.
@@ -877,6 +885,29 @@ class QKVParallelLinear(ColumnParallelLinear):
                         param, shard_size, shard_offset
                     )

+                if use_bitsandbytes_4bit:
+                    orig_qkv_offsets = {
+                        "q": (0, self.total_num_heads * self.head_size),
+                        "k": (
+                            self.total_num_heads * self.head_size,
+                            self.total_num_kv_heads * self.head_size,
+                        ),
+                        "v": (
+                            (self.total_num_heads + self.total_num_kv_heads)
+                            * self.head_size,
+                            self.total_num_kv_heads * self.head_size,
+                        ),
+                        "total": (
+                            (self.total_num_heads + 2 * self.total_num_kv_heads)
+                            * self.head_size,
+                            0,
+                        ),
+                    }
+
+                    shard_size, shard_offset = adjust_bitsandbytes_4bit_shard(
+                        param, orig_qkv_offsets, shard_id
+                    )
+
                 loaded_weight_shard = loaded_weight.narrow(
                     output_dim, shard_offset, shard_size
                 )
@@ -910,8 +941,8 @@ class QKVParallelLinear(ColumnParallelLinear):
                     param, shard_size, shard_offset
                 )

-            use_bitsandbytes = getattr(param, "use_bitsandbytes", False)
-            if use_bitsandbytes:
+            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+            if use_bitsandbytes_4bit:
                 orig_qkv_offsets = {
                     "q": (0, self.num_heads * self.head_size),
                     "k": (
@@ -927,29 +958,22 @@ class QKVParallelLinear(ColumnParallelLinear):
                         0,
                     ),
                 }
-                shard_size, shard_offset = adjust_bitsandbytes_shard(
+                shard_size, shard_offset = adjust_bitsandbytes_4bit_shard(
                     param, orig_qkv_offsets, loaded_shard_id
                 )

-            if is_gguf_weight:
-                tp_size = get_tensor_model_parallel_world_size()
-                output_dim = getattr(param, "output_dim", None)
-                shard_shape = list(loaded_weight.shape)
-                shard_shape[output_dim] = shard_shape[output_dim] // tp_size
-                param.shard_id.append(loaded_shard_id)
-                param.shard_size[loaded_shard_id] = shard_shape
-
-                input_dim = getattr(param, "input_dim", None)
-                input_size = loaded_weight.shape[input_dim]
-                param_data = param_data.narrow(input_dim, 0, input_size)
-
             param_data = param_data.narrow(output_dim, shard_offset, shard_size)
             if loaded_shard_id == "q":
                 shard_id = tp_rank
             else:
                 shard_id = tp_rank // self.num_kv_head_replicas
             start_idx = shard_id * shard_size
-            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+
+            # bitsandbytes loads the weights of the specific portion
+            # no need to narrow here
+            if not use_bitsandbytes_4bit:
+                loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+
         # Special case for for AQLM codebooks.
         elif is_metadata:
             # metadata indicates fixed size concatenated along dim 0
@@ -1037,7 +1061,6 @@ class RowParallelLinear(LinearBase):
                 if self.quant_method.__class__.__name__ in WEIGHT_LOADER_V2_SUPPORTED
                 else self.weight_loader
             ),
-            prefix=prefix,
         )
         if not reduce_results and (bias and not skip_bias_add):
             raise ValueError(
@@ -1061,6 +1084,7 @@ class RowParallelLinear(LinearBase):
         tp_rank = get_tensor_model_parallel_rank()
         tp_size = get_tensor_model_parallel_world_size()
         input_dim = getattr(param, "input_dim", None)
+        use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)

         # Special case for GGUF
         is_gguf_weight = getattr(param, "is_gguf_weight", False)
@@ -1076,7 +1100,9 @@ class RowParallelLinear(LinearBase):
             param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype)

         param_data = param.data
-        if input_dim is not None:
+        # bitsandbytes loads the weights of the specific portion
+        # no need to narrow here
+        if input_dim is not None and not use_bitsandbytes_4bit:
             shard_size = param_data.shape[input_dim]
             start_idx = tp_rank * shard_size
             loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size)
diff --git a/python/sglang/srt/lora/lora.py b/python/sglang/srt/lora/lora.py
index 85470996f..849cd1092 100644
--- a/python/sglang/srt/lora/lora.py
+++ b/python/sglang/srt/lora/lora.py
@@ -351,7 +351,9 @@ class LoRAAdapter(nn.Module):
         loader = DefaultModelLoader(self.load_config)
         revision = getattr(self.config.hf_config, "revision", None)
         for name, loaded_weight in loader._get_weights_iterator(
-            model_path, revision=revision, fall_back_to_pt=True
+            DefaultModelLoader.Source(
+                model_path, revision=revision, fall_back_to_pt=True
+            )
         ):
             match = re.search(r"layers\.(\d+)\.", name)
             if match is not None:
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index a8e64205b..d73a5ca03 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -59,8 +59,11 @@ from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import (
     enable_show_time_cost,
     get_available_gpu_memory,
+    is_attention_free_model,
+    is_embedding_model,
     is_generation_model,
     is_multimodal_model,
+    model_has_inner_state,
     monkey_patch_vllm_dummy_weight_loader,
     monkey_patch_vllm_p2p_access_check,
 )
@@ -316,11 +319,13 @@ class ModelRunner:

         def get_weight_iter(config):
             iter = loader._get_weights_iterator(
-                config.model,
-                config.revision,
-                fall_back_to_pt=getattr(
-                    self.model, "fall_back_to_pt_during_load", True
-                ),
+                DefaultModelLoader.Source(
+                    config.model,
+                    revision=config.revision,
+                    fall_back_to_pt=getattr(
+                        self.model, "fall_back_to_pt_during_load", True
+                    ),
+                )
             )
             return iter
@@ -662,3 +667,7 @@ def load_model_cls_srt(model_arch: str) -> Optional[Type[nn.Module]]:

 # Monkey patch model loader
 setattr(ModelRegistry, "_try_load_model_cls", load_model_cls_srt)
+setattr(ModelRegistry, "is_multimodal_model", is_multimodal_model)
+setattr(ModelRegistry, "is_attention_free_model", is_attention_free_model)
+setattr(ModelRegistry, "model_has_inner_state", model_has_inner_state)
+setattr(ModelRegistry, "is_embedding_model", is_embedding_model)
diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
index 8889eed64..f92be4d96 100644
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -250,7 +250,7 @@ class DeepseekV2Attention(nn.Module):
             bias=False,
             quant_config=quant_config,
         )
-        rope_scaling["type"] = "deepseek_yarn"
"deepseek_yarn" + rope_scaling["rope_type"] = "deepseek_yarn" self.rotary_emb = get_rope( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, @@ -398,7 +398,7 @@ class DeepseekV2AttentionMLA(nn.Module): bias=False, quant_config=quant_config, ) - rope_scaling["type"] = "deepseek_yarn" + rope_scaling["rope_type"] = "deepseek_yarn" self.rotary_emb = get_rope( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 11ce25940..446549b23 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -215,6 +215,26 @@ def is_multimodal_model(model_architectures): return False +def is_attention_free_model(model_architectures): + return False + + +def model_has_inner_state(model_architectures): + return False + + +def is_embedding_model(model_architectures): + if ( + "LlamaEmbeddingModel" in model_architectures + or "MistralModel" in model_architectures + or "LlamaForSequenceClassification" in model_architectures + or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures + ): + return True + else: + return False + + def is_generation_model(model_architectures, is_embedding: bool = False): # We have two ways to determine whether a model is a generative model. # 1. Check the model architectue diff --git a/test/srt/test_server_args.py b/test/srt/test_server_args.py index d8f31ce1b..de5c0f1bc 100644 --- a/test/srt/test_server_args.py +++ b/test/srt/test_server_args.py @@ -11,13 +11,13 @@ class TestPrepareServerArgs(unittest.TestCase): "--model-path", "model_path", "--json-model-override-args", - '{"rope_scaling": {"factor": 2.0, "type": "linear"}}', + '{"rope_scaling": {"factor": 2.0, "rope_type": "linear"}}', ] ) self.assertEqual(server_args.model_path, "model_path") self.assertEqual( json.loads(server_args.json_model_override_args), - {"rope_scaling": {"factor": 2.0, "type": "linear"}}, + {"rope_scaling": {"factor": 2.0, "rope_type": "linear"}}, )