Fix the default chunked prefill size (#2268)

Lianmin Zheng
2024-11-29 16:03:32 -08:00
committed by GitHub
parent 262e370f78
commit 94e167ea5a
3 changed files with 29 additions and 20 deletions


@@ -118,7 +118,7 @@ class ModelRunner:
             logger.info(
                 "Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
             )
-            server_args.chunked_prefill_size = None
+            server_args.chunked_prefill_size = -1
             self.mem_fraction_static *= 0.95
             # TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
             if self.model_config.hf_config.architectures == [
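
For context: the hunk above swaps the sentinel that disables chunked prefill from None to -1. The checks that consume this flag are not part of this diff, but an integer sentinel keeps them purely numeric, whereas None would raise a TypeError in a comparison such as size <= 0. A minimal sketch of the convention, with a hypothetical helper name:

def chunked_prefill_enabled(chunked_prefill_size: int) -> bool:
    # Hypothetical helper: -1 (or any non-positive value) means "disabled",
    # so a single integer comparison replaces a separate None check.
    return chunked_prefill_size > 0

assert not chunked_prefill_enabled(-1)  # disabled, e.g. for multimodal models
assert chunked_prefill_enabled(4096)    # any positive chunk size enables it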
@@ -148,12 +148,14 @@ class ModelRunner:
         set_cpu_offload_max_bytes(int(server_args.cpu_offload_gb * 1024**3))

         # Init components
+        # Get memory before model loading
         min_per_gpu_memory = self.init_torch_distributed()

+        # Load the model
         self.sampler = Sampler()
         self.load_model()

-        # Apply torch TP if model supports it
+        # Apply torch TP if the model supports it
         supports_torch_tp = getattr(self.model, "supports_torch_tp", False)
         if self.tp_size > 1 and supports_torch_tp:
             self.apply_torch_tp()
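
The comments added above pin down an ordering constraint: free GPU memory is measured before the weights are loaded, so the later memory-pool sizing can account for the model's footprint. A rough sketch of that measurement, assuming a CUDA device is available (torch.cuda.mem_get_info returns free and total bytes; the surrounding flow is illustrative, not the code in this file):

import torch

def free_gpu_memory_gib(device: int = 0) -> float:
    # Snapshot of currently free memory on one GPU, in GiB.
    free_bytes, _total_bytes = torch.cuda.mem_get_info(device)
    return free_bytes / 1024**3

# Illustrative ordering mirroring the diff (load_model is a placeholder):
#   before = free_gpu_memory_gib()  # taken before weights occupy the GPU
#   load_model()
#   after = free_gpu_memory_gib()   # before - after approximates the weight
#                                   # footprint; the rest can be budgeted to
#                                   # the KV-cache memory pool.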
@@ -161,6 +163,7 @@ class ModelRunner:
         else:
             self.torch_tp_applied = False

+        # Init memory pool and attention backends
         if server_args.lora_paths is not None:
             self.init_lora_manager()
         self.init_memory_pool(