From a385ee27bd0025781eba61578889e470a1c027fb Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 25 Jun 2024 12:46:00 -0700 Subject: [PATCH] Warmup cublas (#566) --- benchmark/gsm8k/README.md | 2 +- benchmark/gsm8k/download_data.sh | 2 ++ benchmark/mmlu/README.md | 3 +-- benchmark/mmlu/download_data.sh | 2 ++ python/sglang/srt/managers/controller/model_runner.py | 10 ++++++++++ python/sglang/srt/managers/controller/tp_worker.py | 2 +- 6 files changed, 17 insertions(+), 4 deletions(-) create mode 100644 benchmark/gsm8k/download_data.sh create mode 100644 benchmark/mmlu/download_data.sh diff --git a/benchmark/gsm8k/README.md b/benchmark/gsm8k/README.md index cb68d269d..a7dc04d9a 100644 --- a/benchmark/gsm8k/README.md +++ b/benchmark/gsm8k/README.md @@ -1,6 +1,6 @@ ## Download data ``` -wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl +bash download_data.sh ``` ## Run benchmark diff --git a/benchmark/gsm8k/download_data.sh b/benchmark/gsm8k/download_data.sh new file mode 100644 index 000000000..a9aa7756d --- /dev/null +++ b/benchmark/gsm8k/download_data.sh @@ -0,0 +1,2 @@ +wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/train.jsonl +wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl \ No newline at end of file diff --git a/benchmark/mmlu/README.md b/benchmark/mmlu/README.md index 9aa01d617..16de20cda 100644 --- a/benchmark/mmlu/README.md +++ b/benchmark/mmlu/README.md @@ -1,7 +1,6 @@ ## Download data ``` -wget https://people.eecs.berkeley.edu/~hendrycks/data.tar -tar xf data.tar +bash download_data.sh ``` ## Run benchmark diff --git a/benchmark/mmlu/download_data.sh b/benchmark/mmlu/download_data.sh new file mode 100644 index 000000000..7a7471776 --- /dev/null +++ b/benchmark/mmlu/download_data.sh @@ -0,0 +1,2 @@ +wget https://people.eecs.berkeley.edu/~hendrycks/data.tar +tar xf data.tar \ No newline at end of file diff --git a/python/sglang/srt/managers/controller/model_runner.py b/python/sglang/srt/managers/controller/model_runner.py index 84ecd98fe..4b4add62b 100644 --- a/python/sglang/srt/managers/controller/model_runner.py +++ b/python/sglang/srt/managers/controller/model_runner.py @@ -270,6 +270,7 @@ class ModelRunner: # Load the model and create memory pool self.load_model() self.init_memory_pool(total_gpu_memory) + self.init_cublas() self.init_flash_infer() def load_model(self): @@ -346,6 +347,15 @@ class ModelRunner: f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB" ) + def init_cublas(self): + """We need to run a small matmul to init cublas. Otherwise, it will raise some errors later.""" + dtype = torch.float16 + device = "cuda" + a = torch.ones((16, 16), dtype=dtype, device=device) + b = torch.ones((16, 16), dtype=dtype, device=device) + c = a @ b + return c + def init_flash_infer(self): if global_server_args_dict.get("enable_flashinfer", False): from flashinfer import ( diff --git a/python/sglang/srt/managers/controller/tp_worker.py b/python/sglang/srt/managers/controller/tp_worker.py index 82ddb6e48..61a6a8ea4 100644 --- a/python/sglang/srt/managers/controller/tp_worker.py +++ b/python/sglang/srt/managers/controller/tp_worker.py @@ -410,7 +410,7 @@ class ModelTpServer: self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"] ) logger.info( - f"[gpu_id={self.gpu_id}] Prefil batch. " + f"[gpu_id={self.gpu_id}] Prefill batch. " f"#new-seq: {len(can_run_list)}, " f"#new-token: {new_batch_input_tokens}, " f"#cached-token: {hit_tokens}, "