From a385ee27bd0025781eba61578889e470a1c027fb Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Tue, 25 Jun 2024 12:46:00 -0700
Subject: [PATCH] Warmup cublas (#566)

---
 benchmark/gsm8k/README.md                             |  2 +-
 benchmark/gsm8k/download_data.sh                      |  2 ++
 benchmark/mmlu/README.md                              |  3 +--
 benchmark/mmlu/download_data.sh                       |  2 ++
 python/sglang/srt/managers/controller/model_runner.py | 10 ++++++++++
 python/sglang/srt/managers/controller/tp_worker.py    |  2 +-
 6 files changed, 17 insertions(+), 4 deletions(-)
 create mode 100644 benchmark/gsm8k/download_data.sh
 create mode 100644 benchmark/mmlu/download_data.sh

diff --git a/benchmark/gsm8k/README.md b/benchmark/gsm8k/README.md
index cb68d269d..a7dc04d9a 100644
--- a/benchmark/gsm8k/README.md
+++ b/benchmark/gsm8k/README.md
@@ -1,6 +1,6 @@
 ## Download data
 ```
-wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
+bash download_data.sh
 ```
 
 ## Run benchmark
diff --git a/benchmark/gsm8k/download_data.sh b/benchmark/gsm8k/download_data.sh
new file mode 100644
index 000000000..a9aa7756d
--- /dev/null
+++ b/benchmark/gsm8k/download_data.sh
@@ -0,0 +1,2 @@
+wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/train.jsonl
+wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
\ No newline at end of file
diff --git a/benchmark/mmlu/README.md b/benchmark/mmlu/README.md
index 9aa01d617..16de20cda 100644
--- a/benchmark/mmlu/README.md
+++ b/benchmark/mmlu/README.md
@@ -1,7 +1,6 @@
 ## Download data
 ```
-wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
-tar xf data.tar
+bash download_data.sh
 ```
 
 ## Run benchmark
diff --git a/benchmark/mmlu/download_data.sh b/benchmark/mmlu/download_data.sh
new file mode 100644
index 000000000..7a7471776
--- /dev/null
+++ b/benchmark/mmlu/download_data.sh
@@ -0,0 +1,2 @@
+wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
+tar xf data.tar
\ No newline at end of file
diff --git a/python/sglang/srt/managers/controller/model_runner.py b/python/sglang/srt/managers/controller/model_runner.py
index 84ecd98fe..4b4add62b 100644
--- a/python/sglang/srt/managers/controller/model_runner.py
+++ b/python/sglang/srt/managers/controller/model_runner.py
@@ -270,6 +270,7 @@ class ModelRunner:
         # Load the model and create memory pool
         self.load_model()
         self.init_memory_pool(total_gpu_memory)
+        self.init_cublas()
         self.init_flash_infer()
 
     def load_model(self):
@@ -346,6 +347,15 @@ class ModelRunner:
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
         )
 
+    def init_cublas(self):
+        """Initialize cuBLAS up front by running one small matmul; without this warm-up, errors are raised later."""
+        dtype = torch.float16
+        device = "cuda"
+        a = torch.ones((16, 16), dtype=dtype, device=device)
+        b = torch.ones((16, 16), dtype=dtype, device=device)
+        c = a @ b  # this matmul is the warm-up itself
+        return c  # product of the warm-up matmul; callers may ignore it
+
     def init_flash_infer(self):
         if global_server_args_dict.get("enable_flashinfer", False):
             from flashinfer import (
diff --git a/python/sglang/srt/managers/controller/tp_worker.py b/python/sglang/srt/managers/controller/tp_worker.py
index 82ddb6e48..61a6a8ea4 100644
--- a/python/sglang/srt/managers/controller/tp_worker.py
+++ b/python/sglang/srt/managers/controller/tp_worker.py
@@ -410,7 +410,7 @@ class ModelTpServer:
                 self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
             )
             logger.info(
-                f"[gpu_id={self.gpu_id}] Prefil batch. "
+                f"[gpu_id={self.gpu_id}] Prefill batch. "
                 f"#new-seq: {len(can_run_list)}, "
                 f"#new-token: {new_batch_input_tokens}, "
                 f"#cached-token: {hit_tokens}, "