Warmup cublas (#566)
This commit is contained in:
@@ -1,6 +1,6 @@
|
|||||||
## Download data
|
## Download data
|
||||||
```
|
```
|
||||||
wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
|
bash download_data.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
## Run benchmark
|
## Run benchmark
|
||||||
|
|||||||
2
benchmark/gsm8k/download_data.sh
Normal file
2
benchmark/gsm8k/download_data.sh
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
# Download the GSM8K train and test splits into the current directory.
for split in train test; do
  wget "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/${split}.jsonl"
done
|
||||||
@@ -1,7 +1,6 @@
|
|||||||
## Download data
|
## Download data
|
||||||
```
|
```
|
||||||
wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
|
bash download_data.sh
|
||||||
tar xf data.tar
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Run benchmark
|
## Run benchmark
|
||||||
|
|||||||
2
benchmark/mmlu/download_data.sh
Normal file
2
benchmark/mmlu/download_data.sh
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
# Fetch the MMLU data archive and unpack it in the current directory.
# Abort on the first failure so a failed or partial download is never extracted.
set -e
wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
tar xf data.tar
|
||||||
@@ -270,6 +270,7 @@ class ModelRunner:
|
|||||||
# Load the model and create memory pool
|
# Load the model and create memory pool
|
||||||
self.load_model()
|
self.load_model()
|
||||||
self.init_memory_pool(total_gpu_memory)
|
self.init_memory_pool(total_gpu_memory)
|
||||||
|
self.init_cublas()
|
||||||
self.init_flash_infer()
|
self.init_flash_infer()
|
||||||
|
|
||||||
def load_model(self):
|
def load_model(self):
|
||||||
@@ -346,6 +347,15 @@ class ModelRunner:
|
|||||||
f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
|
f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def init_cublas(self):
    """Warm up cuBLAS by running one tiny matmul on the CUDA device.

    Performing this multiplication eagerly initializes cuBLAS; skipping it
    leads to errors being raised later on.

    Returns:
        The product of two 16x16 float16 all-ones tensors.
    """
    tensor_kwargs = {"dtype": torch.float16, "device": "cuda"}
    lhs = torch.ones((16, 16), **tensor_kwargs)
    rhs = torch.ones((16, 16), **tensor_kwargs)
    return torch.matmul(lhs, rhs)
|
||||||
|
|
||||||
def init_flash_infer(self):
|
def init_flash_infer(self):
|
||||||
if global_server_args_dict.get("enable_flashinfer", False):
|
if global_server_args_dict.get("enable_flashinfer", False):
|
||||||
from flashinfer import (
|
from flashinfer import (
|
||||||
|
|||||||
@@ -410,7 +410,7 @@ class ModelTpServer:
|
|||||||
self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
|
self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
|
||||||
)
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
f"[gpu_id={self.gpu_id}] Prefil batch. "
|
f"[gpu_id={self.gpu_id}] Prefill batch. "
|
||||||
f"#new-seq: {len(can_run_list)}, "
|
f"#new-seq: {len(can_run_list)}, "
|
||||||
f"#new-token: {new_batch_input_tokens}, "
|
f"#new-token: {new_batch_input_tokens}, "
|
||||||
f"#cached-token: {hit_tokens}, "
|
f"#cached-token: {hit_tokens}, "
|
||||||
|
|||||||
Reference in New Issue
Block a user