diff --git a/benchmark/gsm8k/bench_sglang.py b/benchmark/gsm8k/bench_sglang.py index 652086f91..d9d4b0ab2 100644 --- a/benchmark/gsm8k/bench_sglang.py +++ b/benchmark/gsm8k/bench_sglang.py @@ -64,7 +64,7 @@ def main(args): @sgl.function def few_shot_gsm8k(s, question): s += few_shot_examples + question - s += sgl.gen("answer", max_tokens=512, stop="Question") + s += sgl.gen("answer", max_tokens=512, stop=["Question", "Assistant:"]) ##################################### ########## SGL Program End ########## diff --git a/python/sglang/global_config.py b/python/sglang/global_config.py index b02ce9f81..d5f16e2ae 100644 --- a/python/sglang/global_config.py +++ b/python/sglang/global_config.py @@ -27,7 +27,7 @@ class GlobalConfig: # Runtime constants: others self.num_continue_decode_steps = 10 self.retract_decode_steps = 20 - self.flashinfer_workspace_size = 192 * 1024 * 1024 + self.flashinfer_workspace_size = 384 * 1024 * 1024 # Output tokenization configs self.skip_special_tokens_in_output = True diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index ed26322c3..3d4e5d4c6 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -120,13 +120,13 @@ class CudaGraphRunner: ) if model_runner.sliding_window_size is None: self.flashinfer_workspace_buffer = ( - self.model_runner.flashinfer_workspace_buffers[0] + self.model_runner.flashinfer_workspace_buffer ) else: - self.flashinfer_workspace_buffers = [ - self.model_runner.flashinfer_workspace_buffers[0], - self.model_runner.flashinfer_workspace_buffers[2], - ] + self.flashinfer_workspace_buffer = ( + self.model_runner.flashinfer_workspace_buffer + ) + self.flashinfer_kv_indptr = [ self.flashinfer_kv_indptr, self.flashinfer_kv_indptr.clone(), @@ -200,7 +200,7 @@ class CudaGraphRunner: for i in range(2): flashinfer_decode_wrapper.append( BatchDecodeWithPagedKVCacheWrapper( - 
self.flashinfer_workspace_buffers[i], + self.flashinfer_workspace_buffer, "NHD", use_cuda_graph=True, use_tensor_cores=use_tensor_cores, diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 0a7483423..6826bf1a4 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -318,28 +318,26 @@ class ModelRunner: use_tensor_cores = False if self.sliding_window_size is None: - self.flashinfer_workspace_buffers = torch.empty( - 2, + self.flashinfer_workspace_buffer = torch.empty( global_config.flashinfer_workspace_size, dtype=torch.uint8, device="cuda", ) self.flashinfer_prefill_wrapper_ragged = ( BatchPrefillWithRaggedKVCacheWrapper( - self.flashinfer_workspace_buffers[0], "NHD" + self.flashinfer_workspace_buffer, "NHD" ) ) self.flashinfer_prefill_wrapper_paged = BatchPrefillWithPagedKVCacheWrapper( - self.flashinfer_workspace_buffers[1], "NHD" + self.flashinfer_workspace_buffer, "NHD" ) self.flashinfer_decode_wrapper = BatchDecodeWithPagedKVCacheWrapper( - self.flashinfer_workspace_buffers[0], + self.flashinfer_workspace_buffer, "NHD", use_tensor_cores=use_tensor_cores, ) else: - self.flashinfer_workspace_buffers = torch.empty( - 4, + self.flashinfer_workspace_buffer = torch.empty( global_config.flashinfer_workspace_size, dtype=torch.uint8, device="cuda", ) @@ -350,17 +348,17 @@ class ModelRunner: for i in range(2): self.flashinfer_prefill_wrapper_ragged.append( BatchPrefillWithRaggedKVCacheWrapper( - self.flashinfer_workspace_buffers[2 * i + 0], "NHD" + self.flashinfer_workspace_buffer, "NHD" ) ) self.flashinfer_prefill_wrapper_paged.append( BatchPrefillWithPagedKVCacheWrapper( - self.flashinfer_workspace_buffers[2 * i + 1], "NHD" + self.flashinfer_workspace_buffer, "NHD" ) ) self.flashinfer_decode_wrapper.append( BatchDecodeWithPagedKVCacheWrapper( - self.flashinfer_workspace_buffers[2 * i + 0], + self.flashinfer_workspace_buffer, "NHD", use_tensor_cores=use_tensor_cores, ) diff --git 
a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 973f9c8e1..ae886796c 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -381,7 +381,7 @@ def _set_envs_and_config(server_args: ServerArgs): if not server_args.disable_flashinfer: assert_pkg_version( "flashinfer", - "0.1.4", + "0.1.5", "Please uninstall the old version and " "reinstall the latest version by following the instructions " "at https://docs.flashinfer.ai/installation.html.",