diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 4aa219d60..5c181570a 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -1886,33 +1886,22 @@ class Scheduler:
                 break
 
         if self.server_args.enable_dp_attention:
-            if self.attn_tp_size > 1:
-                # Sync across attn TP ranks to make sure they have the same number of ready requests
-                tensor = torch.tensor(num_ready_reqs, dtype=torch.int32)
-                torch.distributed.all_reduce(
-                    tensor,
-                    op=torch.distributed.ReduceOp.MAX,
-                    group=self.attn_tp_cpu_group,
-                )
-                num_ready_reqs_max = tensor.item()
-                for i in range(num_ready_reqs, num_ready_reqs_max):
-                    self.grammar_queue[i].grammar = self.grammar_queue[
-                        i
-                    ].grammar.result()
-                num_ready_reqs = num_ready_reqs_max
+            tp_size = self.attn_tp_size
+            tp_group = self.attn_tp_cpu_group
         else:
-            if self.tp_size > 1:
-                # Sync across TP ranks to make sure they have the same number of ready requests
-                tensor = torch.tensor(num_ready_reqs, dtype=torch.int32)
-                torch.distributed.all_reduce(
-                    tensor, op=torch.distributed.ReduceOp.MAX, group=self.tp_cpu_group
-                )
-                num_ready_reqs_max = tensor.item()
-                for i in range(num_ready_reqs, num_ready_reqs_max):
-                    self.grammar_queue[i].grammar = self.grammar_queue[
-                        i
-                    ].grammar.result()
-                num_ready_reqs = num_ready_reqs_max
+            tp_size = self.tp_size
+            tp_group = self.tp_cpu_group
+
+        if tp_size > 1:
+            # Sync across TP ranks to make sure they have the same number of ready requests
+            tensor = torch.tensor(num_ready_reqs, dtype=torch.int32)
+            torch.distributed.all_reduce(
+                tensor, op=torch.distributed.ReduceOp.MAX, group=tp_group
+            )
+            num_ready_reqs_max = tensor.item()
+            for i in range(num_ready_reqs, num_ready_reqs_max):
+                self.grammar_queue[i].grammar = self.grammar_queue[i].grammar.result()
+            num_ready_reqs = num_ready_reqs_max
 
         self._extend_requests_to_queue(self.grammar_queue[:num_ready_reqs])
         self.grammar_queue = self.grammar_queue[num_ready_reqs:]
diff --git a/python/sglang/srt/speculative/eagle_worker.py b/python/sglang/srt/speculative/eagle_worker.py
index 557216140..eac8cf891 100644
--- a/python/sglang/srt/speculative/eagle_worker.py
+++ b/python/sglang/srt/speculative/eagle_worker.py
@@ -31,16 +31,6 @@ from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 
 logger = logging.getLogger(__name__)
 
 
-def load_token_map(token_map_path: str) -> List[int]:
-    if not os.path.exists(token_map_path):
-        cache_dir = snapshot_download(
-            os.path.dirname(token_map_path),
-            ignore_patterns=["*.bin", "*.safetensors"],
-        )
-        token_map_path = os.path.join(cache_dir, os.path.basename(token_map_path))
-    return torch.load(token_map_path)
-
-
 class EAGLEWorker(TpModelWorker):
     def __init__(
@@ -57,6 +47,7 @@ class EAGLEWorker(TpModelWorker):
         backup_disable_cuda_graph = server_args.disable_cuda_graph
         server_args.disable_cuda_graph = True
 
+        # Load hot token ids
         if server_args.speculative_token_map is not None:
             self.hot_token_id = load_token_map(server_args.speculative_token_map)
             server_args.json_model_override_args = (
@@ -65,6 +56,7 @@ class EAGLEWorker(TpModelWorker):
         else:
             self.hot_token_id = None
 
+        # Init target worker
         super().__init__(
             gpu_id=gpu_id,
             tp_rank=tp_rank,
@@ -88,9 +80,7 @@ class EAGLEWorker(TpModelWorker):
         embed, head = self.target_worker.model_runner.model.get_embed_and_head()
         if self.hot_token_id is not None:
             head = head.clone()
-            self.hot_token_id = torch.tensor(
-                self.hot_token_id, dtype=torch.int32, device=head.device
-            )
+            self.hot_token_id = self.hot_token_id.to(head.device)
             head.data = head.data[self.hot_token_id]
         self.model_runner.model.set_embed_and_head(embed, head)
         self.model_runner.server_args.disable_cuda_graph = backup_disable_cuda_graph
@@ -369,3 +359,14 @@ class EAGLEWorker(TpModelWorker):
             ][:req_len]
             self.model_runner.token_to_kv_pool.free(kv_indices)
             self.model_runner.req_to_token_pool.free(req.req_pool_idx)
+
+
+def load_token_map(token_map_path: str) -> List[int]:
+    if not os.path.exists(token_map_path):
+        cache_dir = snapshot_download(
+            os.path.dirname(token_map_path),
+            ignore_patterns=["*.bin", "*.safetensors"],
+        )
+        token_map_path = os.path.join(cache_dir, os.path.basename(token_map_path))
+    hot_token_id = torch.load(token_map_path)
+    return torch.tensor(hot_token_id, dtype=torch.int32)
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index 5e42ba425..05e4fc558 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -501,6 +501,7 @@ def get_benchmark_args(
     request_rate=float("inf"),
     disable_stream=False,
     disable_ignore_eos=False,
+    seed: int = 0,
     pd_seperated: bool = False,
 ):
     return SimpleNamespace(
@@ -524,7 +525,7 @@ def get_benchmark_args(
         disable_tqdm=False,
         disable_stream=disable_stream,
         return_logprob=False,
-        seed=0,
+        seed=seed,
         disable_ignore_eos=disable_ignore_eos,
         extra_request_body=None,
         apply_chat_template=False,
@@ -549,6 +550,7 @@ def run_bench_serving(
     disable_stream=False,
     disable_ignore_eos=False,
     need_warmup=False,
+    seed: int = 0,
 ):
     # Launch the server
     base_url = DEFAULT_URL_FOR_TEST
@@ -572,6 +574,7 @@ def run_bench_serving(
         request_rate=request_rate,
         disable_stream=disable_stream,
         disable_ignore_eos=disable_ignore_eos,
+        seed=seed,
     )
 
     try:
diff --git a/test/srt/models/lora/test_lora_backend.py b/test/srt/models/lora/test_lora_backend.py
index b49b47474..08d3494a6 100644
--- a/test/srt/models/lora/test_lora_backend.py
+++ b/test/srt/models/lora/test_lora_backend.py
@@ -18,7 +18,7 @@ import unittest
 from typing import List
 
 import torch
-from utils import *
+from utils import BACKENDS, TORCH_DTYPES, LoRAAdaptor, LoRAModelCase
 
 from sglang.test.runners import HFRunner, SRTRunner
 from sglang.test.test_utils import calculate_rouge_l, is_in_ci
diff --git a/test/srt/models/lora/test_multi_lora_backend.py b/test/srt/models/lora/test_multi_lora_backend.py
index 2be1a9219..8d0047df3 100644
--- a/test/srt/models/lora/test_multi_lora_backend.py
+++ b/test/srt/models/lora/test_multi_lora_backend.py
@@ -13,15 +13,13 @@
 # ==============================================================================
 
 import multiprocessing as mp
-import os
 import unittest
 from typing import List
 
 import torch
-from utils import *
+from utils import BACKENDS, TORCH_DTYPES, LoRAAdaptor, LoRAModelCase
 
-from sglang.test.runners import HFRunner, SRTRunner
-from sglang.test.test_utils import calculate_rouge_l, is_in_ci
+from sglang.test.test_utils import is_in_ci
 
 MULTI_LORA_MODELS = [
     LoRAModelCase(
diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py
index d9970a2ec..8f534030f 100644
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -136,8 +136,8 @@ class TestBenchServing(unittest.TestCase):
     def test_online_latency_eagle(self):
         res = run_bench_serving(
             model=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
-            num_prompts=50,
-            request_rate=1,
+            num_prompts=300,
+            request_rate=8,
             sharegpt_context_len=3072,
             disable_ignore_eos=True,
             dataset_name="sharegpt",
@@ -156,6 +156,7 @@
                 "0.7",
             ],
             need_warmup=True,
+            seed=42,
         )
 
         if is_in_ci():
@@ -164,8 +165,8 @@
                 f'median_e2e_latency_ms : {res["median_e2e_latency_ms"]:.2f} ms\n'
                 f'accept_length : {res["accept_length"]:.2f} \n'
             )
-        self.assertLess(res["median_e2e_latency_ms"], 700)
-        self.assertGreater(res["accept_length"], 2.50)
+        self.assertLess(res["median_e2e_latency_ms"], 1100)
+        self.assertGreater(res["accept_length"], 3.0)
 
     def test_moe_offline_throughput_default(self):
         res = run_bench_serving(
diff --git a/test/srt/test_eagle_infer.py b/test/srt/test_eagle_infer.py
index 863da34bf..2347c3a1e 100644
--- a/test/srt/test_eagle_infer.py
+++ b/test/srt/test_eagle_infer.py
@@ -39,7 +39,7 @@ class TestEAGLEEngine(unittest.TestCase):
         self.ref_output = ref_engine.generate(self.prompt, self.sampling_params)["text"]
         ref_engine.shutdown()
 
-    def test_eagle_accuracy(self):
+    def test_correctness(self):
         configs = [
             self.BASE_CONFIG,
             {**self.BASE_CONFIG, "disable_cuda_graph": True},
@@ -95,67 +95,6 @@ class TestEAGLEEngine(unittest.TestCase):
             print("-" * 40)
 
 
-class TestEAGLEEngineTokenMap(unittest.TestCase):
-    BASE_CONFIG = {
-        "model_path": "meta-llama/Meta-Llama-3-8B-Instruct",
-        "speculative_draft_model_path": "lmzheng/sglang-EAGLE-LLaMA3-Instruct-8B",
-        "speculative_algorithm": "EAGLE",
-        "speculative_num_steps": 5,
-        "speculative_eagle_topk": 8,
-        "speculative_num_draft_tokens": 64,
-        "mem_fraction_static": 0.7,
-        "cuda_graph_max_bs": 4,
-        "dtype": "float16",
-    }
-
-    def setUp(self):
-        self.prompt = "Today is a sunny day and I like"
-        self.sampling_params = {"temperature": 0, "max_new_tokens": 8}
-
-        ref_engine = sgl.Engine(model_path=self.BASE_CONFIG["model_path"])
-        self.ref_output = ref_engine.generate(self.prompt, self.sampling_params)["text"]
-        ref_engine.shutdown()
-
-    def test_token_map_accuracy(self):
-        configs = [
-            self.BASE_CONFIG,
-            {
-                **self.BASE_CONFIG,
-                "speculative_token_map": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt",
-            },
-        ]
-
-        for config in configs:
-            print("testing config: ", config)
-            with self.subTest(cuda_graph="enabled"):
-                engine = sgl.Engine(**config)
-                try:
-                    self._test_basic_generation(engine)
-                    self._test_batch_generation(engine)
-                finally:
-                    engine.shutdown()
-
-    def _test_basic_generation(self, engine):
-        output = engine.generate(self.prompt, self.sampling_params)["text"]
-        print(f"{output=}, {self.ref_output=}")
-        self.assertEqual(output, self.ref_output)
-
-    def _test_batch_generation(self, engine):
-        prompts = [
-            "Hello, my name is",
-            "The president of the United States is",
-            "The capital of France is",
-            "The future of AI is",
-        ]
-        params = {"temperature": 0, "max_new_tokens": 30}
-
-        outputs = engine.generate(prompts, params)
-        for prompt, output in zip(prompts, outputs):
-            print(f"Prompt: {prompt}")
-            print(f"Generated: {output['text']}")
-            print("-" * 40)
-
-
 prompts = [
     "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nToday is a sunny day and I like[/INST]"
     '[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nWhat are the mental triggers in Jeff Walker\'s Product Launch Formula and "Launch" book?[/INST]',
@@ -222,7 +161,7 @@
                         "max_new_tokens": 1024,
                     },
                 }
-                # set timeout = 1s,mock disconnected
+                # set timeout = 1s, mock disconnected
                 requests.post(url, json=data, timeout=1)
             except Exception as e:
                 print(e)
@@ -273,18 +212,71 @@ class TestEAGLEServerTriton(TestEAGLEServer):
                 "--speculative-num-steps",
                 "5",
                 "--speculative-eagle-topk",
-                "8",
+                "4",
                 "--speculative-num-draft-tokens",
-                "64",
+                "8",
"--mem-fraction-static", "0.7", "--attention-backend", "triton", "--cuda-graph-max-bs", - "32", + "16", ], ) +class TestEAGLEEngineTokenMap(unittest.TestCase): + def setUp(self): + self.prompt = "Today is a sunny day and I like" + self.sampling_params = {"temperature": 0, "max_new_tokens": 8} + + ref_engine = sgl.Engine( + model_path="meta-llama/Meta-Llama-3-8B-Instruct", cuda_graph_max_bs=2 + ) + self.ref_output = ref_engine.generate(self.prompt, self.sampling_params)["text"] + ref_engine.shutdown() + + def test_correctness(self): + config = { + "model_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "speculative_draft_model_path": "lmsys/sglang-EAGLE-LLaMA3-Instruct-8B", + "speculative_algorithm": "EAGLE", + "speculative_num_steps": 5, + "speculative_eagle_topk": 4, + "speculative_num_draft_tokens": 8, + "speculative_token_map": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt", + "mem_fraction_static": 0.7, + "cuda_graph_max_bs": 4, + "dtype": "bfloat16", + } + + engine = sgl.Engine(**config) + try: + self._test_basic_generation(engine) + self._test_batch_generation(engine) + finally: + engine.shutdown() + + def _test_basic_generation(self, engine): + output = engine.generate(self.prompt, self.sampling_params)["text"] + print(f"{output=}, {self.ref_output=}") + self.assertEqual(output, self.ref_output) + + def _test_batch_generation(self, engine): + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + params = {"temperature": 0, "max_new_tokens": 30} + + outputs = engine.generate(prompts, params) + for prompt, output in zip(prompts, outputs): + print(f"Prompt: {prompt}") + print(f"Generated: {output['text']}") + print("-" * 40) + + if __name__ == "__main__": unittest.main() diff --git a/test/srt/test_gguf.py b/test/srt/test_gguf.py index 89572c45f..b4072c775 100644 --- a/test/srt/test_gguf.py +++ b/test/srt/test_gguf.py @@ -15,7 +15,7 @@ class TestGGUF(unittest.TestCase): filename="qwen2-1_5b-instruct-q4_k_m.gguf", ) - engine = sgl.Engine(model_path=model_path, random_seed=42) + engine = sgl.Engine(model_path=model_path, random_seed=42, cuda_graph_max_bs=2) outputs = engine.generate(prompt, sampling_params)["text"] engine.shutdown() diff --git a/test/srt/test_hidden_states.py b/test/srt/test_hidden_states.py index 83fda7756..4c28b3139 100644 --- a/test/srt/test_hidden_states.py +++ b/test/srt/test_hidden_states.py @@ -4,13 +4,13 @@ import torch from transformers import AutoModelForCausalLM, AutoTokenizer import sglang as sgl -from sglang.test.test_utils import is_in_ci +from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST class TestHiddenState(unittest.TestCase): def test_return_hidden_states(self): prompts = ["Today is", "Today is a sunny day and I like"] - model_path = "meta-llama/Meta-Llama-3.1-8B-Instruct" + model_path = DEFAULT_SMALL_MODEL_NAME_FOR_TEST tokenizer = AutoTokenizer.from_pretrained(model_path) input_ids = tokenizer(prompts).input_ids @@ -80,7 +80,7 @@ class TestHiddenState(unittest.TestCase): def test_repeatedly_changes_hidden_states(self): prompts = ["Today is", "Today is a sunny day and I like"] - model_path = "meta-llama/Meta-Llama-3.1-8B-Instruct" + model_path = DEFAULT_SMALL_MODEL_NAME_FOR_TEST tokenizer = AutoTokenizer.from_pretrained(model_path) input_ids = tokenizer(prompts).input_ids diff --git a/test/srt/test_input_embeddings.py b/test/srt/test_input_embeddings.py index bcccf5255..015aabe78 100644 --- a/test/srt/test_input_embeddings.py +++ 
b/test/srt/test_input_embeddings.py @@ -24,7 +24,7 @@ class TestInputEmbeds(unittest.TestCase): cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=["--disable-radix"], + other_args=["--disable-radix", "--cuda-graph-max-bs", 4], ) cls.texts = [ "The capital of France is", diff --git a/test/srt/test_json_constrained.py b/test/srt/test_json_constrained.py index 6ca9d8ebc..f9295cba2 100644 --- a/test/srt/test_json_constrained.py +++ b/test/srt/test_json_constrained.py @@ -20,7 +20,7 @@ from sglang.test.test_utils import ( ) -def setup_class(cls, backend: str, disable_overlap: bool): +def setup_class(cls, backend: str): cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.json_schema = json.dumps( @@ -42,9 +42,6 @@ def setup_class(cls, backend: str, disable_overlap: bool): backend, ] - if disable_overlap: - other_args += ["--disable-overlap-schedule"] - cls.process = popen_launch_server( cls.model, cls.base_url, @@ -56,7 +53,7 @@ def setup_class(cls, backend: str, disable_overlap: bool): class TestJSONConstrainedOutlinesBackend(unittest.TestCase): @classmethod def setUpClass(cls): - setup_class(cls, backend="outlines", disable_overlap=False) + setup_class(cls, backend="outlines") @classmethod def tearDownClass(cls): @@ -133,5 +130,17 @@ class TestJSONConstrainedOutlinesBackend(unittest.TestCase): list(executor.map(self.run_decode, json_schemas)) +class TestJSONConstrainedXGrammarBackend(TestJSONConstrainedOutlinesBackend): + @classmethod + def setUpClass(cls): + setup_class(cls, backend="xgrammar") + + +class TestJSONConstrainedLLGuidanceBackend(TestJSONConstrainedOutlinesBackend): + @classmethod + def setUpClass(cls): + setup_class(cls, backend="llguidance") + + if __name__ == "__main__": unittest.main() diff --git a/test/srt/test_metrics.py b/test/srt/test_metrics.py index 6fd3295b5..09b9b5a28 100644 --- a/test/srt/test_metrics.py +++ b/test/srt/test_metrics.py @@ -18,7 +18,7 @@ class TestEnableMetrics(unittest.TestCase): DEFAULT_SMALL_MODEL_NAME_FOR_TEST, DEFAULT_URL_FOR_TEST, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=["--enable-metrics"], + other_args=["--enable-metrics", "--cuda-graph-max-bs", 2], ) try: diff --git a/test/srt/test_triton_attention_backend.py b/test/srt/test_triton_attention_backend.py index 88904c55f..4e479c809 100644 --- a/test/srt/test_triton_attention_backend.py +++ b/test/srt/test_triton_attention_backend.py @@ -26,6 +26,8 @@ class TestTritonAttnBackend(unittest.TestCase): "--attention-backend", "triton", "--enable-torch-compile", + "--cuda-graph-max-bs", + 16, ], ) diff --git a/test/srt/test_vertex_endpoint.py b/test/srt/test_vertex_endpoint.py index 728d0d1d2..b20dc8fda 100644 --- a/test/srt/test_vertex_endpoint.py +++ b/test/srt/test_vertex_endpoint.py @@ -24,6 +24,7 @@ class TestVertexEndpoint(unittest.TestCase): cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=["--cuda-graph-max-bs", 2], ) @classmethod