From 621e96bf9b28b084e29f3e285b66c9ab8ea8608e Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Sun, 27 Apr 2025 07:18:10 -0700
Subject: [PATCH] [CI] Fix ci tests (#5769)

---
 python/sglang/bench_one_batch.py              |   6 +
 .../sglang/srt/layers/moe/fused_moe_native.py |   6 +-
 .../layers/moe/fused_moe_triton/fused_moe.py  |  26 ++---
 .../sglang/srt/model_executor/model_runner.py |  20 ++--
 python/sglang/srt/models/llama4.py            |   1 -
 python/sglang/srt/server_args.py              |  34 ++----
 .../sglang/srt/torch_memory_saver_adapter.py  |  11 +-
 python/sglang/srt/utils.py                    |   2 +-
 python/sglang/test/test_utils.py              |  51 +++++----
 test/srt/run_suite.py                         |  44 ++++----
 test/srt/test_bench_one_batch.py              |   6 +-
 test/srt/test_bench_serving.py                |  20 ++--
 test/srt/test_eval_fp8_accuracy.py            |  10 +-
 test/srt/test_fa3.py                          |   2 +-
 test/srt/test_modelopt.py                     |   8 +-
 test/srt/test_nightly_gsm8k_eval.py           |  22 +---
 test/srt/test_nightly_human_eval.py           | 105 ------------------
 test/srt/test_nightly_math_eval.py            |  47 --------
 18 files changed, 126 insertions(+), 295 deletions(-)
 delete mode 100644 test/srt/test_nightly_human_eval.py
 delete mode 100644 test/srt/test_nightly_math_eval.py

diff --git a/python/sglang/bench_one_batch.py b/python/sglang/bench_one_batch.py
index fb7343532..cdf8e9ea3 100644
--- a/python/sglang/bench_one_batch.py
+++ b/python/sglang/bench_one_batch.py
@@ -57,6 +57,7 @@ import torch
 import torch.distributed as dist
 
 from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.distributed.parallel_state import destroy_distributed_environment
 from sglang.srt.entrypoints.engine import _set_envs_and_config
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
@@ -502,8 +503,13 @@ def latency_test(
             for result in result_list:
                 fout.write(json.dumps(result) + "\n")
 
+    if server_args.tp_size > 1:
+        destroy_distributed_environment()
+
 
 def main(server_args, bench_args):
+    server_args.cuda_graph_max_bs = max(bench_args.batch_size)
+
     _set_envs_and_config(server_args)
 
     if server_args.model_path:
diff --git a/python/sglang/srt/layers/moe/fused_moe_native.py b/python/sglang/srt/layers/moe/fused_moe_native.py
index ce9940e1a..8cfbf6f86 100644
--- a/python/sglang/srt/layers/moe/fused_moe_native.py
+++ b/python/sglang/srt/layers/moe/fused_moe_native.py
@@ -8,6 +8,7 @@ from typing import Callable, Optional
 import torch
 from torch.nn import functional as F
 
+from sglang.srt.layers.activation import GeluAndMul, SiluAndMul
 from sglang.srt.layers.moe.topk import select_experts
 
 
@@ -30,7 +31,7 @@ def fused_moe_forward_native(
 ) -> torch.Tensor:
 
     if apply_router_weight_on_input:
-        raise NotImplementedError
+        raise NotImplementedError()
 
     topk_weights, topk_ids = select_experts(
         hidden_states=x,
@@ -75,9 +76,6 @@ def moe_forward_native(
     activation: str = "silu",
     routed_scaling_factor: Optional[float] = None,
 ) -> torch.Tensor:
-
-    from sglang.srt.layers.activation import GeluAndMul, SiluAndMul
-
     topk_weights, topk_ids = select_experts(
         hidden_states=x,
         router_logits=router_logits,
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
index eca92b151..da541f542 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -13,7 +13,16 @@ import triton
 import triton.language as tl
 
 from sglang.srt.layers.moe.topk import select_experts
-from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
+from 
sglang.srt.layers.quantization.fp8_kernel import ( + per_token_group_quant_fp8, + scaled_fp8_quant, + sglang_per_token_group_quant_fp8, +) +from sglang.srt.layers.quantization.int8_kernel import ( + per_token_group_quant_int8, + per_token_quant_int8, + sglang_per_token_group_quant_int8, +) from sglang.srt.utils import ( direct_register_custom_op, get_bool_env_var, @@ -746,21 +755,6 @@ def invoke_fused_moe_kernel( block_shape: Optional[List[int]] = None, no_combine: bool = False, ) -> None: - from sglang.srt.layers.quantization.int8_kernel import ( - per_token_group_quant_int8, - per_token_quant_int8, - ) - - if _is_cuda: - from sglang.srt.layers.quantization.fp8_kernel import ( - sglang_per_token_group_quant_fp8, - ) - from sglang.srt.layers.quantization.int8_kernel import ( - sglang_per_token_group_quant_int8, - ) - else: - from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8 - assert topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 735d67fb2..60ab0f36f 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -91,11 +91,14 @@ from sglang.srt.utils import ( set_cuda_arch, ) -logger = logging.getLogger(__name__) - +# Use a small KV cache pool size for tests in CI SGLANG_CI_SMALL_KV_SIZE = os.getenv("SGLANG_CI_SMALL_KV_SIZE", None) + +# Detect stragger ranks in model loading UNBALANCED_MODEL_LOADING_TIMEOUT_S = 300 +logger = logging.getLogger(__name__) + class ModelRunner: """ModelRunner runs the forward passes of the models.""" @@ -177,7 +180,7 @@ class ModelRunner: if _ENABLE_JIT_DEEPGEMM: update_deep_gemm_config(gpu_id, server_args) - # If it is a draft model tp_group can be different. + # If it is a draft model, tp_group can be different self.initialize(min_per_gpu_memory) def initialize(self, min_per_gpu_memory: float): @@ -230,7 +233,8 @@ class ModelRunner: if server_args.attention_backend is None: """ - We auto select the fastest attention backend according to the current offering + Auto select the fastest attention backend. + 1. Models with MHA Architecture (e.g: Llama, QWen) 1.1 We will turn on FA3 on hopper unless user use spec decode with topk > 1 or page_size > 1. 1.2 In other cases, we will use flashinfer if available, otherwise use triton. @@ -240,6 +244,7 @@ class ModelRunner: """ if not self.use_mla_backend: + # MHA architecture if ( is_hopper_with_cuda_12_3() and is_no_spec_infer_or_topk_one(server_args) @@ -251,6 +256,7 @@ class ModelRunner: "flashinfer" if is_flashinfer_available() else "triton" ) else: + # MLA architecture if is_hopper_with_cuda_12_3(): server_args.attention_backend = "fa3" else: @@ -259,7 +265,6 @@ class ModelRunner: f"Attention backend not set. Use {server_args.attention_backend} backend by default." ) elif self.use_mla_backend: - # TODO: add MLA optimization on CPU if server_args.device != "cpu": if server_args.attention_backend in [ "flashinfer", @@ -275,7 +280,7 @@ class ModelRunner: f"Invalid attention backend for MLA: {server_args.attention_backend}" ) else: - raise ValueError(f"MLA optimization not supported on CPU.") + raise ValueError("MLA optimization not supported on CPU.") if ( server_args.attention_backend == "fa3" @@ -310,9 +315,6 @@ class ModelRunner: ) server_args.chunked_prefill_size = -1 - if server_args.enable_deepep_moe: - logger.info(f"DeepEP is turned on. 
DeepEP mode: {server_args.deepep_mode}") - if not self.use_mla_backend: server_args.disable_chunked_prefix_cache = True elif self.page_size > 1: diff --git a/python/sglang/srt/models/llama4.py b/python/sglang/srt/models/llama4.py index 88c3716f7..95edfa40e 100644 --- a/python/sglang/srt/models/llama4.py +++ b/python/sglang/srt/models/llama4.py @@ -260,7 +260,6 @@ class Llama4Attention(nn.Module): if self.rotary_emb is not None: q_view, k_view = qk.split([self.q_size, self.kv_size], dim=-1) q_out_unused, k_out_unused = self.rotary_emb(positions, q_view, k_view) - assert (q_out_unused is q_view) and (k_out_unused is k_view) del q_view, k_view, q_out_unused, k_out_unused if self.qk_norm is not None: diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 6e81e2ba9..0371e1c52 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -201,7 +201,7 @@ class ServerArgs: # Expert parallelism if self.enable_ep_moe: self.ep_size = self.tp_size - logger.info( + logger.warning( f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]." ) @@ -243,19 +243,19 @@ class ServerArgs: self.chunked_prefill_size = 2048 else: self.chunked_prefill_size = 8192 - assert self.chunked_prefill_size % self.page_size == 0 assert self.moe_dense_tp_size in { 1, None, - }, f"moe_dense_tp_size only support 1 and None currently" + }, "moe_dense_tp_size only support 1 and None currently" if self.attention_backend == "flashmla": logger.warning( "FlashMLA only supports a page_size of 64, change page_size to 64." ) self.page_size = 64 + # Set cuda graph max batch size if self.cuda_graph_max_bs is None: # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues. @@ -270,6 +270,7 @@ class ServerArgs: self.attention_backend = "torch_native" self.sampling_backend = "pytorch" + # Set kernel backends if self.sampling_backend is None: self.sampling_backend = ( "flashinfer" if is_flashinfer_available() else "pytorch" @@ -297,8 +298,8 @@ class ServerArgs: f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. " ) - self.enable_sp_layernorm = False # DeepEP MoE + self.enable_sp_layernorm = False if self.enable_deepep_moe: if self.deepep_mode == "auto": assert ( @@ -308,7 +309,7 @@ class ServerArgs: self.enable_sp_layernorm = ( self.dp_size < self.tp_size if self.enable_dp_attention else True ) - logger.info( + logger.warning( f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]." 
) @@ -317,14 +318,11 @@ class ServerArgs: # NEXTN shares the same implementation of EAGLE self.speculative_algorithm = "EAGLE" - if ( - self.speculative_algorithm == "EAGLE" - or self.speculative_algorithm == "EAGLE3" - ): + if self.speculative_algorithm in ("EAGLE", "EAGLE3"): if self.max_running_requests is None: self.max_running_requests = 48 self.disable_overlap_schedule = True - logger.info( + logger.warning( "Overlap scheduler is disabled because of using " "eagle speculative decoding." ) @@ -343,7 +341,7 @@ class ServerArgs: if self.page_size > 1 and self.speculative_eagle_topk > 1: self.speculative_eagle_topk = 1 - logger.info( + logger.warning( "speculative_eagle_topk is adjusted to 1 when page_size > 1" ) @@ -351,7 +349,7 @@ class ServerArgs: self.speculative_eagle_topk == 1 and self.speculative_num_draft_tokens != self.speculative_num_steps + 1 ): - logger.info( + logger.warning( "speculative_num_draft_tokens is adjusted to speculative_num_steps + 1 when speculative_eagle_topk == 1" ) self.speculative_num_draft_tokens = self.speculative_num_steps + 1 @@ -381,18 +379,6 @@ class ServerArgs: self.disable_radix_cache = True logger.warning("KV cache is forced as chunk cache for decode server") - if self.enable_memory_saver: - try: - import torch_memory_saver - except ImportError: - logger.warning( - "enable_memory_saver is enabled, but " - "torch-memory-saver is not installed. Please install it " - "via `pip3 uninstall torch-memory-saver`. " - "For normal operation, it will be disabled." - ) - raise - os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = ( "1" if self.enable_torch_compile else "0" ) diff --git a/python/sglang/srt/torch_memory_saver_adapter.py b/python/sglang/srt/torch_memory_saver_adapter.py index 4fd0611e1..2b1080d25 100644 --- a/python/sglang/srt/torch_memory_saver_adapter.py +++ b/python/sglang/srt/torch_memory_saver_adapter.py @@ -6,7 +6,9 @@ try: import torch_memory_saver _primary_memory_saver = torch_memory_saver.TorchMemorySaver() -except ImportError: + import_error = None +except ImportError as e: + import_error = e pass logger = logging.getLogger(__name__) @@ -15,6 +17,13 @@ logger = logging.getLogger(__name__) class TorchMemorySaverAdapter(ABC): @staticmethod def create(enable: bool): + if enable and import_error is not None: + logger.warning( + "enable_memory_saver is enabled, but " + "torch-memory-saver is not installed. Please install it " + "via `pip3 install torch-memory-saver`. 
" + ) + raise import_error return ( _TorchMemorySaverAdapterReal() if enable else _TorchMemorySaverAdapterNoop() ) diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index ba6bb6140..a15c2f5b0 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -1944,7 +1944,7 @@ def get_local_ip_by_remote() -> str: s.connect(("2001:4860:4860::8888", 80)) # Doesn't need to be reachable return s.getsockname()[0] except Exception: - raise ValueError(f"Can not get local ip") + raise ValueError("Can not get local ip") def is_page_size_one(server_args): diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 7e1889f39..322ee77c3 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -33,33 +33,44 @@ from sglang.srt.utils import ( from sglang.test.run_eval import run_eval from sglang.utils import get_exception_traceback -DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8" -DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST = "neuralmagic/Meta-Llama-3-8B-Instruct-FP8" -DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST = ( +# General test models +DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct" +DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct" +DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1" +DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B" + +# MLA test models +DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" +DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8" +DEFAULT_MODEL_NAME_FOR_TEST_MLA = "lmsys/sglang-ci-dsv3-test" +DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN = "lmsys/sglang-ci-dsv3-test-NextN" + +# FP8 models +DEFAULT_MODEL_NAME_FOR_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8" +DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8" +DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8 = ( "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic" ) -DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST = ( +DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8 = ( "nvidia/Llama-3.1-8B-Instruct-FP8" ) -DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct" +# EAGLE +DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf" +DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B" DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B" -DEFAULT_MODEL_NAME_FOR_TEST_MLA = "lmsys/sglang-ci-dsv3-test" -DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN = "lmsys/sglang-ci-dsv3-test-NextN" -DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct" + +# Other use cases DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = ( "meta-llama/Llama-4-Scout-17B-16E-Instruct" ) -DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1" -DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B" DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct" -DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" -DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8" DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = ( "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4" ) -DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000 + +# Nightly tests DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = 
"meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8" @@ -68,12 +79,11 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct" DEFAULT_SMALL_VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B" -DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf" -DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B" - DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true" DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4" +DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000 + def is_in_ci(): """Return whether it is in CI runner.""" @@ -499,7 +509,7 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float): tic = time.time() success = True - for file in files: + for i, file in enumerate(files): filename, estimated_time = file.name, file.estimated_time process = None @@ -507,7 +517,10 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float): nonlocal process filename = os.path.join(os.getcwd(), filename) - print(f".\n.\nBegin:\npython3 {filename}\n.\n.\n", flush=True) + print( + f".\n.\nBegin ({i}/{len(files)}):\npython3 {filename}\n.\n.\n", + flush=True, + ) tic = time.time() process = subprocess.Popen( @@ -517,7 +530,7 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float): elapsed = time.time() - tic print( - f".\n.\nEnd:\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n", + f".\n.\nEnd ({i}/{len(files)}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n", flush=True, ) return process.returncode diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 8da966d84..2d9ca8f11 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -16,23 +16,29 @@ suites = { TestFile("models/lora/test_lora.py", 76), TestFile("models/lora/test_lora_backend.py", 99), TestFile("models/lora/test_multi_lora_backend.py", 60), - TestFile("models/test_embedding_models.py", 35), + TestFile("models/test_embedding_models.py", 184), + TestFile("models/test_clip_models.py", 52), + TestFile("models/test_compressed_tensors_models.py", 42), TestFile("models/test_generation_models.py", 103), + TestFile("models/test_gme_qwen_models.py", 45), # TestFile("models/test_grok_models.py", 60), # Disabled due to illegal memory access TestFile("models/test_qwen_models.py", 82), - TestFile("models/test_compressed_tensors_models.py", 100), - TestFile("models/test_reward_models.py", 83), - TestFile("models/test_gme_qwen_models.py", 45), - TestFile("models/test_clip_models.py", 52), - TestFile("models/test_vlm_models.py", 581), + TestFile("models/test_reward_models.py", 132), + TestFile("models/test_vlm_models.py", 317), TestFile("test_abort.py", 51), TestFile("test_block_int8.py", 22), + TestFile("test_create_kvindices.py", 2), TestFile("test_chunked_prefill.py", 285), TestFile("test_eagle_infer.py", 584), - TestFile("test_ebnf_constrained.py"), - TestFile("test_fa3.py", 376), - TestFile("test_fp8_kernel.py", 8), + 
TestFile("test_ebnf_constrained.py", 108), TestFile("test_embedding_openai_server.py", 141), + TestFile("test_eval_fp8_accuracy.py", 303), + TestFile("test_fa3.py", 376), + TestFile("test_fim_completion.py", 40), + TestFile("test_fp8_kernel.py", 8), + TestFile("test_fused_moe.py", 30), + TestFile("test_hicache.py", 116), + TestFile("test_hicache_mla.py", 254), TestFile("test_hidden_states.py", 55), TestFile("test_int8_kernel.py", 8), TestFile("test_input_embeddings.py", 38), @@ -41,11 +47,11 @@ suites = { TestFile("test_metrics.py", 32), TestFile("test_mla.py", 242), TestFile("test_mla_deepseek_v3.py", 221), - TestFile("test_mla_int8_deepseek_v3.py", 674), + TestFile("test_mla_int8_deepseek_v3.py", 389), TestFile("test_mla_flashinfer.py", 395), TestFile("test_mla_fp8.py", 153), - TestFile("test_no_chunked_prefill.py", 126), - TestFile("test_no_overlap_scheduler.py", 262), + TestFile("test_no_chunked_prefill.py", 108), + TestFile("test_no_overlap_scheduler.py", 216), TestFile("test_openai_server.py", 149), TestFile("test_penalty.py", 41), TestFile("test_page_size.py", 60), @@ -59,27 +65,21 @@ suites = { TestFile("test_server_args.py", 1), TestFile("test_skip_tokenizer_init.py", 117), TestFile("test_srt_engine.py", 237), - TestFile("test_srt_endpoint.py", 94), + TestFile("test_srt_endpoint.py", 130), TestFile("test_torch_compile.py", 76), - TestFile("test_torch_compile_moe.py", 235), + TestFile("test_torch_compile_moe.py", 172), TestFile("test_torch_native_attention_backend.py", 123), TestFile("test_torchao.py", 70), TestFile("test_triton_attention_kernels.py", 4), TestFile("test_triton_attention_backend.py", 134), + TestFile("test_triton_moe_channel_fp8_kernel.py", 25), TestFile("test_update_weights_from_disk.py", 114), TestFile("test_update_weights_from_tensor.py", 48), TestFile("test_vertex_endpoint.py", 31), - TestFile("test_vision_chunked_prefill.py", 119), + TestFile("test_vision_chunked_prefill.py", 175), TestFile("test_vlm_accuracy.py", 60), TestFile("test_vision_openai_server.py", 637), - TestFile("test_fim_completion.py", 40), TestFile("test_w8a8_quantization.py", 46), - TestFile("test_eval_fp8_accuracy.py", 303), - TestFile("test_create_kvindices.py", 2), - TestFile("test_hicache.py", 116), - TestFile("test_hicache_mla.py", 254), - TestFile("test_fused_moe.py", 30), - TestFile("test_triton_moe_channel_fp8_kernel.py", 25), ], "per-commit-2-gpu": [ TestFile("models/lora/test_lora_tp.py", 116), diff --git a/test/srt/test_bench_one_batch.py b/test/srt/test_bench_one_batch.py index 098e2df9f..9f5a56566 100644 --- a/test/srt/test_bench_one_batch.py +++ b/test/srt/test_bench_one_batch.py @@ -29,13 +29,9 @@ class TestBenchOneBatch(CustomTestCase): DEFAULT_MOE_MODEL_NAME_FOR_TEST, ["--tp", "2", "--cuda-graph-max-bs", "2"] ) - use_vllm_custom_allreduce = get_bool_env_var( - "USE_VLLM_CUSTOM_ALLREDUCE", default="false" - ) - if is_in_ci(): write_github_step_summary( - f"### test_moe_tp2_bs1 ({use_vllm_custom_allreduce=})\n" + f"### test_moe_tp2_bs1\n" f"output_throughput : {output_throughput:.2f} token/s\n" ) self.assertGreater(output_throughput, 124) diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py index 811b5d739..821b23ebe 100644 --- a/test/srt/test_bench_serving.py +++ b/test/srt/test_bench_serving.py @@ -3,8 +3,8 @@ import unittest from sglang.test.test_utils import ( DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST, DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST, - DEFAULT_FP8_MODEL_NAME_FOR_TEST, DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_MODEL_NAME_FOR_TEST_FP8, 
DEFAULT_MOE_MODEL_NAME_FOR_TEST, CustomTestCase, is_in_ci, @@ -28,7 +28,7 @@ class TestBenchServing(CustomTestCase): f"### test_offline_throughput_default\n" f'Output throughput: {res["output_throughput"]:.2f} token/s\n' ) - self.assertGreater(res["output_throughput"], 3350) + self.assertGreater(res["output_throughput"], 3800) def test_offline_throughput_non_stream_small_batch_size(self): res = run_bench_serving( @@ -48,9 +48,7 @@ class TestBenchServing(CustomTestCase): f"### test_offline_throughput_non_stream_small_batch_size\n" f'Output throughput: {res["output_throughput"]:.2f} token/s\n' ) - # There is a regression with torch 2.5 - # This number was 950 for torch 2.4 - self.assertGreater(res["output_throughput"], 1000) + self.assertGreater(res["output_throughput"], 1050) def test_offline_throughput_without_radix_cache(self): res = run_bench_serving( @@ -65,7 +63,7 @@ class TestBenchServing(CustomTestCase): f"### test_offline_throughput_without_radix_cache\n" f'Output throughput: {res["output_throughput"]:.2f} token/s\n' ) - self.assertGreater(res["output_throughput"], 3350) + self.assertGreater(res["output_throughput"], 3800) def test_offline_throughput_without_chunked_prefill(self): res = run_bench_serving( @@ -100,11 +98,11 @@ class TestBenchServing(CustomTestCase): f"### test_offline_throughput_with_triton_attention_backend\n" f'Output throughput: {res["output_throughput"]:.2f} token/s\n' ) - self.assertGreater(res["output_throughput"], 3450) + self.assertGreater(res["output_throughput"], 3600) def test_offline_throughput_default_fp8(self): res = run_bench_serving( - model=DEFAULT_FP8_MODEL_NAME_FOR_TEST, + model=DEFAULT_MODEL_NAME_FOR_TEST_FP8, num_prompts=500, request_rate=float("inf"), other_server_args=[], @@ -115,7 +113,7 @@ class TestBenchServing(CustomTestCase): f"### test_offline_throughput_default_fp8\n" f'Output throughput: {res["output_throughput"]:.2f} token/s\n' ) - self.assertGreater(res["output_throughput"], 3900) + self.assertGreater(res["output_throughput"], 4200) def test_online_latency_default(self): res = run_bench_serving( @@ -166,8 +164,8 @@ class TestBenchServing(CustomTestCase): f'median_e2e_latency_ms : {res["median_e2e_latency_ms"]:.2f} ms\n' f'accept_length : {res["accept_length"]:.2f} \n' ) - self.assertLess(res["median_e2e_latency_ms"], 900) - self.assertGreater(res["accept_length"], 2.99) + self.assertLess(res["median_e2e_latency_ms"], 800) + self.assertGreater(res["accept_length"], 3.0) def test_moe_offline_throughput_default(self): res = run_bench_serving( diff --git a/test/srt/test_eval_fp8_accuracy.py b/test/srt/test_eval_fp8_accuracy.py index 80448f03e..329e2dad8 100644 --- a/test/srt/test_eval_fp8_accuracy.py +++ b/test/srt/test_eval_fp8_accuracy.py @@ -4,8 +4,8 @@ from types import SimpleNamespace from sglang.srt.utils import is_hip, kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( - DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST, - DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST, + DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8, + DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8, DEFAULT_MODEL_NAME_FOR_TEST, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, @@ -17,7 +17,7 @@ from sglang.test.test_utils import ( class TestEvalFP8Accuracy(CustomTestCase): @classmethod def setUpClass(cls): - cls.model = DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST + cls.model = DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8 cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, 
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH @@ -76,7 +76,7 @@ class TestEvalFP8DynamicQuantAccuracy(CustomTestCase): def test_mmlu_offline_only(self): """Test with offline quantization only.""" self._run_test( - model=DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST, + model=DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8, other_args=[], expected_score=0.64, ) @@ -84,7 +84,7 @@ class TestEvalFP8DynamicQuantAccuracy(CustomTestCase): def test_mmlu_offline_and_online_override(self): """Test with both offline and online quantization.""" self._run_test( - model=DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST, + model=DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8, other_args=["--quantization", "w8a8_fp8"], # inference will use sgl kernel w/ online quant override # we observed that the accuracy is higher then offline only diff --git a/test/srt/test_fa3.py b/test/srt/test_fa3.py index 1dd3fc65c..6946dd4f3 100644 --- a/test/srt/test_fa3.py +++ b/test/srt/test_fa3.py @@ -48,7 +48,7 @@ if OFFLINE_MODE: DEFAULT_SERVER_ARGS = [ "--trust-remote-code", "--cuda-graph-max-bs", - "4", + "8", "--attention-backend", "fa3", ] diff --git a/test/srt/test_modelopt.py b/test/srt/test_modelopt.py index 166af22a5..ef6a959ec 100644 --- a/test/srt/test_modelopt.py +++ b/test/srt/test_modelopt.py @@ -6,8 +6,8 @@ import torch from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( - DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST, - DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_REVISION, + DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8, + DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8_REVISION, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, CustomTestCase, @@ -49,10 +49,10 @@ class TestEvalFP8ModelOptQuantAccuracy(CustomTestCase): def test_mmlu_offline_only(self): """Test with offline quantization only.""" self._run_test( - model=DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST, + model=DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8, other_args=[ "--revision", - DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_REVISION, + DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8_REVISION, ], expected_score=0.64, ) diff --git a/test/srt/test_nightly_gsm8k_eval.py b/test/srt/test_nightly_gsm8k_eval.py index 7232a1274..8de103b52 100644 --- a/test/srt/test_nightly_gsm8k_eval.py +++ b/test/srt/test_nightly_gsm8k_eval.py @@ -14,7 +14,6 @@ from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, - CustomTestCase, is_in_ci, popen_launch_server, write_github_step_summary, @@ -45,25 +44,10 @@ def parse_models(model_string): return [model.strip() for model in model_string.split(",") if model.strip()] -def popen_launch_server_wrapper(base_url, model, is_fp8, is_tp2): +def popen_launch_server_wrapper(base_url, model, is_tp2): other_args = ["--log-level-http", "warning", "--trust-remote-code"] - if is_fp8: - if "Llama-3" in model or "gemma-2" in model: - other_args.extend(["--kv-cache-dtype", "fp8_e5m2"]) - elif "Qwen2-72B-Instruct-FP8" in model: - other_args.extend(["--quantization", "fp8"]) - elif "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8" in model: - other_args.extend([]) - else: - other_args.extend(["--quantization", "fp8", "--kv-cache-dtype", "fp8_e5m2"]) if is_tp2: other_args.extend(["--tp", "2"]) - if "DeepSeek" in model: - other_args.extend(["--mem-frac", "0.85"]) - if "AWQ" in model: - 
other_args.extend(["--quantization", "awq"]) - elif "GPTQ" in model: - other_args.extend(["--quantization", "gptq"]) process = popen_launch_server( model, @@ -150,9 +134,7 @@ class TestNightlyGsm8KEval(unittest.TestCase): for model_group, is_fp8, is_tp2 in self.model_groups: for model in model_group: with self.subTest(model=model): - process = popen_launch_server_wrapper( - self.base_url, model, is_fp8, is_tp2 - ) + process = popen_launch_server_wrapper(self.base_url, model, is_tp2) args = SimpleNamespace( base_url=self.base_url, diff --git a/test/srt/test_nightly_human_eval.py b/test/srt/test_nightly_human_eval.py deleted file mode 100644 index 2a1ea3b27..000000000 --- a/test/srt/test_nightly_human_eval.py +++ /dev/null @@ -1,105 +0,0 @@ -import os -import shutil -import signal -import subprocess -import unittest - -from test_nightly_gsm8k_eval import parse_models, popen_launch_server_wrapper - -from sglang.srt.utils import kill_process_tree -from sglang.test.test_utils import ( - DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1, - DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2, - DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1, - DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2, - DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, - CustomTestCase, - is_in_ci, -) - - -class TestNightlyHumanEval(CustomTestCase): - @classmethod - def setUpClass(cls): - if is_in_ci(): - cls.model_groups = [([DEFAULT_MODEL_NAME_FOR_TEST], False, False)] - else: - cls.model_groups = [ - (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False), - (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True), - ( - parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), - True, - False, - ), - (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True), - ] - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = None - cls.eval_process = None - - @classmethod - def tearDownClass(cls): - if cls.process: - kill_process_tree(cls.process.pid) - if cls.eval_process: - kill_process_tree(cls.eval_process.pid) - - def run_evalplus(self, model): - print("Delete evalplus results") - shutil.rmtree("evalplus_results", ignore_errors=True) - cmd = [ - "evalplus.evaluate", - "--model", - model, - "--dataset", - "humaneval", - "--backend", - "openai", - "--base-url", - "http://localhost:6157/v1", - "--greedy", - ] - - try: - self.eval_process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - preexec_fn=os.setsid, - ) - - stdout, stderr = self.eval_process.communicate(timeout=600) - - if self.eval_process.returncode != 0: - print(f"Fail to human eval model={model} err={stderr}") - - print("=" * 42) - print(stdout) - print("=" * 42) - except subprocess.TimeoutExpired: - if self.eval_process: - os.killpg(os.getpgid(self.eval_process.pid), signal.SIGTERM) - print(f"Timeout during evaluation for model={model}") - except Exception as e: - print(f"Error running evalplus for model={model} {str(e)}") - if self.eval_process: - os.killpg(os.getpgid(self.eval_process.pid), signal.SIGTERM) - - def test_human_eval_all_models(self): - for model_group, is_fp8, is_tp2 in self.model_groups: - for model in model_group: - # NOTE: only Llama for now - if "Llama" in model: - with self.subTest(model=model): - self.process = popen_launch_server_wrapper( - self.base_url, model, is_fp8, is_tp2 - ) - self.run_evalplus(model) - self.tearDownClass() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/srt/test_nightly_math_eval.py b/test/srt/test_nightly_math_eval.py deleted file mode 
100644
index 20db12454..000000000
--- a/test/srt/test_nightly_math_eval.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import unittest
-from types import SimpleNamespace
-
-from sglang.srt.utils import kill_process_tree
-from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import (
-    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    CustomTestCase,
-    popen_launch_server,
-)
-
-
-class TestEvalAccuracyLarge(CustomTestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=["--log-level-http", "warning"],
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        kill_process_tree(cls.process.pid)
-
-    def test_math(self):
-        args = SimpleNamespace(
-            base_url=self.base_url,
-            model=self.model,
-            eval_name="math",
-            num_examples=5000,
-            num_threads=1024,
-        )
-
-        metrics = run_eval(args)
-        self.assertGreaterEqual(
-            metrics["score"], 0.519 - 0.02
-        )  # -2% to account for sampling variance
-
-
-if __name__ == "__main__":
-    unittest.main()