From aa957102a9ac1f7c528a46f254f44fb37e872b88 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 10 Mar 2025 01:24:22 -0700 Subject: [PATCH] Simplify tests & Fix trtllm custom allreduce registration (#4252) --- .github/workflows/pr-test.yml | 2 +- python/sglang/srt/_custom_ops.py | 6 +- .../layers/attention/flashinfer_backend.py | 2 +- python/sglang/srt/model_loader/loader.py | 3 +- sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu | 5 +- sgl-kernel/csrc/torch_extension.cc | 7 +- sgl-kernel/tests/test_rotary_embedding.py | 2 - test/srt/run_suite.py | 22 +++--- test/srt/test_bench_one_batch.py | 7 +- ...est_eval_accuracy_large_chunked_prefill.py | 68 ----------------- ...al_accuracy_large_mixed_chunked_prefill.py | 74 ------------------- test/srt/test_eval_accuracy_mini.py | 42 ----------- test/srt/test_gptqmodel_dynamic.py | 1 + 13 files changed, 30 insertions(+), 211 deletions(-) delete mode 100644 test/srt/test_eval_accuracy_large_chunked_prefill.py delete mode 100644 test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py delete mode 100644 test/srt/test_eval_accuracy_mini.py diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 3b33b319d..265e1374b 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -266,7 +266,7 @@ jobs: cd test/srt python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 - USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 + # USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 - name: Benchmark single latency + torch.compile (TP=2) timeout-minutes: 10 diff --git a/python/sglang/srt/_custom_ops.py b/python/sglang/srt/_custom_ops.py index d06765c3a..0584dc80f 100644 --- a/python/sglang/srt/_custom_ops.py +++ b/python/sglang/srt/_custom_ops.py @@ -6,10 +6,12 @@ from typing import List, Tuple import torch import torch.library -from sglang.srt.utils import is_hip, 
is_hpu +from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu logger = logging.getLogger(__name__) -use_vllm_custom_allreduce = os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", default=True) +use_vllm_custom_allreduce = get_bool_env_var( + "USE_VLLM_CUSTOM_ALLREDUCE", default="true" +) if not is_hpu(): # ROCm does not use vllm custom allreduce diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py index af47f2ce2..791cbeec0 100644 --- a/python/sglang/srt/layers/attention/flashinfer_backend.py +++ b/python/sglang/srt/layers/attention/flashinfer_backend.py @@ -22,7 +22,7 @@ from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_trito from sglang.srt.layers.dp_attention import get_attention_tp_size from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput -from sglang.srt.utils import is_flashinfer_available +from sglang.srt.utils import get_bool_env_var, is_flashinfer_available if TYPE_CHECKING: from sglang.srt.layers.radix_attention import RadixAttention diff --git a/python/sglang/srt/model_loader/loader.py b/python/sglang/srt/model_loader/loader.py index eff4aa5f3..c241fd9d6 100644 --- a/python/sglang/srt/model_loader/loader.py +++ b/python/sglang/srt/model_loader/loader.py @@ -48,6 +48,7 @@ from sglang.srt.model_loader.weight_utils import ( safetensors_weights_iterator, ) from sglang.srt.utils import ( + get_bool_env_var, get_device_capability, is_pin_memory_available, set_weight_attrs, @@ -197,7 +198,7 @@ class DefaultModelLoader(BaseModelLoader): Returns the path to the downloaded model, or None if the model is not downloaded from ModelScope.""" - if os.environ.get("SGLANG_USE_MODELSCOPE", None) == "True": + if get_bool_env_var("SGLANG_USE_MODELSCOPE"): # download model from ModelScope hub, # lazy import so that modelscope is not required for normal use. 
# pylint: disable=C. diff --git a/sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu b/sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu index d0a80c7bf..2add0826f 100644 --- a/sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu +++ b/sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu @@ -100,7 +100,6 @@ void cublas_grouped_gemm( check_device_dtype(out_dtype, inputs); check_device_dtype(out_dtype, weights); check_device_dtype(out_dtype, outputs); - cudaDataType_t cuda_data_type = (out_dtype == torch::kHalf ? CUDA_R_16F : CUDA_R_16BF); // Weights should be transposed to (n, k) of column major std::vector transa_array(group_count, CUBLAS_OP_T); @@ -132,7 +131,6 @@ void cublas_grouped_gemm( std::vector b_array = get_tensor_ptrs(inputs); std::vector c_array = get_tensor_ptrs(outputs); - auto handle = reinterpret_cast(cublas_handle); auto stream = reinterpret_cast(cuda_stream); // Should allocate tensors for storage of pointers @@ -141,6 +139,9 @@ void cublas_grouped_gemm( torch::Tensor d_c = create_ptr_pointer(c_array, stream); #if defined CUDA_VERSION && CUDA_VERSION >= 12050 + auto handle = reinterpret_cast(cublas_handle); + cudaDataType_t cuda_data_type = (out_dtype == torch::kHalf ? CUDA_R_16F : CUDA_R_16BF); + auto status = cublasGemmGroupedBatchedEx( handle, transa_array.data(), diff --git a/sgl-kernel/csrc/torch_extension.cc b/sgl-kernel/csrc/torch_extension.cc index 1304915bf..d8bd89917 100644 --- a/sgl-kernel/csrc/torch_extension.cc +++ b/sgl-kernel/csrc/torch_extension.cc @@ -32,11 +32,8 @@ TORCH_LIBRARY_EXPAND(sgl_kernel, m) { m.def("all_reduce(int fa, Tensor inp, Tensor! 
out) -> ()"); m.impl("all_reduce", torch::kCUDA, &all_reduce); - m.def("get_graph_buffer_ipc_meta(int fa) -> (int[], int[])"); - m.impl("get_graph_buffer_ipc_meta", torch::kCUDA, &get_graph_buffer_ipc_meta); - - m.def("register_graph_buffers(int fa, int[][] handles, int[][] offsets) -> ()"); - m.impl("register_graph_buffers", torch::kCUDA, &register_graph_buffers); + m.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta); + m.def("register_graph_buffers", &register_graph_buffers); /* * From csrc/attention diff --git a/sgl-kernel/tests/test_rotary_embedding.py b/sgl-kernel/tests/test_rotary_embedding.py index b7a141404..fa937a604 100644 --- a/sgl-kernel/tests/test_rotary_embedding.py +++ b/sgl-kernel/tests/test_rotary_embedding.py @@ -1,9 +1,7 @@ -import math from typing import Any, Dict, List, Optional, Tuple, Union import pytest import torch -import torch.nn as nn from sgl_kernel import apply_rope_with_cos_sin_cache_inplace diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index b9e36e232..590c46136 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -20,30 +20,33 @@ suites = { TestFile("models/test_generation_models.py", 103), TestFile("models/test_qwen_models.py", 82), TestFile("models/test_reward_models.py", 83), - TestFile("test_gptqmodel_dynamic.py", 72), TestFile("models/test_gme_qwen_models.py", 45), TestFile("test_abort.py", 51), + TestFile("test_block_int8.py", 22), TestFile("test_chunked_prefill.py", 336), - TestFile("test_custom_allreduce.py", 1), - TestFile("test_double_sparsity.py", 50), TestFile("test_eagle_infer.py", 447), + TestFile("test_ebnf_constrained.py"), + TestFile("test_fp8_kernel.py", 2), TestFile("test_embedding_openai_server.py", 36), - TestFile("test_eval_accuracy_mini.py", 63), TestFile("test_gguf.py", 78), + TestFile("test_gptqmodel_dynamic.py", 72), + TestFile("test_hidden_states.py", 55), + TestFile("test_int8_kernel.py", 1), TestFile("test_input_embeddings.py", 38), + TestFile("test_json_constrained.py", 
98), + TestFile("test_large_max_new_tokens.py", 41), + TestFile("test_metrics.py", 32), TestFile("test_mla.py", 92), TestFile("test_mla_deepseek_v3.py", 221), TestFile("test_mla_flashinfer.py", 395), TestFile("test_mla_fp8.py", 93), - TestFile("test_json_constrained.py", 98), - TestFile("test_large_max_new_tokens.py", 41), - TestFile("test_metrics.py", 32), TestFile("test_no_chunked_prefill.py", 126), TestFile("test_no_overlap_scheduler.py", 262), TestFile("test_openai_server.py", 124), TestFile("test_penalty.py", 41), TestFile("test_pytorch_sampling_backend.py", 66), TestFile("test_radix_attention.py", 167), + TestFile("test_reasoning_content.py", 89), TestFile("test_regex_constrained.py", 64), TestFile("test_release_memory_occupation.py", 44), TestFile("test_request_length_validation.py", 31), @@ -58,7 +61,6 @@ suites = { TestFile("test_torchao.py", 70), TestFile("test_triton_attention_kernels.py", 4), TestFile("test_triton_attention_backend.py", 134), - TestFile("test_hidden_states.py", 55), TestFile("test_update_weights_from_disk.py", 114), TestFile("test_update_weights_from_tensor.py", 48), TestFile("test_vertex_endpoint.py", 31), @@ -66,10 +68,6 @@ suites = { TestFile("test_vision_llm.py", 18.4), TestFile("test_vision_openai_server.py", 344), TestFile("test_w8a8_quantization.py", 46), - TestFile("test_fp8_kernel.py", 2), - TestFile("test_block_int8.py", 22), - TestFile("test_int8_kernel.py", 1), - TestFile("test_reasoning_content.py", 89), ], "nightly": [ TestFile("test_nightly_gsm8k_eval.py"), diff --git a/test/srt/test_bench_one_batch.py b/test/srt/test_bench_one_batch.py index f4140b89f..e015da6a1 100644 --- a/test/srt/test_bench_one_batch.py +++ b/test/srt/test_bench_one_batch.py @@ -3,6 +3,7 @@ import unittest from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, DEFAULT_MOE_MODEL_NAME_FOR_TEST, + get_bool_env_var, is_in_ci, run_bench_one_batch, write_github_step_summary, @@ -27,9 +28,13 @@ class TestBenchOneBatch(unittest.TestCase): 
DEFAULT_MOE_MODEL_NAME_FOR_TEST, ["--tp", "2", "--cuda-graph-max-bs", "2"] ) + use_vllm_custom_allreduce = get_bool_env_var( + "USE_VLLM_CUSTOM_ALLREDUCE", default="true" + ) + if is_in_ci(): write_github_step_summary( - f"### test_moe_tp2_bs1\n" + f"### test_moe_tp2_bs1 ({use_vllm_custom_allreduce=})\n" f"output_throughput : {output_throughput:.2f} token/s\n" ) self.assertGreater(output_throughput, 124) diff --git a/test/srt/test_eval_accuracy_large_chunked_prefill.py b/test/srt/test_eval_accuracy_large_chunked_prefill.py deleted file mode 100644 index c8ce5cff2..000000000 --- a/test/srt/test_eval_accuracy_large_chunked_prefill.py +++ /dev/null @@ -1,68 +0,0 @@ -import unittest -from types import SimpleNamespace - -from sglang.srt.utils import kill_process_tree -from sglang.test.run_eval import run_eval -from sglang.test.test_utils import ( - DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - DEFAULT_URL_FOR_TEST, - popen_launch_server, -) - - -class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=["--log-level-http", "warning", "--chunked-prefill-size", "256"], - ) - - @classmethod - def tearDownClass(cls): - kill_process_tree(cls.process.pid) - - def test_mmlu(self): - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="mmlu", - num_examples=3000, - num_threads=1024, - ) - - metrics = run_eval(args) - assert metrics["score"] >= 0.705, f"{metrics}" - - def test_human_eval(self): - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="humaneval", - num_examples=None, - num_threads=1024, - ) - - metrics = run_eval(args) - assert metrics["score"] >= 0.64, f"{metrics}" - - def test_mgsm_en(self): - args = SimpleNamespace( - 
base_url=self.base_url, - model=self.model, - eval_name="mgsm_en", - num_examples=None, - num_threads=1024, - ) - - metrics = run_eval(args) - assert metrics["score"] >= 0.84, f"{metrics}" - - -if __name__ == "__main__": - unittest.main() diff --git a/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py b/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py deleted file mode 100644 index 3bc115874..000000000 --- a/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py +++ /dev/null @@ -1,74 +0,0 @@ -import unittest -from types import SimpleNamespace - -from sglang.srt.utils import kill_process_tree -from sglang.test.run_eval import run_eval -from sglang.test.test_utils import ( - DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - DEFAULT_URL_FOR_TEST, - popen_launch_server, -) - - -class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--log-level-http", - "warning", - "--chunked-prefill-size", - "256", - "--enable-mixed-chunk", - ], - ) - - @classmethod - def tearDownClass(cls): - kill_process_tree(cls.process.pid) - - def test_mmlu(self): - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="mmlu", - num_examples=3000, - num_threads=1024, - ) - - metrics = run_eval(args) - assert metrics["score"] >= 0.705, f"{metrics}" - - def test_human_eval(self): - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="humaneval", - num_examples=None, - num_threads=1024, - ) - - metrics = run_eval(args) - assert metrics["score"] >= 0.64, f"{metrics}" - - def test_mgsm_en(self): - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="mgsm_en", - num_examples=None, - num_threads=1024, - ) - - metrics = 
run_eval(args) - assert metrics["score"] >= 0.84, f"{metrics}" - - -if __name__ == "__main__": - unittest.main() diff --git a/test/srt/test_eval_accuracy_mini.py b/test/srt/test_eval_accuracy_mini.py deleted file mode 100644 index a008c3869..000000000 --- a/test/srt/test_eval_accuracy_mini.py +++ /dev/null @@ -1,42 +0,0 @@ -import unittest -from types import SimpleNamespace - -from sglang.srt.utils import kill_process_tree -from sglang.test.run_eval import run_eval -from sglang.test.test_utils import ( - DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - DEFAULT_URL_FOR_TEST, - popen_launch_server, -) - - -class TestEvalAccuracyMini(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH - ) - - @classmethod - def tearDownClass(cls): - kill_process_tree(cls.process.pid) - - def test_mmlu(self): - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="mmlu", - num_examples=64, - num_threads=32, - temperature=0.1, - ) - - metrics = run_eval(args) - self.assertGreaterEqual(metrics["score"], 0.65) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/srt/test_gptqmodel_dynamic.py b/test/srt/test_gptqmodel_dynamic.py index 92e17a8e4..c9145fe6f 100644 --- a/test/srt/test_gptqmodel_dynamic.py +++ b/test/srt/test_gptqmodel_dynamic.py @@ -129,6 +129,7 @@ class TestGPTQModelDynamic(unittest.TestCase): "text": "The capital of France is", "sampling_params": { "max_new_tokens": max_new_tokens, + "temperature": 0.001, }, }, )