Disable flaky eagle tests (#5753)

This commit is contained in:
Lianmin Zheng
2025-04-25 15:54:39 -07:00
committed by GitHub
parent 5641a09458
commit 21514ff5bd
5 changed files with 7 additions and 47 deletions

View File

@@ -71,5 +71,5 @@ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor
For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai. For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
## Acknowledgment and Citation ## Acknowledgment
We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful. We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).

View File

@@ -279,9 +279,9 @@ class CudaGraphRunner:
f"Capture cuda graph failed: {e}\n" f"Capture cuda graph failed: {e}\n"
"Possible solutions:\n" "Possible solutions:\n"
"1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n" "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
"2. set --cuda-graph-max-bs to a smaller value (e.g., 32)\n" "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
"3. disable torch compile by not using --enable-torch-compile\n" "3. disable torch compile by not using --enable-torch-compile\n"
"4. disable cuda graph by --disable-cuda-graph\n" "4. disable cuda graph by --disable-cuda-graph. (Not recommended. Huge perf loss)\n"
"Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n" "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
) )

View File

@@ -955,12 +955,6 @@ class ModelRunner:
return return
if self.server_args.disable_cuda_graph: if self.server_args.disable_cuda_graph:
logger.warning(
"\n\nCUDA Graph is DISABLED.\n"
"This will cause significant performance degradation.\n"
"CUDA Graph should almost never be disabled in most usage scenarios.\n"
"If you encounter OOM issues, please try setting --mem-fraction-static to a lower value (such as 0.8 or 0.7) instead of disabling CUDA Graph.\n"
)
return return
tic = time.time() tic = time.time()

View File

@@ -85,9 +85,9 @@ class EAGLEDraftCudaGraphRunner:
f"Capture cuda graph failed: {e}\n" f"Capture cuda graph failed: {e}\n"
"Possible solutions:\n" "Possible solutions:\n"
"1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n" "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
"2. disable torch compile by not using --enable-torch-compile\n" "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
"3. specify --dtype to the same dtype (e.g. bfloat16)\n" "3. disable torch compile by not using --enable-torch-compile\n"
"4. disable cuda graph by --disable-cuda-graph\n" "4. disable cuda graph by --disable-cuda-graph. (Not recommended. Huge perf loss)\n"
"Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n" "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
) )

View File

@@ -1,5 +1,4 @@
import json import json
import multiprocessing as mp
import os import os
import random import random
import threading import threading
@@ -8,7 +7,6 @@ import unittest
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from functools import partial from functools import partial
from types import SimpleNamespace from types import SimpleNamespace
from typing import List, Optional
import numpy as np import numpy as np
import requests import requests
@@ -18,7 +16,6 @@ import sglang as sgl
from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.srt.utils import kill_process_tree from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval from sglang.test.few_shot_gsm8k import run_eval
from sglang.test.runners import DEFAULT_PROMPTS, SRTRunner
from sglang.test.test_utils import ( from sglang.test.test_utils import (
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST, DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST, DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
@@ -541,36 +538,5 @@ class TestEAGLEServerTriton(TestEAGLEServer):
) )
class TestEAGLEServerPageSize(TestEAGLEServer):
@classmethod
def setUpClass(cls):
cls.base_url = DEFAULT_URL_FOR_TEST
cls.process = popen_launch_server(
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
cls.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
other_args=[
"--speculative-algorithm",
"EAGLE",
"--speculative-draft-model-path",
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
"--speculative-num-steps",
5,
"--speculative-eagle-topk",
1,
"--speculative-num-draft-tokens",
6,
"--mem-fraction-static",
0.7,
"--chunked-prefill-size",
128,
"--max-running-requests",
8,
"--page-size",
8,
],
)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()