Disable flaky eagle tests (#5753)
This commit is contained in:
@@ -71,5 +71,5 @@ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor
|
|||||||
|
|
||||||
For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
|
For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
|
||||||
|
|
||||||
## Acknowledgment and Citation
|
## Acknowledgment
|
||||||
We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
|
We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
|
||||||
|
|||||||
@@ -279,9 +279,9 @@ class CudaGraphRunner:
|
|||||||
f"Capture cuda graph failed: {e}\n"
|
f"Capture cuda graph failed: {e}\n"
|
||||||
"Possible solutions:\n"
|
"Possible solutions:\n"
|
||||||
"1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
|
"1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
|
||||||
"2. set --cuda-graph-max-bs to a smaller value (e.g., 32)\n"
|
"2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
|
||||||
"3. disable torch compile by not using --enable-torch-compile\n"
|
"3. disable torch compile by not using --enable-torch-compile\n"
|
||||||
"4. disable cuda graph by --disable-cuda-graph\n"
|
"4. disable cuda graph by --disable-cuda-graph. (Not recommended. Huge perf loss)\n"
|
||||||
"Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
|
"Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -955,12 +955,6 @@ class ModelRunner:
|
|||||||
return
|
return
|
||||||
|
|
||||||
if self.server_args.disable_cuda_graph:
|
if self.server_args.disable_cuda_graph:
|
||||||
logger.warning(
|
|
||||||
"\n\nCUDA Graph is DISABLED.\n"
|
|
||||||
"This will cause significant performance degradation.\n"
|
|
||||||
"CUDA Graph should almost never be disabled in most usage scenarios.\n"
|
|
||||||
"If you encounter OOM issues, please try setting --mem-fraction-static to a lower value (such as 0.8 or 0.7) instead of disabling CUDA Graph.\n"
|
|
||||||
)
|
|
||||||
return
|
return
|
||||||
|
|
||||||
tic = time.time()
|
tic = time.time()
|
||||||
|
|||||||
@@ -85,9 +85,9 @@ class EAGLEDraftCudaGraphRunner:
|
|||||||
f"Capture cuda graph failed: {e}\n"
|
f"Capture cuda graph failed: {e}\n"
|
||||||
"Possible solutions:\n"
|
"Possible solutions:\n"
|
||||||
"1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
|
"1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
|
||||||
"2. disable torch compile by not using --enable-torch-compile\n"
|
"2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
|
||||||
"3. specify --dtype to the same dtype (e.g. bfloat16)\n"
|
"3. disable torch compile by not using --enable-torch-compile\n"
|
||||||
"4. disable cuda graph by --disable-cuda-graph\n"
|
"4. disable cuda graph by --disable-cuda-graph. (Not recommended. Huge perf loss)\n"
|
||||||
"Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
|
"Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
import json
|
import json
|
||||||
import multiprocessing as mp
|
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
import threading
|
import threading
|
||||||
@@ -8,7 +7,6 @@ import unittest
|
|||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from types import SimpleNamespace
|
from types import SimpleNamespace
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import requests
|
import requests
|
||||||
@@ -18,7 +16,6 @@ import sglang as sgl
|
|||||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||||
from sglang.srt.utils import kill_process_tree
|
from sglang.srt.utils import kill_process_tree
|
||||||
from sglang.test.few_shot_gsm8k import run_eval
|
from sglang.test.few_shot_gsm8k import run_eval
|
||||||
from sglang.test.runners import DEFAULT_PROMPTS, SRTRunner
|
|
||||||
from sglang.test.test_utils import (
|
from sglang.test.test_utils import (
|
||||||
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
|
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
|
||||||
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
|
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
|
||||||
@@ -541,36 +538,5 @@ class TestEAGLEServerTriton(TestEAGLEServer):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class TestEAGLEServerPageSize(TestEAGLEServer):
|
|
||||||
@classmethod
|
|
||||||
def setUpClass(cls):
|
|
||||||
cls.base_url = DEFAULT_URL_FOR_TEST
|
|
||||||
cls.process = popen_launch_server(
|
|
||||||
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
|
|
||||||
cls.base_url,
|
|
||||||
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
|
||||||
other_args=[
|
|
||||||
"--speculative-algorithm",
|
|
||||||
"EAGLE",
|
|
||||||
"--speculative-draft-model-path",
|
|
||||||
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
|
|
||||||
"--speculative-num-steps",
|
|
||||||
5,
|
|
||||||
"--speculative-eagle-topk",
|
|
||||||
1,
|
|
||||||
"--speculative-num-draft-tokens",
|
|
||||||
6,
|
|
||||||
"--mem-fraction-static",
|
|
||||||
0.7,
|
|
||||||
"--chunked-prefill-size",
|
|
||||||
128,
|
|
||||||
"--max-running-requests",
|
|
||||||
8,
|
|
||||||
"--page-size",
|
|
||||||
8,
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
Reference in New Issue
Block a user