Improve stack trace of retry errors (#4845)

This commit is contained in:
fzyzcjy
2025-03-29 23:21:31 +08:00
committed by GitHub
parent b1cfb4e972
commit 8690c40bb0
2 changed files with 36 additions and 22 deletions

View File

@@ -35,6 +35,7 @@ import sys
import tempfile import tempfile
import threading import threading
import time import time
import traceback
import warnings import warnings
from contextlib import contextmanager from contextlib import contextmanager
from functools import lru_cache from functools import lru_cache
@@ -1766,3 +1767,32 @@ def parse_connector_type(url: str) -> str:
return "" return ""
return m.group(1) return m.group(1)
def retry(
    fn,
    max_retry: int,
    initial_delay: float = 2.0,
    max_delay: float = 60.0,
    should_retry: Callable[[Any], bool] = lambda e: True,
):
    """Call ``fn`` until it succeeds, retrying with jittered exponential backoff.

    Args:
        fn: Zero-argument callable to invoke.
        max_retry: Number of retries allowed after the first attempt, so
            ``fn`` is called at most ``max_retry + 1`` times.
        initial_delay: Base delay in seconds used before the first retry.
        max_delay: Upper bound in seconds on the delay before jitter is applied.
        should_retry: Predicate that receives the caught exception; when it
            returns a falsy value, retrying stops immediately.

    Returns:
        Whatever ``fn`` returns on its first successful call.

    Raises:
        Exception: When the retry budget is exhausted, or when
            ``should_retry`` rejects the error. The underlying error is
            attached as ``__cause__`` so its stack trace is preserved.
    """
    for try_index in itertools.count():
        try:
            return fn()
        except Exception as e:
            if try_index >= max_retry:
                # Chain explicitly so the real failure is reported as the cause.
                raise Exception("retry() exceed maximum number of retries.") from e

            if not should_retry(e):
                raise Exception(
                    "retry() observe errors that should not be retried."
                ) from e

            # Exponential backoff capped at max_delay, scaled by a random
            # factor in [0.75, 1.0) to spread out concurrent retries.
            delay = min(initial_delay * (2**try_index), max_delay) * (
                0.75 + 0.25 * random.random()
            )

            logger.warning(
                f"retry() failed once ({try_index}th try, maximum {max_retry} retries). Will delay {delay:.2f}s and retry. Error: {e}"
            )
            traceback.print_exc()

            time.sleep(delay)

View File

@@ -25,7 +25,7 @@ from sglang.bench_serving import run_benchmark
from sglang.global_config import global_config from sglang.global_config import global_config
from sglang.lang.backend.openai import OpenAI from sglang.lang.backend.openai import OpenAI
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
from sglang.srt.utils import get_bool_env_var, kill_process_tree from sglang.srt.utils import get_bool_env_var, kill_process_tree, retry
from sglang.test.run_eval import run_eval from sglang.test.run_eval import run_eval
from sglang.utils import get_exception_traceback from sglang.utils import get_exception_traceback
@@ -1010,26 +1010,10 @@ def run_logprob_check(self: unittest.TestCase, arg: Tuple):
class CustomTestCase(unittest.TestCase): class CustomTestCase(unittest.TestCase):
def _callTestMethod(self, method): def _callTestMethod(self, method):
_retry_execution( max_retry = int(
os.environ.get("SGLANG_TEST_MAX_RETRY", "2" if is_in_ci() else "0")
)
retry(
lambda: super(CustomTestCase, self)._callTestMethod(method), lambda: super(CustomTestCase, self)._callTestMethod(method),
max_retry=_get_max_retry(), max_retry=max_retry,
) )
def _get_max_retry():
    """Return the test retry budget as an int.

    Reads the ``SGLANG_TEST_MAX_RETRY`` environment variable; when it is not
    set, falls back to "2" if ``is_in_ci()`` is truthy and "0" otherwise.
    """
    fallback = "2" if is_in_ci() else "0"
    configured = os.environ.get("SGLANG_TEST_MAX_RETRY", fallback)
    return int(configured)
def _retry_execution(fn, max_retry: int):
if max_retry == 0:
fn()
return
try:
fn()
except Exception as e:
print(
f"retry_execution failed once and will retry. This may be an error or a flaky test. Error: {e}"
)
traceback.print_exc()
_retry_execution(fn, max_retry=max_retry - 1)