Improve stack trace of retry errors (#4845)
This commit is contained in:
@@ -35,6 +35,7 @@ import sys
|
|||||||
import tempfile
|
import tempfile
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
|
import traceback
|
||||||
import warnings
|
import warnings
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
@@ -1766,3 +1767,32 @@ def parse_connector_type(url: str) -> str:
|
|||||||
return ""
|
return ""
|
||||||
|
|
||||||
return m.group(1)
|
return m.group(1)
|
||||||
|
|
||||||
|
|
||||||
|
def retry(
    fn,
    max_retry: int,
    initial_delay: float = 2.0,
    max_delay: float = 60.0,
    should_retry: Callable[[Any], bool] = lambda e: True,
):
    """Call ``fn`` until it succeeds, backing off exponentially between tries.

    Args:
        fn: Zero-argument callable to invoke.
        max_retry: Number of retries allowed after the first attempt. With
            ``0`` (or a negative value) ``fn`` is attempted exactly once.
        initial_delay: Base delay in seconds; doubled after every failure.
        max_delay: Upper bound in seconds applied to the delay before jitter.
        should_retry: Predicate that receives the caught exception. A falsy
            result aborts immediately instead of retrying.

    Returns:
        Whatever ``fn`` returns on its first successful call.

    Raises:
        Exception: When the retry budget is exhausted or ``should_retry``
            rejects the error. The exception raised by ``fn`` is attached as
            ``__cause__`` so the real failure shows up in the stack trace.
            Only ``Exception`` subclasses raised by ``fn`` are intercepted;
            other ``BaseException`` types propagate unchanged.
    """
    for try_index in itertools.count():
        try:
            return fn()
        except Exception as e:
            if try_index >= max_retry:
                raise Exception("retry() exceed maximum number of retries.") from e

            if not should_retry(e):
                raise Exception(
                    "retry() observe errors that should not be retried."
                ) from e

            # Cap the exponent: an int 2**n with n > ~1023 cannot be converted
            # to float, so multiplying it by a float delay would raise
            # OverflowError. Below the cap the result is unchanged, and a
            # float product that overflows becomes inf, which min() clamps.
            backoff = initial_delay * (2 ** min(try_index, 1000))
            # Jitter scales the capped delay by a factor in [0.75, 1.0).
            delay = min(backoff, max_delay) * (0.75 + 0.25 * random.random())

            logger.warning(
                f"retry() failed once ({try_index}th try, maximum {max_retry} retries). Will delay {delay:.2f}s and retry. Error: {e}"
            )
            traceback.print_exc()

            time.sleep(delay)
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ from sglang.bench_serving import run_benchmark
|
|||||||
from sglang.global_config import global_config
|
from sglang.global_config import global_config
|
||||||
from sglang.lang.backend.openai import OpenAI
|
from sglang.lang.backend.openai import OpenAI
|
||||||
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
|
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
|
||||||
from sglang.srt.utils import get_bool_env_var, kill_process_tree
|
from sglang.srt.utils import get_bool_env_var, kill_process_tree, retry
|
||||||
from sglang.test.run_eval import run_eval
|
from sglang.test.run_eval import run_eval
|
||||||
from sglang.utils import get_exception_traceback
|
from sglang.utils import get_exception_traceback
|
||||||
|
|
||||||
@@ -1010,26 +1010,10 @@ def run_logprob_check(self: unittest.TestCase, arg: Tuple):
|
|||||||
|
|
||||||
class CustomTestCase(unittest.TestCase):
|
class CustomTestCase(unittest.TestCase):
|
||||||
def _callTestMethod(self, method):
|
def _callTestMethod(self, method):
|
||||||
_retry_execution(
|
max_retry = int(
|
||||||
|
os.environ.get("SGLANG_TEST_MAX_RETRY", "2" if is_in_ci() else "0")
|
||||||
|
)
|
||||||
|
retry(
|
||||||
lambda: super(CustomTestCase, self)._callTestMethod(method),
|
lambda: super(CustomTestCase, self)._callTestMethod(method),
|
||||||
max_retry=_get_max_retry(),
|
max_retry=max_retry,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _get_max_retry():
    """Return the retry count for test methods as an ``int``.

    The ``SGLANG_TEST_MAX_RETRY`` environment variable wins when it is set.
    Otherwise the fallback is ``2`` when ``is_in_ci()`` is truthy and ``0``
    when it is not.
    """
    # Computed up front, so is_in_ci() runs even when the variable is set.
    fallback = "2" if is_in_ci() else "0"
    raw_value = os.environ.get("SGLANG_TEST_MAX_RETRY", fallback)
    return int(raw_value)
|
|
||||||
|
|
||||||
|
|
||||||
def _retry_execution(fn, max_retry: int):
|
|
||||||
if max_retry == 0:
|
|
||||||
fn()
|
|
||||||
return
|
|
||||||
|
|
||||||
try:
|
|
||||||
fn()
|
|
||||||
except Exception as e:
|
|
||||||
print(
|
|
||||||
f"retry_execution failed once and will retry. This may be an error or a flaky test. Error: {e}"
|
|
||||||
)
|
|
||||||
traceback.print_exc()
|
|
||||||
_retry_execution(fn, max_retry=max_retry - 1)
|
|
||||||
|
|||||||
Reference in New Issue
Block a user