[CI] Fix disaggregation failure tolerance CI (#9378)
Signed-off-by: Shangming Cai <csmthu@gmail.com>
This commit is contained in:
@@ -323,9 +323,22 @@ class TestDisaggregationMooncakeFailure(CustomTestCase):
|
||||
host=f"http://{self.base_host}",
|
||||
port=int(self.lb_port),
|
||||
)
|
||||
metrics = run_eval_few_shot_gsm8k(args)
|
||||
print(f"Evaluation metrics: {metrics}")
|
||||
|
||||
# Expect many request failures here; that is acceptable, but the servers must not crash
|
||||
try:
|
||||
metrics = run_eval_few_shot_gsm8k(args)
|
||||
print(f"Evaluation metrics: {metrics}")
|
||||
except Exception as e:
|
||||
print(f"Test encountered expected errors: {e}")
|
||||
# Check if servers are still healthy
|
||||
try:
|
||||
response = requests.get(self.prefill_url + "/health_generate")
|
||||
assert response.status_code == 200
|
||||
response = requests.get(self.decode_url + "/health_generate")
|
||||
assert response.status_code == 200
|
||||
except Exception as health_check_error:
|
||||
# If a health check fails, re-raise the original evaluation error, chained to the health-check failure
|
||||
raise e from health_check_error
|
||||
|
||||
|
||||
class TestDisaggregationMooncakeSpec(CustomTestCase):
|
||||
|
||||
Reference in New Issue
Block a user