From d8ed60f254d0f235c0c76924e822257bf68a77e1 Mon Sep 17 00:00:00 2001 From: Shangming Cai Date: Wed, 20 Aug 2025 14:31:08 +0800 Subject: [PATCH] [CI] Fix disaggregation failure tolerance CI (#9378) Signed-off-by: Shangming Cai --- test/srt/test_disaggregation.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/test/srt/test_disaggregation.py b/test/srt/test_disaggregation.py index b325314a2..68848aade 100644 --- a/test/srt/test_disaggregation.py +++ b/test/srt/test_disaggregation.py @@ -323,9 +323,22 @@ class TestDisaggregationMooncakeFailure(CustomTestCase): host=f"http://{self.base_host}", port=int(self.lb_port), ) - metrics = run_eval_few_shot_gsm8k(args) - print(f"Evaluation metrics: {metrics}") + # Expect lots of failure but the server cannot crash + try: + metrics = run_eval_few_shot_gsm8k(args) + print(f"Evaluation metrics: {metrics}") + except Exception as e: + print(f"Test encountered expected errors: {e}") + # Check if servers are still healthy + try: + response = requests.get(self.prefill_url + "/health_generate") + assert response.status_code == 200 + response = requests.get(self.decode_url + "/health_generate") + assert response.status_code == 200 + except Exception as health_check_error: + # If health check fails, re-raise the original exception + raise e from health_check_error class TestDisaggregationMooncakeSpec(CustomTestCase):