diff --git a/test/srt/test_nightly_gsm8k_eval.py b/test/srt/test_nightly_gsm8k_eval.py index 7820f6825..2e379c111 100644 --- a/test/srt/test_nightly_gsm8k_eval.py +++ b/test/srt/test_nightly_gsm8k_eval.py @@ -26,8 +26,8 @@ MODEL_SCORE_THRESHOLDS = { "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85, "google/gemma-2-27b-it": 0.92, "meta-llama/Llama-3.1-70B-Instruct": 0.95, - "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64, - "Qwen/Qwen2-57B-A14B-Instruct": 0.88, + "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.63, + "Qwen/Qwen2-57B-A14B-Instruct": 0.87, "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.83, "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54, "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.84,