fix: show failed models in nightly ci (#10986)
This commit is contained in:
@@ -66,9 +66,8 @@ class BenchmarkResult(BaseModel):
|
|||||||
def help_str() -> str:
|
def help_str() -> str:
|
||||||
return f"""
|
return f"""
|
||||||
Note: To view the traces through perfetto-ui, please:
|
Note: To view the traces through perfetto-ui, please:
|
||||||
1. use Google Chrome
|
1. open with Google Chrome
|
||||||
2. enable popup
|
2. allow popup
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def to_markdown_row(
|
def to_markdown_row(
|
||||||
|
|||||||
@@ -51,7 +51,6 @@ from sglang.utils import is_in_ci
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# Define constants
|
# Define constants
|
||||||
LOAD_FORMAT_CHOICES = [
|
LOAD_FORMAT_CHOICES = [
|
||||||
"auto",
|
"auto",
|
||||||
|
|||||||
@@ -1518,31 +1518,45 @@ def check_evaluation_test_results(
|
|||||||
summary = " | model | status | score | score_threshold | \n"
|
summary = " | model | status | score | score_threshold | \n"
|
||||||
summary += "| ----- | ------ | ----- | --------------- | \n"
|
summary += "| ----- | ------ | ----- | --------------- | \n"
|
||||||
|
|
||||||
for model, accuracy, latency in results:
|
results_dict = {res[0]: (res[1], res[2]) for res in results}
|
||||||
accuracy_threshold = model_accuracy_thresholds.get(model)
|
|
||||||
if accuracy_threshold is None:
|
|
||||||
print(f"Warning: No threshold defined for model {model}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
|
for model, accuracy_threshold in sorted(model_accuracy_thresholds.items()):
|
||||||
latency_threshold = (
|
latency_threshold = (
|
||||||
model_latency_thresholds.get(model, None)
|
model_latency_thresholds.get(model)
|
||||||
if model_latency_thresholds
|
if model_latency_thresholds is not None
|
||||||
else 1e9
|
else 1e9
|
||||||
)
|
)
|
||||||
|
|
||||||
is_success = accuracy >= accuracy_threshold and latency <= latency_threshold
|
if model in results_dict:
|
||||||
status_emoji = "✅" if is_success else "❌"
|
accuracy, latency = results_dict[model]
|
||||||
|
is_success = accuracy >= accuracy_threshold and latency <= latency_threshold
|
||||||
|
status_emoji = "✅" if is_success else "❌"
|
||||||
|
|
||||||
if not is_success:
|
if not is_success:
|
||||||
failed_models.append(
|
if accuracy < accuracy_threshold:
|
||||||
f"\nScore Check Failed: {model}\n"
|
failed_models.append(
|
||||||
f"Model {model} score ({accuracy:.4f}) is below threshold ({accuracy_threshold:.4f})"
|
f"\nScore Check Failed: {model}\n"
|
||||||
)
|
f"Model {model} score ({accuracy:.4f}) is below threshold ({accuracy_threshold:.4f})"
|
||||||
|
)
|
||||||
|
if latency > latency_threshold:
|
||||||
|
failed_models.append(
|
||||||
|
f"\nLatency Check Failed: {model}\n"
|
||||||
|
f"Model {model} latency ({latency:.4f}) is above threshold ({latency_threshold:.4f})"
|
||||||
|
)
|
||||||
|
|
||||||
if model_latency_thresholds is not None:
|
if model_latency_thresholds is not None:
|
||||||
line = f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold} | {latency} | {latency_threshold}\n"
|
line = f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold} | {latency} | {latency_threshold}\n"
|
||||||
|
else:
|
||||||
|
line = (
|
||||||
|
f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold}\n"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
line = f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold}\n"
|
status_emoji = "❌"
|
||||||
|
failed_models.append(f"Model failed to launch or be evaluated: {model}")
|
||||||
|
if model_latency_thresholds is not None:
|
||||||
|
line = f"| {model} | {status_emoji} | N/A | {accuracy_threshold} | N/A | {latency_threshold}\n"
|
||||||
|
else:
|
||||||
|
line = f"| {model} | {status_emoji} | N/A | {accuracy_threshold}\n"
|
||||||
|
|
||||||
summary += line
|
summary += line
|
||||||
|
|
||||||
@@ -1551,13 +1565,8 @@ def check_evaluation_test_results(
|
|||||||
if is_in_ci():
|
if is_in_ci():
|
||||||
write_github_step_summary(f"## {test_name}\n{summary}")
|
write_github_step_summary(f"## {test_name}\n{summary}")
|
||||||
|
|
||||||
some_model_failed_to_get_result = len(results) != (
|
if failed_models:
|
||||||
model_count or len(model_accuracy_thresholds)
|
print("Some models failed the evaluation.")
|
||||||
)
|
|
||||||
if some_model_failed_to_get_result:
|
|
||||||
print("Some model has failed to launch and be evaluated")
|
|
||||||
|
|
||||||
if failed_models or some_model_failed_to_get_result:
|
|
||||||
raise AssertionError("\n".join(failed_models))
|
raise AssertionError("\n".join(failed_models))
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ MODEL_SCORE_THRESHOLDS = {
|
|||||||
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85,
|
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85,
|
||||||
"google/gemma-2-27b-it": 0.91,
|
"google/gemma-2-27b-it": 0.91,
|
||||||
"meta-llama/Llama-3.1-70B-Instruct": 0.95,
|
"meta-llama/Llama-3.1-70B-Instruct": 0.95,
|
||||||
"mistralai/Mixtral-8x7B-Instruct-v0.1": 0.62,
|
"mistralai/Mixtral-8x7B-Instruct-v0.1": 0.616,
|
||||||
"Qwen/Qwen2-57B-A14B-Instruct": 0.86,
|
"Qwen/Qwen2-57B-A14B-Instruct": 0.86,
|
||||||
"neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.83,
|
"neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.83,
|
||||||
"neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
|
"neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ MODEL_THRESHOLDS = {
|
|||||||
),
|
),
|
||||||
ModelDeploySetup("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
|
ModelDeploySetup("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
|
||||||
ModelDeploySetup("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3),
|
ModelDeploySetup("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3),
|
||||||
ModelDeploySetup("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 14.5),
|
ModelDeploySetup("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6),
|
||||||
ModelDeploySetup("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(0.330, 22.3),
|
ModelDeploySetup("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(0.330, 22.3),
|
||||||
ModelDeploySetup("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
|
ModelDeploySetup("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
|
||||||
ModelDeploySetup("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),
|
ModelDeploySetup("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),
|
||||||
|
|||||||
Reference in New Issue
Block a user