From d4f0fc211c7ab43f6a1300cffb45d372d13bc96d Mon Sep 17 00:00:00 2001 From: ai-modelscope Date: Fri, 28 Feb 2025 11:41:08 +0800 Subject: [PATCH] Adding Evaluation Results (#1) - Adding Evaluation Results (062c0ffd4e50b328fdcc3bb061eafad12b3e51f2) --- README.md | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 114 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 95d2e50..b7ac1e3 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,105 @@ tags: - trl - coder - 7B +model-index: +- name: Viper-Coder-HybridMini-v1.3 + results: + - task: + type: text-generation + name: Text Generation + dataset: + name: IFEval (0-Shot) + type: wis-k/instruction-following-eval + split: train + args: + num_few_shot: 0 + metrics: + - type: inst_level_strict_acc and prompt_level_strict_acc + value: 61.04 + name: averaged accuracy + source: + url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=prithivMLmods%2FViper-Coder-HybridMini-v1.3 + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: BBH (3-Shot) + type: SaylorTwift/bbh + split: test + args: + num_few_shot: 3 + metrics: + - type: acc_norm + value: 33.67 + name: normalized accuracy + source: + url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=prithivMLmods%2FViper-Coder-HybridMini-v1.3 + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: MATH Lvl 5 (4-Shot) + type: lighteval/MATH-Hard + split: test + args: + num_few_shot: 4 + metrics: + - type: exact_match + value: 46.3 + name: exact match + source: + url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=prithivMLmods%2FViper-Coder-HybridMini-v1.3 + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: GPQA (0-shot) + type: Idavidrein/gpqa + split: train + args: + num_few_shot: 0 + metrics: + - type: acc_norm + value: 8.95 + name: acc_norm + source: + url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=prithivMLmods%2FViper-Coder-HybridMini-v1.3 + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: MuSR (0-shot) + type: TAUR-Lab/MuSR + args: + num_few_shot: 0 + metrics: + - type: acc_norm + value: 15.61 + name: acc_norm + source: + url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=prithivMLmods%2FViper-Coder-HybridMini-v1.3 + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: MMLU-PRO (5-shot) + type: TIGER-Lab/MMLU-Pro + config: main + split: test + args: + num_few_shot: 5 + metrics: + - type: acc + value: 37.24 + name: accuracy + source: + url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=prithivMLmods%2FViper-Coder-HybridMini-v1.3 + name: Open LLM Leaderboard --- ![7.png](https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/qJvY7En72kGCdlpUutxBM.png) @@ -78,4 +177,18 @@ print(response) 2. **Language-Specific Variability**: Performance may vary across different programming languages. 3. **Possible Error Propagation**: Extended text outputs might introduce logical inconsistencies. 4. **Limited Real-World Awareness**: The model does not have access to real-time internet updates. -5. **Prompt Sensitivity**: Performance depends on how well the prompt is structured. \ No newline at end of file +5. **Prompt Sensitivity**: Performance depends on how well the prompt is structured. +# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard) +Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/prithivMLmods__Viper-Coder-HybridMini-v1.3-details)! +Summarized results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/contents/viewer/default/train?q=prithivMLmods%2FViper-Coder-HybridMini-v1.3&sort[column]=Average%20%E2%AC%86%EF%B8%8F&sort[direction]=desc)! + +| Metric |Value (%)| +|-------------------|--------:| +|**Average** | 33.80| +|IFEval (0-Shot) | 61.04| +|BBH (3-Shot) | 33.67| +|MATH Lvl 5 (4-Shot)| 46.30| +|GPQA (0-shot) | 8.95| +|MuSR (0-shot) | 15.61| +|MMLU-PRO (5-shot) | 37.24| +