OpenSonnet-Lite-MAX/.eval_results/gsm8k.json

{
  "results": {
    "gsm8k": {
      "alias": "gsm8k",
      "exact_match,strict-match": 0.8521607278241091,
      "exact_match_stderr,strict-match": 0.00977682767914391,
      "exact_match,flexible-extract": 0.8529188779378317,
      "exact_match_stderr,flexible-extract": 0.009756063660359896
    }
  },
  "group_subtasks": {
    "gsm8k": []
  },
  "configs": {
    "gsm8k": {
      "task": "gsm8k",
      "tag": [
        "math_word_problems"
      ],
      "dataset_path": "openai/gsm8k",
      "dataset_name": "main",
      "training_split": "train",
      "test_split": "test",
      "fewshot_split": "train",
      "doc_to_text": "Question: {{question}}\nAnswer:",
      "doc_to_target": "{{answer}}",
      "unsafe_code": false,
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "fewshot_config": {
        "sampler": "default",
        "split": "train",
        "process_docs": null,
        "fewshot_indices": null,
        "samples": null,
        "doc_to_text": "Question: {{question}}\nAnswer:",
        "doc_to_choice": null,
        "doc_to_target": "{{answer}}",
        "gen_prefix": null,
        "fewshot_delimiter": "\n\n",
        "target_delimiter": " "
      },
      "num_fewshot": 8,
      "metric_list": [
        {
          "metric": "exact_match",
          "aggregation": "mean",
          "higher_is_better": true,
          "ignore_case": true,
          "ignore_punctuation": false,
          "regexes_to_ignore": [
            ",",
            "\\$",
            "(?s).*#### ",
            "\\.$"
          ]
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "until": [
          "Question:",
          "</s>",
          "<|im_end|>"
        ],
        "do_sample": false,
        "temperature": 0.6,
        "max_gen_toks": 131072,
        "max_tokens": 131072,
        "top_p": 0.95,
        "top_k": 20,
        "min_p": 0,
        "repeat_penalty": 1.0,
        "presence_penalty": 1.0,
        "num_ctx": 262144
      },
      "repeats": 1,
      "filter_list": [
        {
          "name": "strict-match",
          "filter": [
            {
              "function": "regex",
              "regex_pattern": "#### (\\-?[0-9\\.\\,]+)"
            },
            {
              "function": "take_first"
            }
          ]
        },
        {
          "name": "flexible-extract",
          "filter": [
            {
              "function": "regex",
              "group_select": -1,
              "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
            },
            {
              "function": "take_first"
            }
          ]
        }
      ],
      "should_decontaminate": false,
      "metadata": {
        "version": 3.0
      }
    }
  },
  "versions": {
    "gsm8k": 3.0
  },
  "n-shot": {
    "gsm8k": 8
  },
  "higher_is_better": {
    "gsm8k": {
      "exact_match": true
    }
  },
  "n-samples": {
    "gsm8k": {
      "original": 1319,
      "effective": 1319
    }
  },
  "config": {
    "model": "LocalChatCompletion",
    "model_args": null,
    "batch_size": 1,
    "batch_sizes": [],
    "device": null,
    "use_cache": null,
    "limit": null,
    "bootstrap_iters": 100000,
    "gen_kwargs": {
      "max_gen_toks": 131072,
      "max_tokens": 131072,
      "temperature": 0.6,
      "top_p": 0.95,
      "top_k": 20,
      "min_p": 0,
      "repeat_penalty": 1.0,
      "presence_penalty": 1.0,
      "num_ctx": 262144
    },
    "random_seed": 0,
    "numpy_seed": 1234,
    "torch_seed": 1234,
    "fewshot_seed": 1234
  },
  "git_hash": null,
  "date": 1778390867.1640942,
  "pretty_env_info": "PyTorch version: 2.6.0+cpu\nIs debug build: False\nCUDA used to build PyTorch: Could not collect\nROCM used to build PyTorch: N/A\n\nOS: Debian GNU/Linux 12 (bookworm) (x86_64)\nGCC version: (Debian 12.2.0-14+deb12u1) 12.2.0\nClang version: Could not collect\nCMake version: version 3.25.1\nLibc version: glibc-2.36\n\nPython version: 3.11.5 (main, Aug 26 2023, 07:22:50) [Clang 16.0.3 ] (64-bit runtime)\nPython platform: Linux-4.4.0-x86_64-with-glibc2.36\nIs CUDA available: False\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: N/A\nGPU models and configuration: GPU 0: NVIDIA B200\nNvidia driver version: 580.95.05\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.9.22.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv.so.9.22.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn.so.9.22.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_precompiled.so.9.22.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_runtime_compiled.so.9.22.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_tensor_ir.so.9.22.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ext.so.9.22.0\n/usr/lib/x86_64-linux-gnu/libcudnn_graph.so.9.22.0\n/usr/lib/x86_64-linux-gnu/libcudnn_heuristic.so.9.22.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops.so.9.22.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:        x86_64\nCPU op-mode(s):      32-bit, 64-bit\nAddress sizes:       46 bits physical, 48 bits virtual\nByte Order:          Little Endian\nCPU(s):              17\nOn-line CPU(s) list: 0-16\nVendor ID:           AuthenticAMD\nModel name:          unknown\nCPU family:          191\nModel:               2\nThread(s) per core:  1\nCore(s) per socket:  17\nSocket(s):           1\nStepping:            unknown\nBogoMIPS:            5035.66\nFlags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid movdiri movdir64b fsrm avx512_vp2intersect flush_l1d\nVirtualization:      AMD-V\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.6.0+cpu\n[conda] Could not collect",
  "transformers_version": "5.5.1",
  "lm_eval_version": "0.4.11",
  "upper_git_hash": null
}