{ "checkpoint": "checkpoint-18779", "id": { "answer_accuracy": 0.9630347222222222, "avg_response_len": 201.21272916666666, "avg_loss": 6.54602900314331, "count": 1152000, "validation_metrics": {}, "pass_at_k": { "pass@1": 0.9630347222222222, "pass@2": 0.9784445264654293, "pass@4": 0.9859757670187062, "pass@8": 0.9905464808595384, "pass@16": 0.9936741909013082, "pass@32": 0.9958726768097093, "pass@64": 0.9973773311067934, "pass@128": 0.9983333333333333 }, "per_op_pass_at_k": { "10": { "pass@1": 0.89534375, "pass@2": 0.9316516978346466, "pass@4": 0.9502444778777662, "pass@8": 0.9628981587488723, "pass@16": 0.9732652153333301, "pass@32": 0.9817272204905513, "pass@64": 0.9875005300252477, "pass@128": 0.991 }, "2": { "pass@1": 0.9994453125, "pass@2": 0.9998784448818898, "pass@4": 0.999987270247469, "pass@8": 0.9999998760353234, "pass@16": 0.9999999999955357, "pass@32": 1.0, "pass@64": 1.0, "pass@128": 1.0 }, "3": { "pass@1": 0.9896953125, "pass@2": 0.9956350885826775, "pass@4": 0.998029471128609, "pass@8": 0.9990296020970205, "pass@16": 0.9994722941030695, "pass@32": 0.999768854949753, "pass@64": 0.9999711811022791, "pass@128": 1.0 }, "4": { "pass@1": 0.9968125, "pass@2": 0.9986126968503938, "pass@4": 0.9994299414135731, "pass@8": 0.9998706652443888, "pass@16": 0.999993190320664, "pass@32": 0.9999999900668188, "pass@64": 0.9999999999999999, "pass@128": 1.0 }, "5": { "pass@1": 0.95453125, "pass@2": 0.9673741387795279, "pass@4": 0.9768552553430826, "pass@8": 0.9845504029004547, "pass@16": 0.9905860749032567, "pass@32": 0.9947039335217139, "pass@64": 0.9972231304095797, "pass@128": 0.999 }, "6": { "pass@1": 0.9755546875, "pass@2": 0.9871010088582685, "pass@4": 0.9931382349081366, "pass@8": 0.9961125829051054, "pass@16": 0.9972334916240602, "pass@32": 0.9976868579347262, "pass@64": 0.998251968426333, "pass@128": 0.999 }, "7": { "pass@1": 0.9665546875, "pass@2": 0.980099409448819, "pass@4": 0.9868899991563556, "pass@8": 0.9912036930431989, "pass@16": 0.9940644245074017, "pass@32": 0.9960585379187011, "pass@64": 0.9974804599031311, "pass@128": 0.998 }, "8": { "pass@1": 0.95003125, "pass@2": 0.9737233021653545, "pass@4": 0.984498448537683, "pass@8": 0.9902565507718185, "pass@16": 0.9939245919129133, "pass@32": 0.996586911336722, "pass@64": 0.998538493317924, "pass@128": 1.0 }, "9": { "pass@1": 0.93934375, "pass@2": 0.9719249507874034, "pass@4": 0.98470880455568, "pass@8": 0.990996795989665, "pass@16": 0.9945284354115372, "pass@32": 0.9963217850684021, "pass@64": 0.9974302167766392, "pass@128": 0.998 } }, "per_op_accuracy": { "10": 0.89534375, "2": 0.9994453125, "3": 0.9896953125, "4": 0.9968125, "5": 0.95453125, "6": 0.9755546875, "7": 0.9665546875, "8": 0.95003125, "9": 0.93934375 }, "per_op_avg_response_len": { "10": 279.685734375, "2": 116.979234375, "3": 147.114546875, "4": 160.9594765625, "5": 176.0514453125, "6": 200.8557890625, "7": 225.0676796875, "8": 244.2956640625, "9": 259.9049921875 }, "per_op_avg_loss": { "10": 0.13990753173828124, "2": 0.15822406005859374, "3": 0.15319696044921874, "4": 0.1460938720703125, "5": 0.1487813720703125, "6": 0.14940606689453126, "7": 0.14932769775390625, "8": 0.14807330322265624, "9": 0.1509442138671875 }, "per_op_length": { "10": 279.685734375, "2": 116.979234375, "3": 147.114546875, "4": 160.9594765625, "5": 176.0514453125, "6": 200.8557890625, "7": 225.0676796875, "8": 244.2956640625, "9": 259.9049921875 }, "per_op_loss": { "10": 0.13990753173828124, "2": 0.15822406005859374, "3": 0.15319696044921874, "4": 0.1460938720703125, "5": 0.1487813720703125, "6": 0.14940606689453126, "7": 0.14932769775390625, "8": 0.14807330322265624, "9": 0.1509442138671875 }, "per_template": { "crazy_zootopia": { "count": 379008, "correct": 364061, "answer_accuracy": 0.960562837723742, "avg_response_len": 191.88788099459643, "resp_tokens_sum": 72727042, "pass_at_k": { "pass@1": 0.960562837723742, "pass@2": 0.9772055171959906, "pass@4": 0.985136322815272, "pass@8": 0.9899132907085476, "pass@16": 0.9932574318987277, "pass@32": 0.9956698086202027, "pass@64": 0.9972163254747021, "pass@128": 0.9979736575481256 }, "per_op_pass_at_k": { "10": { "pass@1": 0.8994976032448377, "pass@2": 0.9366747090795067, "pass@4": 0.954640098361598, "pass@8": 0.9671935395338649, "pass@16": 0.9780409476989385, "pass@32": 0.9873672705205916, "pass@64": 0.9931124421272916, "pass@128": 0.9941002949852508 }, "2": { "pass@1": 0.9997322819314641, "pass@2": 0.9999915679348491, "pass@4": 0.9999999897793149, "pass@8": 1.0, "pass@16": 1.0, "pass@32": 1.0, "pass@64": 1.0, "pass@128": 1.0 }, "3": { "pass@1": 0.9875525611620795, "pass@2": 0.9937483445303281, "pass@4": 0.9960841566249173, "pass@8": 0.9973945614382809, "pass@16": 0.9984052318072489, "pass@32": 0.9992931754002767, "pass@64": 0.9999118688143109, "pass@128": 1.0 }, "4": { "pass@1": 0.996337890625, "pass@2": 0.9982602577509845, "pass@4": 0.9992694307742784, "pass@8": 0.9998455780548049, "pass@16": 0.9999940908757206, "pass@32": 0.9999999967265157, "pass@64": 1.0, "pass@128": 1.0 }, "5": { "pass@1": 0.9481150793650793, "pass@2": 0.9641533089613797, "pass@4": 0.9757381440415184, "pass@8": 0.9837977641761237, "pass@16": 0.9891567790393069, "pass@32": 0.9924331098404072, "pass@64": 0.9948057827917357, "pass@128": 0.9968253968253968 }, "6": { "pass@1": 0.9665746631736527, "pass@2": 0.9810815131547932, "pass@4": 0.9889092237347574, "pass@8": 0.992817054998713, "pass@16": 0.9942118482404367, "pass@32": 0.9947566055669427, "pass@64": 0.9955089818359227, "pass@128": 0.9970059880239521 }, "7": { "pass@1": 0.9575397559171598, "pass@2": 0.9706686361412663, "pass@4": 0.9781720887884576, "pass@8": 0.9836457175352227, "pass@16": 0.9877211930993773, "pass@32": 0.991030454111008, "pass@64": 0.9934560055493304, "pass@128": 0.9940828402366864 }, "8": { "pass@1": 0.9534755608974359, "pass@2": 0.9758759684786995, "pass@4": 0.9847553208493165, "pass@8": 0.9897007291187125, "pass@16": 0.9935694000922215, "pass@32": 0.9965151373391712, "pass@64": 0.998300039303945, "pass@128": 1.0 }, "9": { "pass@1": 0.9408450704225352, "pass@2": 0.9764160751913055, "pass@4": 0.9897490211610868, "pass@8": 0.9954928597779902, "pass@16": 0.9986146363237696, "pass@32": 0.9998472651017836, "pass@64": 0.9999990904114461, "pass@128": 1.0 } } }, "teachers_in_school": { "count": 390656, "correct": 376417, "answer_accuracy": 0.9635510525884666, "avg_response_len": 199.2523191759502, "resp_tokens_sum": 77839114, "pass_at_k": { "pass@1": 0.9635510525884666, "pass@2": 0.978026663489027, "pass@4": 0.9851850014079184, "pass@8": 0.9897531921524919, "pass@16": 0.9930249781079535, "pass@32": 0.9953635939722416, "pass@64": 0.9969830500068908, "pass@128": 0.9980340760157274 }, "per_op_pass_at_k": { "10": { "pass@1": 0.8878930214723927, "pass@2": 0.9240016333751996, "pass@4": 0.9427751138086263, "pass@8": 0.9559837013558444, "pass@16": 0.9669589357176731, "pass@32": 0.9757400698048759, "pass@64": 0.9820426961144421, "pass@128": 0.9877300613496932 }, "2": { "pass@1": 0.9987177051671733, "pass@2": 0.9996395064978579, "pass@4": 0.9999613177152248, "pass@8": 0.9999996232076699, "pass@16": 0.9999999999864305, "pass@32": 1.0, "pass@64": 1.0, "pass@128": 1.0 }, "3": { "pass@1": 0.9907670454545454, "pass@2": 0.9960163892865667, "pass@4": 0.9985778993534897, "pass@8": 0.9996915957072215, "pass@16": 0.9999815253902985, "pass@32": 0.99999995938087, "pass@64": 0.9999999999999987, "pass@128": 1.0 }, "4": { "pass@1": 0.9982664571005917, "pass@2": 0.9998132687648512, "pass@4": 0.999995557138198, "pass@8": 0.9999999967656203, "pass@16": 0.9999999999999998, "pass@32": 1.0, "pass@64": 1.0, "pass@128": 1.0 }, "5": { "pass@1": 0.9600317028985508, "pass@2": 0.9691975493552438, "pass@4": 0.9763000869456536, "pass@8": 0.9828931121195829, "pass@16": 0.989036439654314, "pass@32": 0.9944838684919953, "pass@64": 0.9981653755234783, "pass@128": 1.0 }, "6": { "pass@1": 0.9823379297994269, "pass@2": 0.9910194058389551, "pass@4": 0.9946701218221643, "pass@8": 0.9965445299840145, "pass@16": 0.9976712739922713, "pass@32": 0.9983904225804993, "pass@64": 0.9992893080034889, "pass@128": 1.0 }, "7": { "pass@1": 0.9681855130057804, "pass@2": 0.9832421146056164, "pass@4": 0.9914642009163593, "pass@8": 0.9964758591421767, "pass@16": 0.9990374180123244, "pass@32": 0.9999126056145744, "pass@64": 0.9999997855847184, "pass@128": 1.0 }, "8": { "pass@1": 0.9455765845070423, "pass@2": 0.9712158284351782, "pass@4": 0.9835400283591312, "pass@8": 0.9910390348371692, "pass@16": 0.9955726997113149, "pass@32": 0.9980539040490879, "pass@64": 0.9995741614459047, "pass@128": 1.0 }, "9": { "pass@1": 0.9390437874251497, "pass@2": 0.9670264539818006, "pass@4": 0.9783135702722786, "pass@8": 0.9840906736702117, "pass@16": 0.9879928605789021, "pass@32": 0.9908513468886352, "pass@64": 0.9930496212972972, "pass@128": 0.9940119760479041 } } }, "movie_festival_awards": { "count": 382336, "correct": 368938, "answer_accuracy": 0.9649575242718447, "avg_response_len": 212.4594806662203, "resp_tokens_sum": 81230908, "pass_at_k": { "pass@1": 0.9649575242718447, "pass@2": 0.9800997069980434, "pass@4": 0.9876158778089633, "pass@8": 0.9919847109134324, "pass@16": 0.9947506625624947, "pass@32": 0.9965939401271129, "pass@64": 0.9979397963205583, "pass@128": 0.9989956478071644 }, "per_op_pass_at_k": { "10": { "pass@1": 0.8983908582089553, "pass@2": 0.9340132506757551, "pass@4": 0.9530650669599133, "pass@8": 0.9652801856862316, "pass@16": 0.9745693164759084, "pass@32": 0.9818461284408381, "pass@64": 0.9871328155545905, "pass@128": 0.991044776119403 }, "2": { "pass@1": 0.9998660714285714, "pass@2": 0.9999992969628797, "pass@4": 1.0, "pass@8": 1.0, "pass@16": 1.0, "pass@32": 1.0, "pass@64": 1.0, "pass@128": 1.0 }, "3": { "pass@1": 0.9907069970845481, "pass@2": 0.9970669721769471, "pass@4": 0.9993563997831172, "pass@8": 0.9999514691641971, "pass@16": 0.9999996499221584, "pass@32": 0.9999999999946803, "pass@64": 1.0, "pass@128": 1.0 }, "4": { "pass@1": 0.9958196271929824, "pass@2": 0.9977559342911083, "pass@4": 0.9990211264710332, "pass@8": 0.9997663197662907, "pass@16": 0.9999856176620865, "pass@32": 0.9999999740185194, "pass@64": 0.9999999999999997, "pass@128": 1.0 }, "5": { "pass@1": 0.9548943014705882, "pass@2": 0.9685079174386292, "pass@4": 0.9784535587463331, "pass@8": 0.9869293632462349, "pass@16": 0.9934826994856965, "pass@32": 0.9970310861536675, "pass@64": 0.9985066302193607, "pass@128": 1.0 }, "6": { "pass@1": 0.9775483044164038, "pass@2": 0.9891293716932861, "pass@4": 0.9959075131381444, "pass@8": 0.9991092920224423, "pass@16": 0.9999352040645176, "pass@32": 0.9999996662295679, "pass@64": 0.9999999999997393, "pass@128": 1.0 }, "7": { "pass@1": 0.9744115901898734, "pass@2": 0.9867456923402768, "pass@4": 0.9912063912303684, "pass@8": 0.9935151685224703, "pass@16": 0.9954041601504686, "pass@32": 0.9972167179953726, "pass@64": 0.9990265956175474, "pass@128": 1.0 }, "8": { "pass@1": 0.9515531156156156, "pass@2": 0.9743795222387744, "pass@4": 0.9852795146102226, "pass@8": 0.9899431408996546, "pass@16": 0.9925003924523234, "pass@32": 0.9950902478963195, "pass@64": 0.9976578190444356, "pass@128": 1.0 }, "9": { "pass@1": 0.9379521704180064, "pass@2": 0.9720591912043948, "pass@4": 0.9858236963747689, "pass@8": 0.9932814976290268, "pass@16": 0.9968830356374474, "pass@32": 0.9981725276413653, "pass@64": 0.9992024635603814, "pass@128": 1.0 } } } } }, "ood": { "answer_accuracy": 0.25220703125, "avg_response_len": 284.85050390625, "avg_loss": 5.825457085227966, "count": 1280000, "validation_metrics": {}, "pass_at_k": { "pass@1": 0.25220703125, "pass@2": 0.3026185285433113, "pass@4": 0.34743800927071905, "pass@8": 0.38810959550625124, "pass@16": 0.42553433372274274, "pass@32": 0.4601746364651569, "pass@64": 0.49276390746466214, "pass@128": 0.5238 }, "per_op_pass_at_k": { "11": { "pass@1": 0.7491953125, "pass@2": 0.8267459399606308, "pass@4": 0.8755285344956877, "pass@8": 0.907201192042703, "pass@16": 0.9291708873567505, "pass@32": 0.9454073394478061, "pass@64": 0.9574549244959281, "pass@128": 0.967 }, "12": { "pass@1": 0.42375, "pass@2": 0.5180041830708659, "pass@4": 0.593009113235845, "pass@8": 0.6523426553791513, "pass@16": 0.702357863892847, "pass@32": 0.745694824334505, "pass@64": 0.7831121214310477, "pass@128": 0.816 }, "13": { "pass@1": 0.2113984375, "pass@2": 0.25608120078740143, "pass@4": 0.3006886397637793, "pass@8": 0.3447306794463938, "pass@16": 0.38799260684503434, "pass@32": 0.43170338099033495, "pass@64": 0.4761056180149899, "pass@128": 0.519 }, "14": { "pass@1": 0.200171875, "pass@2": 0.2479451279527559, "pass@4": 0.29500759055118103, "pass@8": 0.34113887630208256, "pass@16": 0.38481813556379674, "pass@32": 0.4260044402977936, "pass@64": 0.4671482956865662, "pass@128": 0.51 }, "15": { "pass@1": 0.17846875, "pass@2": 0.23154035433070885, "pass@4": 0.28250203355830483, "pass@8": 0.3294688278958133, "pass@16": 0.3735511399431241, "pass@32": 0.4152404384047447, "pass@64": 0.4535175002619446, "pass@128": 0.485 }, "16": { "pass@1": 0.158390625, "pass@2": 0.1908997293307087, "pass@4": 0.22120489895013098, "pass@8": 0.2515841984680904, "pass@16": 0.2832809698127581, "pass@32": 0.3157377911008275, "pass@64": 0.3483624231356763, "pass@128": 0.383 }, "17": { "pass@1": 0.15778125, "pass@2": 0.19822810039370062, "pass@4": 0.23773755933633278, "pass@8": 0.2756691021575299, "pass@16": 0.31012296220189073, "pass@32": 0.34113294654797527, "pass@64": 0.370247222269346, "pass@128": 0.399 }, "18": { "pass@1": 0.1607265625, "pass@2": 0.1996240157480314, "pass@4": 0.23766353665166848, "pass@8": 0.27507361063539393, "pass@16": 0.31047573197015055, "pass@32": 0.342168030163979, "pass@64": 0.37123232750738133, "pass@128": 0.4 }, "19": { "pass@1": 0.142859375, "pass@2": 0.17847379429133842, "pass@4": 0.21176526481064853, "pass@8": 0.24360403095909056, "pass@16": 0.27449561265010364, "pass@32": 0.305123051282576, "pass@64": 0.3363755503535671, "pass@128": 0.369 }, "20": { "pass@1": 0.139328125, "pass@2": 0.17864283956692892, "pass@4": 0.2192729213535806, "pass@8": 0.26028278177620573, "pass@16": 0.2990774269909148, "pass@32": 0.33353412208103916, "pass@64": 0.3640830914901839, "pass@128": 0.39 } }, "per_op_accuracy": { "11": 0.7491953125, "12": 0.42375, "13": 0.2113984375, "14": 0.200171875, "15": 0.17846875, "16": 0.158390625, "17": 0.15778125, "18": 0.1607265625, "19": 0.142859375, "20": 0.139328125 }, "per_op_avg_response_len": { "11": 291.4343515625, "12": 288.3101484375, "13": 282.0749140625, "14": 284.6130234375, "15": 282.3583984375, "16": 284.9855, "17": 285.6167265625, "18": 284.2999609375, "19": 281.3561640625, "20": 283.4558515625 }, "per_op_avg_loss": { "11": 0.1496517333984375, "12": 0.160320068359375, "13": 0.18829949951171876, "14": 0.2179671630859375, "15": 0.24814111328125, "16": 0.280107177734375, "17": 0.3107010498046875, "18": 0.3399879150390625, "19": 0.40024169921875, "20": 0.419079345703125 }, "per_op_length": { "11": 291.4343515625, "12": 288.3101484375, "13": 282.0749140625, "14": 284.6130234375, "15": 282.3583984375, "16": 284.9855, "17": 285.6167265625, "18": 284.2999609375, "19": 281.3561640625, "20": 283.4558515625 }, "per_op_loss": { "11": 0.1496517333984375, "12": 0.160320068359375, "13": 0.18829949951171876, "14": 0.2179671630859375, "15": 0.24814111328125, "16": 0.280107177734375, "17": 0.3107010498046875, "18": 0.3399879150390625, "19": 0.40024169921875, "20": 0.419079345703125 }, "per_template": { "crazy_zootopia": { "count": 430720, "correct": 113067, "answer_accuracy": 0.2625069650817236, "avg_response_len": 273.8302725668648, "resp_tokens_sum": 117944175, "pass_at_k": { "pass@1": 0.2625069650817236, "pass@2": 0.31366907342841527, "pass@4": 0.3583084888720263, "pass@8": 0.3983932569435031, "pass@16": 0.4355776055864389, "pass@32": 0.47017903805797134, "pass@64": 0.5024705719566711, "pass@128": 0.5337295690936107 }, "per_op_pass_at_k": { "11": { "pass@1": 0.7521689093484419, "pass@2": 0.8267172826838576, "pass@4": 0.8749963240253604, "pass@8": 0.9072539747181511, "pass@16": 0.9305891455160197, "pass@32": 0.9482171914771189, "pass@64": 0.9598296757598728, "pass@128": 0.9660056657223796 }, "12": { "pass@1": 0.4268626412429379, "pass@2": 0.5226548250589441, "pass@4": 0.5947451512946471, "pass@8": 0.6499540700619184, "pass@16": 0.697671407935311, "pass@32": 0.7391764349175001, "pass@64": 0.7728700105439386, "pass@128": 0.8022598870056498 }, "13": { "pass@1": 0.226048197492163, "pass@2": 0.2745025516500875, "pass@4": 0.32170087379359735, "pass@8": 0.36726090847720233, "pass@16": 0.4118837384913241, "pass@32": 0.4577964268716136, "pass@64": 0.5060004900301981, "pass@128": 0.554858934169279 }, "14": { "pass@1": 0.1970404984423676, "pass@2": 0.2447659871955257, "pass@4": 0.2905214605183697, "pass@8": 0.33432116638118264, "pass@16": 0.37689919457954874, "pass@32": 0.41849874775311363, "pass@64": 0.46201871752633594, "pass@128": 0.5109034267912772 }, "15": { "pass@1": 0.190774024566474, "pass@2": 0.24494681910245306, "pass@4": 0.29680882700356104, "pass@8": 0.3440605324830605, "pass@16": 0.38846742937996726, "pass@32": 0.43008799604881653, "pass@64": 0.4654618905817254, "pass@128": 0.4913294797687861 }, "16": { "pass@1": 0.1630796370967742, "pass@2": 0.1964297053594107, "pass@4": 0.23033826426938572, "pass@8": 0.2660781043146322, "pass@16": 0.3019958633560044, "pass@32": 0.33569879011745574, "pass@64": 0.3686862876847773, "pass@128": 0.4064516129032258 }, "17": { "pass@1": 0.1676300578034682, "pass@2": 0.20699551397296437, "pass@4": 0.2408987041359715, "pass@8": 0.271017564290959, "pass@16": 0.29876652985892266, "pass@32": 0.3264747152113181, "pass@64": 0.3549548508714773, "pass@128": 0.38439306358381503 }, "18": { "pass@1": 0.1616517857142857, "pass@2": 0.19595648200224966, "pass@4": 0.23106420322459695, "pass@8": 0.26888622832851616, "pass@16": 0.3071732341417159, "pass@32": 0.3420263056744111, "pass@64": 0.3714714335108666, "pass@128": 0.39714285714285713 }, "19": { "pass@1": 0.15040822072072071, "pass@2": 0.1915057281691139, "pass@4": 0.2273034600967671, "pass@8": 0.2598221642368353, "pass@16": 0.2905735616301428, "pass@32": 0.32042698971733125, "pass@64": 0.35124146512270843, "pass@128": 0.3843843843843844 }, "20": { "pass@1": 0.15052552552552553, "pass@2": 0.18885223806483653, "pass@4": 0.22942468621602477, "pass@8": 0.26979999128432536, "pass@16": 0.30689533475643643, "pass@32": 0.3397581402934893, "pass@64": 0.370991289408613, "pass@128": 0.4024024024024024 } } }, "teachers_in_school": { "count": 430464, "correct": 107386, "answer_accuracy": 0.24946569283377937, "avg_response_len": 281.4464322219744, "resp_tokens_sum": 121152557, "pass_at_k": { "pass@1": 0.24946569283377937, "pass@2": 0.30024583617224077, "pass@4": 0.34601646702614464, "pass@8": 0.3878270818735659, "pass@16": 0.42574861408875925, "pass@32": 0.4604748763552623, "pass@64": 0.492562625136877, "pass@128": 0.5212607790663099 }, "per_op_pass_at_k": { "11": { "pass@1": 0.7309864457831325, "pass@2": 0.811214839673655, "pass@4": 0.8645121111555325, "pass@8": 0.9001346785530842, "pass@16": 0.9235781504819824, "pass@32": 0.939017807747773, "pass@64": 0.9485292552001391, "pass@128": 0.9548192771084337 }, "12": { "pass@1": 0.439042907523511, "pass@2": 0.5317378835188704, "pass@4": 0.6075961422729685, "pass@8": 0.6683062360710545, "pass@16": 0.7177272091161964, "pass@32": 0.7598458853246168, "pass@64": 0.7942021269976, "pass@128": 0.8213166144200627 }, "13": { "pass@1": 0.19227065826330533, "pass@2": 0.23474754074858276, "pass@4": 0.27791213361985206, "pass@8": 0.32123267772057174, "pass@16": 0.36547236780648024, "pass@32": 0.412591392141311, "pass@64": 0.46083134462740805, "pass@128": 0.5042016806722689 }, "14": { "pass@1": 0.20837902046783627, "pass@2": 0.2522699670764838, "pass@4": 0.2964683594923442, "pass@8": 0.3416618793696011, "pass@16": 0.3841336779597966, "pass@32": 0.42375806001618826, "pass@64": 0.4632207639904677, "pass@128": 0.5029239766081871 }, "15": { "pass@1": 0.19093276515151514, "pass@2": 0.24199400501073715, "pass@4": 0.2891956014020975, "pass@8": 0.3326345282383242, "pass@16": 0.37366558323041027, "pass@32": 0.4128573661812689, "pass@64": 0.4483166035224402, "pass@128": 0.4727272727272727 }, "16": { "pass@1": 0.15642806267806267, "pass@2": 0.1904278691926329, "pass@4": 0.22132350069489173, "pass@8": 0.251268169779741, "pass@16": 0.28180434335829546, "pass@32": 0.3122157586188942, "pass@64": 0.34211985927431315, "pass@128": 0.3732193732193732 }, "17": { "pass@1": 0.16779891304347827, "pass@2": 0.21377104526336374, "pass@4": 0.2616579632708955, "pass@8": 0.3084358591590647, "pass@16": 0.3485515761094292, "pass@32": 0.38192734451252297, "pass@64": 0.4116145438455866, "pass@128": 0.43788819875776397 }, "18": { "pass@1": 0.1518612132352941, "pass@2": 0.1958187384205651, "pass@4": 0.23828893419572547, "pass@8": 0.27971765090438966, "pass@16": 0.31909231139592203, "pass@32": 0.3541919542192856, "pass@64": 0.3879892602530647, "pass@128": 0.4235294117647059 }, "19": { "pass@1": 0.14004371279761904, "pass@2": 0.17627703763592056, "pass@4": 0.21033196631671033, "pass@8": 0.24241868591666452, "pass@16": 0.2725452956766527, "pass@32": 0.3023936104609465, "pass@64": 0.331830773125219, "pass@128": 0.3601190476190476 }, "20": { "pass@1": 0.1367421407185629, "pass@2": 0.17771825934744678, "pass@4": 0.21898993408009634, "pass@8": 0.260066157867715, "pass@16": 0.2989195527821917, "pass@32": 0.3334463816599912, "pass@64": 0.3631967204725436, "pass@128": 0.38622754491017963 } } }, "movie_festival_awards": { "count": 418816, "correct": 102372, "answer_accuracy": 0.24443192237163813, "avg_response_len": 299.6827079194682, "resp_tokens_sum": 125511913, "pass_at_k": { "pass@1": 0.24443192237163813, "pass@2": 0.2936925752748208, "pass@4": 0.3377196360158527, "pass@8": 0.3778240125631929, "pass@16": 0.414985362239759, "pass@32": 0.44957728985444445, "pass@64": 0.48298823095266347, "pass@128": 0.51619804400978 }, "per_op_pass_at_k": { "11": { "pass@1": 0.7650545634920635, "pass@2": 0.8431473409573809, "pass@4": 0.8877359086066622, "pass@8": 0.9145899228176874, "pass@16": 0.9334761048558039, "pass@32": 0.9489928847114996, "pass@64": 0.9642010673849086, "pass@128": 0.9809523809523809 }, "12": { "pass@1": 0.4054615825688073, "pass@2": 0.49957183534397626, "pass@4": 0.5768995727598271, "pass@8": 0.639355428961972, "pass@16": 0.6924379381519293, "pass@32": 0.7389465717283092, "pass@64": 0.7833812207530858, "pass@128": 0.8256880733944955 }, "13": { "pass@1": 0.21805073302469136, "pass@2": 0.2614506628511712, "pass@4": 0.30509700407819385, "pass@8": 0.34843945585161107, "pass@16": 0.3892841325907048, "pass@32": 0.42707158587605587, "pass@64": 0.46350207303509766, "pass@128": 0.5 }, "14": { "pass@1": 0.19482566765578635, "pass@2": 0.24658432440010283, "pass@4": 0.2977982871762691, "pass@8": 0.34710213397424194, "pass@16": 0.39305571585018145, "pass@32": 0.4354334888531387, "pass@64": 0.4760201367236571, "pass@128": 0.516320474777448 }, "15": { "pass@1": 0.15263310185185186, "pass@2": 0.20657633724603863, "pass@4": 0.26040626837154607, "pass@8": 0.31066200412971595, "pass@16": 0.3575054534309268, "pass@32": 0.40181191645690056, "pass@64": 0.4460592807353767, "pass@128": 0.49074074074074076 }, "16": { "pass@1": 0.15613477138643067, "pass@2": 0.1863313822497852, "pass@4": 0.21273005393131156, "pass@8": 0.23865739981671225, "pass@16": 0.2676959517806339, "pass@32": 0.3011310763692745, "pass@64": 0.3362407178410369, "pass@128": 0.37168141592920356 }, "17": { "pass@1": 0.1378012048192771, "pass@2": 0.17401619272365054, "pass@4": 0.21124320340981484, "pass@8": 0.24873698272180494, "pass@16": 0.28468715470923933, "pass@32": 0.31684364503562207, "pass@64": 0.3460631347877581, "pass@128": 0.37650602409638556 }, "18": { "pass@1": 0.16940524193548387, "pass@2": 0.20793830962661927, "pass@4": 0.24442847708552565, "pass@8": 0.27696590133200394, "pass@16": 0.3047539165997947, "pass@32": 0.329140512075412, "pass@64": 0.3525837977178592, "pass@128": 0.3774193548387097 }, "19": { "pass@1": 0.13812311178247735, "pass@2": 0.16759305790137258, "pass@4": 0.19758813267676884, "pass@8": 0.2284911534750015, "pass@16": 0.2603002938970115, "pass@32": 0.29249731296624343, "pass@64": 0.3260332407783438, "pass@128": 0.36253776435045315 }, "20": { "pass@1": 0.13072447447447447, "pass@2": 0.16936079780567978, "pass@4": 0.2094049935762534, "pass@8": 0.25098284669882537, "pass@16": 0.291417867530838, "pass@32": 0.32739810777438544, "pass@64": 0.35806392636422313, "pass@128": 0.3813813813813814 } } } } }, "total": { "answer_accuracy": 0.588914884868421, "avg_loss": 6.166780625293129, "count": 2432000, "validation_metrics": { "id": {}, "ood": {} }, "pass_at_k": { "pass@1": 0.588914884868421, "pass@2": 0.6227466328221948, "pass@4": 0.6499032629408014, "pass@8": 0.6734744359367607, "pass@16": 0.6946532134389145, "pass@32": 0.7139263397862728, "pass@64": 0.7317913186635674, "pass@128": 0.748578947368421 }, "per_op_pass_at_k": { "10": { "pass@1": 0.89534375, "pass@2": 0.9316516978346466, "pass@4": 0.9502444778777662, "pass@8": 0.9628981587488723, "pass@16": 0.9732652153333301, "pass@32": 0.9817272204905513, "pass@64": 0.9875005300252477, "pass@128": 0.991 }, "2": { "pass@1": 0.9994453125, "pass@2": 0.9998784448818898, "pass@4": 0.999987270247469, "pass@8": 0.9999998760353234, "pass@16": 0.9999999999955357, "pass@32": 1.0, "pass@64": 1.0, "pass@128": 1.0 }, "3": { "pass@1": 0.9896953125, "pass@2": 0.9956350885826775, "pass@4": 0.998029471128609, "pass@8": 0.9990296020970205, "pass@16": 0.9994722941030695, "pass@32": 0.999768854949753, "pass@64": 0.9999711811022791, "pass@128": 1.0 }, "4": { "pass@1": 0.9968125, "pass@2": 0.9986126968503938, "pass@4": 0.9994299414135731, "pass@8": 0.9998706652443888, "pass@16": 0.999993190320664, "pass@32": 0.9999999900668188, "pass@64": 0.9999999999999999, "pass@128": 1.0 }, "5": { "pass@1": 0.95453125, "pass@2": 0.9673741387795279, "pass@4": 0.9768552553430826, "pass@8": 0.9845504029004547, "pass@16": 0.9905860749032567, "pass@32": 0.9947039335217139, "pass@64": 0.9972231304095797, "pass@128": 0.999 }, "6": { "pass@1": 0.9755546875, "pass@2": 0.9871010088582685, "pass@4": 0.9931382349081366, "pass@8": 0.9961125829051054, "pass@16": 0.9972334916240602, "pass@32": 0.9976868579347262, "pass@64": 0.998251968426333, "pass@128": 0.999 }, "7": { "pass@1": 0.9665546875, "pass@2": 0.980099409448819, "pass@4": 0.9868899991563556, "pass@8": 0.9912036930431989, "pass@16": 0.9940644245074017, "pass@32": 0.9960585379187011, "pass@64": 0.9974804599031311, "pass@128": 0.998 }, "8": { "pass@1": 0.95003125, "pass@2": 0.9737233021653545, "pass@4": 0.984498448537683, "pass@8": 0.9902565507718185, "pass@16": 0.9939245919129133, "pass@32": 0.996586911336722, "pass@64": 0.998538493317924, "pass@128": 1.0 }, "9": { "pass@1": 0.93934375, "pass@2": 0.9719249507874034, "pass@4": 0.98470880455568, "pass@8": 0.990996795989665, "pass@16": 0.9945284354115372, "pass@32": 0.9963217850684021, "pass@64": 0.9974302167766392, "pass@128": 0.998 }, "11": { "pass@1": 0.7491953125, "pass@2": 0.8267459399606308, "pass@4": 0.8755285344956877, "pass@8": 0.907201192042703, "pass@16": 0.9291708873567505, "pass@32": 0.9454073394478061, "pass@64": 0.9574549244959281, "pass@128": 0.967 }, "12": { "pass@1": 0.42375, "pass@2": 0.5180041830708659, "pass@4": 0.593009113235845, "pass@8": 0.6523426553791513, "pass@16": 0.702357863892847, "pass@32": 0.745694824334505, "pass@64": 0.7831121214310477, "pass@128": 0.816 }, "13": { "pass@1": 0.2113984375, "pass@2": 0.25608120078740143, "pass@4": 0.3006886397637793, "pass@8": 0.3447306794463938, "pass@16": 0.38799260684503434, "pass@32": 0.43170338099033495, "pass@64": 0.4761056180149899, "pass@128": 0.519 }, "14": { "pass@1": 0.200171875, "pass@2": 0.2479451279527559, "pass@4": 0.29500759055118103, "pass@8": 0.34113887630208256, "pass@16": 0.38481813556379674, "pass@32": 0.4260044402977936, "pass@64": 0.4671482956865662, "pass@128": 0.51 }, "15": { "pass@1": 0.17846875, "pass@2": 0.23154035433070885, "pass@4": 0.28250203355830483, "pass@8": 0.3294688278958133, "pass@16": 0.3735511399431241, "pass@32": 0.4152404384047447, "pass@64": 0.4535175002619446, "pass@128": 0.485 }, "16": { "pass@1": 0.158390625, "pass@2": 0.1908997293307087, "pass@4": 0.22120489895013098, "pass@8": 0.2515841984680904, "pass@16": 0.2832809698127581, "pass@32": 0.3157377911008275, "pass@64": 0.3483624231356763, "pass@128": 0.383 }, "17": { "pass@1": 0.15778125, "pass@2": 0.19822810039370062, "pass@4": 0.23773755933633278, "pass@8": 0.2756691021575299, "pass@16": 0.31012296220189073, "pass@32": 0.34113294654797527, "pass@64": 0.370247222269346, "pass@128": 0.399 }, "18": { "pass@1": 0.1607265625, "pass@2": 0.1996240157480314, "pass@4": 0.23766353665166848, "pass@8": 0.27507361063539393, "pass@16": 0.31047573197015055, "pass@32": 0.342168030163979, "pass@64": 0.37123232750738133, "pass@128": 0.4 }, "19": { "pass@1": 0.142859375, "pass@2": 0.17847379429133842, "pass@4": 0.21176526481064853, "pass@8": 0.24360403095909056, "pass@16": 0.27449561265010364, "pass@32": 0.305123051282576, "pass@64": 0.3363755503535671, "pass@128": 0.369 }, "20": { "pass@1": 0.139328125, "pass@2": 0.17864283956692892, "pass@4": 0.2192729213535806, "pass@8": 0.26028278177620573, "pass@16": 0.2990774269909148, "pass@32": 0.33353412208103916, "pass@64": 0.3640830914901839, "pass@128": 0.39 } }, "per_op_accuracy": { "10": 0.89534375, "2": 0.9994453125, "3": 0.9896953125, "4": 0.9968125, "5": 0.95453125, "6": 0.9755546875, "7": 0.9665546875, "8": 0.95003125, "9": 0.93934375, "11": 0.7491953125, "12": 0.42375, "13": 0.2113984375, "14": 0.200171875, "15": 0.17846875, "16": 0.158390625, "17": 0.15778125, "18": 0.1607265625, "19": 0.142859375, "20": 0.139328125 }, "per_op_avg_response_len": { "10": 279.685734375, "2": 116.979234375, "3": 147.114546875, "4": 160.9594765625, "5": 176.0514453125, "6": 200.8557890625, "7": 225.0676796875, "8": 244.2956640625, "9": 259.9049921875, "11": 291.4343515625, "12": 288.3101484375, "13": 282.0749140625, "14": 284.6130234375, "15": 282.3583984375, "16": 284.9855, "17": 285.6167265625, "18": 284.2999609375, "19": 281.3561640625, "20": 283.4558515625 }, "per_op_avg_loss": { "10": 0.13990753173828124, "2": 0.15822406005859374, "3": 0.15319696044921874, "4": 0.1460938720703125, "5": 0.1487813720703125, "6": 0.14940606689453126, "7": 0.14932769775390625, "8": 0.14807330322265624, "9": 0.1509442138671875, "11": 0.1496517333984375, "12": 0.160320068359375, "13": 0.18829949951171876, "14": 0.2179671630859375, "15": 0.24814111328125, "16": 0.280107177734375, "17": 0.3107010498046875, "18": 0.3399879150390625, "19": 0.40024169921875, "20": 0.419079345703125 }, "per_op_length": { "10": 279.685734375, "2": 116.979234375, "3": 147.114546875, "4": 160.9594765625, "5": 176.0514453125, "6": 200.8557890625, "7": 225.0676796875, "8": 244.2956640625, "9": 259.9049921875, "11": 291.4343515625, "12": 288.3101484375, "13": 282.0749140625, "14": 284.6130234375, "15": 282.3583984375, "16": 284.9855, "17": 285.6167265625, "18": 284.2999609375, "19": 281.3561640625, "20": 283.4558515625 }, "per_op_loss": { "10": 0.13990753173828124, "2": 0.15822406005859374, "3": 0.15319696044921874, "4": 0.1460938720703125, "5": 0.1487813720703125, "6": 0.14940606689453126, "7": 0.14932769775390625, "8": 0.14807330322265624, "9": 0.1509442138671875, "11": 0.1496517333984375, "12": 0.160320068359375, "13": 0.18829949951171876, "14": 0.2179671630859375, "15": 0.24814111328125, "16": 0.280107177734375, "17": 0.3107010498046875, "18": 0.3399879150390625, "19": 0.40024169921875, "20": 0.419079345703125 }, "per_template": { "crazy_zootopia": { "count": 809728, "correct": 477128, "answer_accuracy": 0.5892447834334492, "avg_response_len": 235.475637498024, "resp_tokens_sum": 190671217, "pass_at_k": { "pass@1": 0.5892447834334492, "pass@2": 0.6242494417489666, "pass@4": 0.6517067209785661, "pass@8": 0.6752650274111465, "pass@16": 0.6966098480320178, "pass@32": 0.7161446042348242, "pass@64": 0.7340453705919688, "pass@128": 0.7510275055327221 }, "per_op_pass_at_k": { "10": { "pass@1": 0.8994976032448377, "pass@2": 0.9366747090795067, "pass@4": 0.954640098361598, "pass@8": 0.9671935395338649, "pass@16": 0.9780409476989385, "pass@32": 0.9873672705205916, "pass@64": 0.9931124421272916, "pass@128": 0.9941002949852508 }, "2": { "pass@1": 0.9997322819314641, "pass@2": 0.9999915679348491, "pass@4": 0.9999999897793149, "pass@8": 1.0, "pass@16": 1.0, "pass@32": 1.0, "pass@64": 1.0, "pass@128": 1.0 }, "3": { "pass@1": 0.9875525611620795, "pass@2": 0.9937483445303281, "pass@4": 0.9960841566249173, "pass@8": 0.9973945614382809, "pass@16": 0.9984052318072489, "pass@32": 0.9992931754002767, "pass@64": 0.9999118688143109, "pass@128": 1.0 }, "4": { "pass@1": 0.996337890625, "pass@2": 0.9982602577509845, "pass@4": 0.9992694307742784, "pass@8": 0.9998455780548049, "pass@16": 0.9999940908757206, "pass@32": 0.9999999967265157, "pass@64": 1.0, "pass@128": 1.0 }, "5": { "pass@1": 0.9481150793650793, "pass@2": 0.9641533089613797, "pass@4": 0.9757381440415184, "pass@8": 0.9837977641761237, "pass@16": 0.9891567790393069, "pass@32": 0.9924331098404072, "pass@64": 0.9948057827917357, "pass@128": 0.9968253968253968 }, "6": { "pass@1": 0.9665746631736527, "pass@2": 0.9810815131547932, "pass@4": 0.9889092237347574, "pass@8": 0.992817054998713, "pass@16": 0.9942118482404367, "pass@32": 0.9947566055669427, "pass@64": 0.9955089818359227, "pass@128": 0.9970059880239521 }, "7": { "pass@1": 0.9575397559171598, "pass@2": 0.9706686361412663, "pass@4": 0.9781720887884576, "pass@8": 0.9836457175352227, "pass@16": 0.9877211930993773, "pass@32": 0.991030454111008, "pass@64": 0.9934560055493304, "pass@128": 0.9940828402366864 }, "8": { "pass@1": 0.9534755608974359, "pass@2": 0.9758759684786995, "pass@4": 0.9847553208493165, "pass@8": 0.9897007291187125, "pass@16": 0.9935694000922215, "pass@32": 0.9965151373391712, "pass@64": 0.998300039303945, "pass@128": 1.0 }, "9": { "pass@1": 0.9408450704225352, "pass@2": 0.9764160751913055, "pass@4": 0.9897490211610868, "pass@8": 0.9954928597779902, "pass@16": 0.9986146363237696, "pass@32": 0.9998472651017836, "pass@64": 0.9999990904114461, "pass@128": 1.0 }, "11": { "pass@1": 0.7521689093484419, "pass@2": 0.8267172826838576, "pass@4": 0.8749963240253604, "pass@8": 0.9072539747181511, "pass@16": 0.9305891455160197, "pass@32": 0.9482171914771189, "pass@64": 0.9598296757598728, "pass@128": 0.9660056657223796 }, "12": { "pass@1": 0.4268626412429379, "pass@2": 0.5226548250589441, "pass@4": 0.5947451512946471, "pass@8": 0.6499540700619184, "pass@16": 0.697671407935311, "pass@32": 0.7391764349175001, "pass@64": 0.7728700105439386, "pass@128": 0.8022598870056498 }, "13": { "pass@1": 0.226048197492163, "pass@2": 0.2745025516500875, "pass@4": 0.32170087379359735, "pass@8": 0.36726090847720233, "pass@16": 0.4118837384913241, "pass@32": 0.4577964268716136, "pass@64": 0.5060004900301981, "pass@128": 0.554858934169279 }, "14": { "pass@1": 0.1970404984423676, "pass@2": 0.2447659871955257, "pass@4": 0.2905214605183697, "pass@8": 0.33432116638118264, "pass@16": 0.37689919457954874, "pass@32": 0.41849874775311363, "pass@64": 0.46201871752633594, "pass@128": 0.5109034267912772 }, "15": { "pass@1": 0.190774024566474, "pass@2": 0.24494681910245306, "pass@4": 0.29680882700356104, "pass@8": 0.3440605324830605, "pass@16": 0.38846742937996726, "pass@32": 0.43008799604881653, "pass@64": 0.4654618905817254, "pass@128": 0.4913294797687861 }, "16": { "pass@1": 0.1630796370967742, "pass@2": 0.1964297053594107, "pass@4": 0.23033826426938572, "pass@8": 0.2660781043146322, "pass@16": 0.3019958633560044, "pass@32": 0.33569879011745574, "pass@64": 0.3686862876847773, "pass@128": 0.4064516129032258 }, "17": { "pass@1": 0.1676300578034682, "pass@2": 0.20699551397296437, "pass@4": 0.2408987041359715, "pass@8": 0.271017564290959, "pass@16": 0.29876652985892266, "pass@32": 0.3264747152113181, "pass@64": 0.3549548508714773, "pass@128": 0.38439306358381503 }, "18": { "pass@1": 0.1616517857142857, "pass@2": 0.19595648200224966, "pass@4": 0.23106420322459695, "pass@8": 0.26888622832851616, "pass@16": 0.3071732341417159, "pass@32": 0.3420263056744111, "pass@64": 0.3714714335108666, "pass@128": 0.39714285714285713 }, "19": { "pass@1": 0.15040822072072071, "pass@2": 0.1915057281691139, "pass@4": 0.2273034600967671, "pass@8": 0.2598221642368353, "pass@16": 0.2905735616301428, "pass@32": 0.32042698971733125, "pass@64": 0.35124146512270843, "pass@128": 0.3843843843843844 }, "20": { "pass@1": 0.15052552552552553, "pass@2": 0.18885223806483653, "pass@4": 0.22942468621602477, "pass@8": 0.26979999128432536, "pass@16": 0.30689533475643643, "pass@32": 0.3397581402934893, "pass@64": 0.370991289408613, "pass@128": 0.4024024024024024 } } }, "teachers_in_school": { "count": 821120, "correct": 483803, "answer_accuracy": 0.5891988990646921, "avg_response_len": 242.341766124318, "resp_tokens_sum": 198991671, "pass_at_k": { "pass@1": 0.5891988990646921, "pass@2": 0.6227068003142299, "pass@4": 0.650107249088997, "pass@8": 0.6741994105674554, "pass@16": 0.6956359816626693, "pass@32": 0.7149534992963424, "pass@64": 0.7325456550204769, "pass@128": 0.7480904130943102 }, "per_op_pass_at_k": { "10": { "pass@1": 0.8878930214723927, "pass@2": 0.9240016333751996, "pass@4": 0.9427751138086263, "pass@8": 0.9559837013558444, "pass@16": 0.9669589357176731, "pass@32": 0.9757400698048759, "pass@64": 0.9820426961144421, "pass@128": 0.9877300613496932 }, "2": { "pass@1": 0.9987177051671733, "pass@2": 0.9996395064978579, "pass@4": 0.9999613177152248, "pass@8": 0.9999996232076699, "pass@16": 0.9999999999864305, "pass@32": 1.0, "pass@64": 1.0, "pass@128": 1.0 }, "3": { "pass@1": 0.9907670454545454, "pass@2": 0.9960163892865667, "pass@4": 0.9985778993534897, "pass@8": 0.9996915957072215, "pass@16": 0.9999815253902985, "pass@32": 0.99999995938087, "pass@64": 0.9999999999999987, "pass@128": 1.0 }, "4": { "pass@1": 0.9982664571005917, "pass@2": 0.9998132687648512, "pass@4": 0.999995557138198, "pass@8": 0.9999999967656203, "pass@16": 0.9999999999999998, "pass@32": 1.0, "pass@64": 1.0, "pass@128": 1.0 }, "5": { "pass@1": 0.9600317028985508, "pass@2": 0.9691975493552438, "pass@4": 0.9763000869456536, "pass@8": 0.9828931121195829, "pass@16": 0.989036439654314, "pass@32": 0.9944838684919953, "pass@64": 0.9981653755234783, "pass@128": 1.0 }, "6": { "pass@1": 0.9823379297994269, "pass@2": 0.9910194058389551, "pass@4": 0.9946701218221643, "pass@8": 0.9965445299840145, "pass@16": 0.9976712739922713, "pass@32": 0.9983904225804993, "pass@64": 0.9992893080034889, "pass@128": 1.0 }, "7": { "pass@1": 0.9681855130057804, "pass@2": 0.9832421146056164, "pass@4": 0.9914642009163593, "pass@8": 0.9964758591421767, "pass@16": 0.9990374180123244, "pass@32": 0.9999126056145744, "pass@64": 0.9999997855847184, "pass@128": 1.0 }, "8": { "pass@1": 0.9455765845070423, "pass@2": 0.9712158284351782, "pass@4": 0.9835400283591312, "pass@8": 0.9910390348371692, "pass@16": 0.9955726997113149, "pass@32": 0.9980539040490879, "pass@64": 0.9995741614459047, "pass@128": 1.0 }, "9": { "pass@1": 0.9390437874251497, "pass@2": 0.9670264539818006, "pass@4": 0.9783135702722786, "pass@8": 0.9840906736702117, "pass@16": 0.9879928605789021, "pass@32": 0.9908513468886352, "pass@64": 0.9930496212972972, "pass@128": 0.9940119760479041 }, "11": { "pass@1": 0.7309864457831325, "pass@2": 0.811214839673655, "pass@4": 0.8645121111555325, "pass@8": 0.9001346785530842, "pass@16": 0.9235781504819824, "pass@32": 0.939017807747773, "pass@64": 0.9485292552001391, "pass@128": 0.9548192771084337 }, "12": { "pass@1": 0.439042907523511, "pass@2": 0.5317378835188704, "pass@4": 0.6075961422729685, "pass@8": 0.6683062360710545, "pass@16": 0.7177272091161964, "pass@32": 0.7598458853246168, "pass@64": 0.7942021269976, "pass@128": 0.8213166144200627 }, "13": { "pass@1": 0.19227065826330533, "pass@2": 0.23474754074858276, "pass@4": 0.27791213361985206, "pass@8": 0.32123267772057174, "pass@16": 0.36547236780648024, "pass@32": 0.412591392141311, "pass@64": 0.46083134462740805, "pass@128": 0.5042016806722689 }, "14": { "pass@1": 0.20837902046783627, "pass@2": 0.2522699670764838, "pass@4": 0.2964683594923442, "pass@8": 0.3416618793696011, "pass@16": 0.3841336779597966, "pass@32": 0.42375806001618826, "pass@64": 0.4632207639904677, "pass@128": 0.5029239766081871 }, "15": { "pass@1": 0.19093276515151514, "pass@2": 0.24199400501073715, "pass@4": 0.2891956014020975, "pass@8": 0.3326345282383242, "pass@16": 0.37366558323041027, "pass@32": 0.4128573661812689, "pass@64": 0.4483166035224402, "pass@128": 0.4727272727272727 }, "16": { "pass@1": 0.15642806267806267, "pass@2": 0.1904278691926329, "pass@4": 0.22132350069489173, "pass@8": 0.251268169779741, "pass@16": 0.28180434335829546, "pass@32": 0.3122157586188942, "pass@64": 0.34211985927431315, "pass@128": 0.3732193732193732 }, "17": { "pass@1": 0.16779891304347827, "pass@2": 0.21377104526336374, "pass@4": 0.2616579632708955, "pass@8": 0.3084358591590647, "pass@16": 0.3485515761094292, "pass@32": 0.38192734451252297, "pass@64": 0.4116145438455866, "pass@128": 0.43788819875776397 }, "18": { "pass@1": 0.1518612132352941, "pass@2": 0.1958187384205651, "pass@4": 0.23828893419572547, "pass@8": 0.27971765090438966, "pass@16": 0.31909231139592203, "pass@32": 0.3541919542192856, "pass@64": 0.3879892602530647, "pass@128": 0.4235294117647059 }, "19": { "pass@1": 0.14004371279761904, "pass@2": 0.17627703763592056, "pass@4": 0.21033196631671033, "pass@8": 0.24241868591666452, "pass@16": 0.2725452956766527, "pass@32": 0.3023936104609465, "pass@64": 0.331830773125219, "pass@128": 0.3601190476190476 }, "20": { "pass@1": 0.1367421407185629, "pass@2": 0.17771825934744678, "pass@4": 0.21898993408009634, "pass@8": 0.260066157867715, "pass@16": 0.2989195527821917, "pass@32": 0.3334463816599912, "pass@64": 0.3631967204725436, "pass@128": 0.38622754491017963 } } }, "movie_festival_awards": { "count": 801152, "correct": 471310, "answer_accuracy": 0.5882903618788944, "avg_response_len": 258.0569242790382, "resp_tokens_sum": 206742821, "pass_at_k": { "pass@1": 0.5882903618788944, "pass@2": 0.6212685622467475, "pass@4": 0.6478714293112718, "pass@8": 0.6709216329453926, "pass@16": 0.6916683710373397, "pass@32": 0.7106315691905148, "pass@64": 0.728740000525103, "pass@128": 0.7466048889598977 }, "per_op_pass_at_k": { "10": { "pass@1": 0.8983908582089553, "pass@2": 0.9340132506757551, "pass@4": 0.9530650669599133, "pass@8": 0.9652801856862316, "pass@16": 0.9745693164759084, "pass@32": 0.9818461284408381, "pass@64": 0.9871328155545905, "pass@128": 0.991044776119403 }, "2": { "pass@1": 0.9998660714285714, "pass@2": 0.9999992969628797, "pass@4": 1.0, "pass@8": 1.0, "pass@16": 1.0, "pass@32": 1.0, "pass@64": 1.0, "pass@128": 1.0 }, "3": { "pass@1": 0.9907069970845481, "pass@2": 0.9970669721769471, "pass@4": 0.9993563997831172, "pass@8": 0.9999514691641971, "pass@16": 0.9999996499221584, "pass@32": 0.9999999999946803, "pass@64": 1.0, "pass@128": 1.0 }, "4": { "pass@1": 0.9958196271929824, "pass@2": 0.9977559342911083, "pass@4": 0.9990211264710332, "pass@8": 0.9997663197662907, "pass@16": 0.9999856176620865, "pass@32": 0.9999999740185194, "pass@64": 0.9999999999999997, "pass@128": 1.0 }, "5": { "pass@1": 0.9548943014705882, "pass@2": 0.9685079174386292, "pass@4": 0.9784535587463331, "pass@8": 0.9869293632462349, "pass@16": 0.9934826994856965, "pass@32": 0.9970310861536675, "pass@64": 0.9985066302193607, "pass@128": 1.0 }, "6": { "pass@1": 0.9775483044164038, "pass@2": 0.9891293716932861, "pass@4": 0.9959075131381444, "pass@8": 0.9991092920224423, "pass@16": 0.9999352040645176, "pass@32": 0.9999996662295679, "pass@64": 0.9999999999997393, "pass@128": 1.0 }, "7": { "pass@1": 0.9744115901898734, "pass@2": 0.9867456923402768, "pass@4": 0.9912063912303684, "pass@8": 0.9935151685224703, "pass@16": 0.9954041601504686, "pass@32": 0.9972167179953726, "pass@64": 0.9990265956175474, "pass@128": 1.0 }, "8": { "pass@1": 0.9515531156156156, "pass@2": 0.9743795222387744, "pass@4": 0.9852795146102226, "pass@8": 0.9899431408996546, "pass@16": 0.9925003924523234, "pass@32": 0.9950902478963195, "pass@64": 0.9976578190444356, "pass@128": 1.0 }, "9": { "pass@1": 0.9379521704180064, "pass@2": 0.9720591912043948, "pass@4": 0.9858236963747689, "pass@8": 0.9932814976290268, "pass@16": 0.9968830356374474, "pass@32": 0.9981725276413653, "pass@64": 0.9992024635603814, "pass@128": 1.0 }, "11": { "pass@1": 0.7650545634920635, "pass@2": 0.8431473409573809, "pass@4": 0.8877359086066622, "pass@8": 0.9145899228176874, "pass@16": 0.9334761048558039, "pass@32": 0.9489928847114996, "pass@64": 0.9642010673849086, "pass@128": 0.9809523809523809 }, "12": { "pass@1": 0.4054615825688073, "pass@2": 0.49957183534397626, "pass@4": 0.5768995727598271, "pass@8": 0.639355428961972, "pass@16": 0.6924379381519293, "pass@32": 0.7389465717283092, "pass@64": 0.7833812207530858, "pass@128": 0.8256880733944955 }, "13": { "pass@1": 0.21805073302469136, "pass@2": 0.2614506628511712, "pass@4": 0.30509700407819385, "pass@8": 0.34843945585161107, "pass@16": 0.3892841325907048, "pass@32": 0.42707158587605587, "pass@64": 0.46350207303509766, "pass@128": 0.5 }, "14": { "pass@1": 0.19482566765578635, "pass@2": 0.24658432440010283, "pass@4": 0.2977982871762691, "pass@8": 0.34710213397424194, "pass@16": 0.39305571585018145, "pass@32": 0.4354334888531387, "pass@64": 0.4760201367236571, "pass@128": 0.516320474777448 }, "15": { "pass@1": 0.15263310185185186, "pass@2": 0.20657633724603863, "pass@4": 0.26040626837154607, "pass@8": 0.31066200412971595, "pass@16": 0.3575054534309268, "pass@32": 0.40181191645690056, "pass@64": 0.4460592807353767, "pass@128": 0.49074074074074076 }, "16": { "pass@1": 0.15613477138643067, "pass@2": 0.1863313822497852, "pass@4": 0.21273005393131156, "pass@8": 0.23865739981671225, "pass@16": 0.2676959517806339, "pass@32": 0.3011310763692745, "pass@64": 0.3362407178410369, "pass@128": 0.37168141592920356 }, "17": { "pass@1": 0.1378012048192771, "pass@2": 0.17401619272365054, "pass@4": 0.21124320340981484, "pass@8": 0.24873698272180494, "pass@16": 0.28468715470923933, "pass@32": 0.31684364503562207, "pass@64": 0.3460631347877581, "pass@128": 0.37650602409638556 }, "18": { "pass@1": 0.16940524193548387, "pass@2": 0.20793830962661927, "pass@4": 0.24442847708552565, "pass@8": 0.27696590133200394, "pass@16": 0.3047539165997947, "pass@32": 0.329140512075412, "pass@64": 0.3525837977178592, "pass@128": 0.3774193548387097 }, "19": { "pass@1": 0.13812311178247735, "pass@2": 0.16759305790137258, "pass@4": 0.19758813267676884, "pass@8": 0.2284911534750015, "pass@16": 0.2603002938970115, "pass@32": 0.29249731296624343, "pass@64": 0.3260332407783438, "pass@128": 0.36253776435045315 }, "20": { "pass@1": 0.13072447447447447, "pass@2": 0.16936079780567978, "pass@4": 0.2094049935762534, "pass@8": 0.25098284669882537, "pass@16": 0.291417867530838, "pass@32": 0.32739810777438544, "pass@64": 0.35806392636422313, "pass@128": 0.3813813813813814 } } } } } }