commit f99638682e5dca46b78cf7a19f28c1620b2c9769 Author: ModelHub XC Date: Fri May 22 12:00:17 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: ai-forever/Pollux-4B-Judge Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 
0000000..5d1cc9b --- /dev/null +++ b/README.md @@ -0,0 +1,259 @@ +--- +license: mit +language: +- ru +base_model: +- Qwen/Qwen3-4B +pipeline_tag: text-generation +library_name: transformers +tags: +- pytorch +metrics: +- rmse +- pearsonr +- f1 +--- +# Pollux-4B-Judge + + + +![banner](logo_pollux_horiz_short_WHITEBG.png) + +Pollux-4B-Judge is a 4-billion parameter generative language model specifically designed to evaluate the quality of other language models' responses in Russian. +The model assesses answer quality given input instruction, specific criteria and rubrics, providing automated LLM performance evaluation for Russian-language tasks. + +## Model Details + +### Model Description + + + +Pollux-4B-Judge is an integral component of the POLLUX project, a comprehensive initiative dedicated to evaluating the generative capabilities of Large Language Models (LLMs). +At the heart of this project lies the [POLLUX dataset](https://huggingface.co/datasets/ai-forever/POLLUX), which introduces systematic taxonomies for both generative tasks and evaluation criteria, providing quantitative and qualitative assessments of responses from top-tier LLMs. + +Built upon the [Qwen/Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B) architecture, Pollux-4B-Judge is a decoder-based 4 billion parameter model trained in a sequence-to-sequence fashion. +The model is designed to predict both numerical scores and detailed textual rationales based on the original instruction, the LLM's response, specific evaluation criteria, scoring rubrics, and reference answers when available. + +While the model is technically capable of processing any type of instruction and criterion when properly formatted, its training has been specifically optimized using the generative tasks and evaluation criteria derived from the taxonomies established within the [POLLUX dataset](https://huggingface.co/datasets/ai-forever/POLLUX). 
+ + +- **Model type:** decoder +- **Language(s) (NLP):** Russian +- **License:** MIT +- **Finetuned from model:** [Qwen/Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B) + +### Model Sources + + + +- **Repository:** [POLLUX code base](https://github.com/ai-forever/POLLUX) +- **Paper:** [ArXiv preprint](https://arxiv.org/pdf/2505.24616) + +## Uses + + + +### Direct Use + + + +Pollux-4B-Judge is specifically designed for assessing text responses against a single, predefined criterion per evaluation run. +The model operates optimally when provided with all essential components: the source instruction, the response to be evaluated (typically generated by another LLM), the specific evaluation criterion, and its corresponding scoring rubrics. + + +### Out-of-Scope Use + + + +While the model may **technically** process multiple criteria simultaneously, such usage falls outside its intended design and may yield unpredictable results. +Similarly, the model is not designed to function autonomously in determining appropriate evaluation criteria—it requires explicit criterion specification to perform reliable assessments. + +For optimal performance and reliable results, users should structure each evaluation session around one criterion at a time, providing all necessary contextual components to enable the model's comprehensive scoring and rationale generation capabilities. + + +## MODEL OUTPUT DISCLAIMER AND LIMITATION OF LIABILITY + + + +All content, responses, and outputs generated by Pollux-4B-Judge (the "Model") are produced through automated computational processes based on statistical patterns learned from pre-training data. +Such outputs do not constitute statements, opinions, recommendations, or positions of the model developers, publishers, or affiliated entities (collectively, the "Developers"). + +The Model's outputs do not represent, reflect, or endorse any views, beliefs, policies, or positions held by the Developers. 
+Generated content should not be interpreted as official statements, advice, or guidance from the Developers. + +While the Developers employed appropriate data curation practices during fine-tuning and avoided the intentional inclusion of inappropriate content, the Model's responses may reflect patterns present in the underlying pre-training datasets, which were sourced from publicly available internet content and other large-scale text corpora. + +The Developers expressly disclaim responsibility for any content generated by the Model. Users acknowledge that: +- Generated outputs are probabilistic and may contain inaccuracies, biases, or inappropriate content +- The Developers cannot guarantee the accuracy, completeness, or appropriateness of any Model output +- Users assume full responsibility for evaluating and using Model-generated content + +Users are solely responsible for reviewing, validating, and determining the appropriateness of any Model-generated content before use or distribution. + + +## How to Get Started with the Model + +Install the required packages: + +```bash +pip install vllm==0.19.0 +pip install transformers==5.3.0 +pip install openai +``` + +Start the vLLM OpenAI-compatible server: + +```bash +vllm serve ai-forever/Pollux-4B-Judge --tensor-parallel-size 1 --reasoning-parser qwen3 +``` + +Use the code below to send requests to the running server: + +```python +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="EMPTY", +) + +PROMPT_TEMPLATE = """### Задание для оценки: +{instruction} + +### Эталонный ответ: +{reference_answer} + +### Ответ для оценки: +{answer} + +### Критерий оценки: +{criteria_name} + +### Шкала оценивания по критерию: +{criteria_rubrics} +""" + +instruction = "Сколько будет 2+2?" +reference_answer = "4" +answer = "Будет 4" +criteria_name = "Правильность ответа" +criteria_rubrics = """0: Дан неправильный ответ или ответ отсутствует. 
+ +1: Ответ модели неполный (не на все вопросы задания получен ответ, в формулировке ответа отсутствует часть информации). + +2: Ответ модели совпадает с эталонным или эквивалентен ему.""" + +prompt = PROMPT_TEMPLATE.format( + instruction=instruction, + reference_answer=reference_answer, + answer=answer, + criteria_name=criteria_name, + criteria_rubrics=criteria_rubrics, +) +response = client.chat.completions.create( + model="ai-forever/Pollux-4B-Judge", + messages=[{"role": "user", "content": prompt}], + max_tokens=512, + temperature=0.0, +) + +score = response.choices[0].message.content.strip() +reasoning = response.choices[0].message.reasoning + +print(score) +print(reasoning) +``` + +## Training Details + +### Training Data + + + +Given the substantial time investment required for manual dataset creation—approximately 24,447 hours for the [POLLUX dataset](https://huggingface.co/datasets/ai-forever/POLLUX)—we opted to employ synthetic data for training purposes, as acquiring a manually composed training set of comparable size was not feasible. + +Our synthetic data generation process proceeded in several stages. +Initially, we generated 78,000 instructions using three state-of-the-art language models: [DeepSeekV3](https://huggingface.co/deepseek-ai/DeepSeek-V3), [OpenAI GPT-4o](https://openai.com/index/hello-gpt-4o/), and [o3-mini](https://openai.com/index/openai-o3-mini/), with each model contributing equally to the instruction pool. +These instructions were based on the POLLUX tasks taxonomy and complexity levels to ensure consistency with the original framework. +Training data does not include Recommendations, Applied Brainstorming, Literary Text Generation, Questions Generation, Style +Transfer, Code Modification, and AI as a Character tasks alongside corresponding Task-specific criteria to enable out-of-domain evaluation of the resulting LM-as-a-Judge model. 
+To maintain data quality, we implemented a filtering procedure that removed instructions containing more than 5% non-Russian tokens as well as duplicate entries, ultimately yielding a refined set of 26,000 high-quality instructions. + +Subsequently, we mapped these synthetic instructions to their corresponding evaluation criteria sets using the same algorithm employed in the original [POLLUX dataset](https://huggingface.co/datasets/ai-forever/POLLUX). +Each criteria set comprised Critical, General, Subjective, and relevant Domain- and Task-specific criteria (for detailed methodology, see Section 2.3 in the [preprint](https://arxiv.org/pdf/2505.24616)). +To generate diverse responses, we employed 15 open-source language models from various families, including Llama, Phi, Qwen, Mistral, and Gemma, with each model contributing equally to the answer generation process (for the complete listing of the models, see Appendix M.2 in the [preprint](https://arxiv.org/pdf/2505.24616)). + +For criteria annotation, we utilized [DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which generated numerical scores based on established criterion rubrics along with corresponding rationales for each evaluation. +This systematic approach resulted in 8,000,000 samples, each containing the complete tuple of (instruction, answer, criterion, score, rationale). +From this dataset, we performed stratified random sampling across tasks to obtain our final training set of 1,000,000 samples, ensuring balanced representation across different task categories. + + +### Training Procedure + + + +The model was trained in a sequence-to-sequence fashion. +The input includes the source instruction, the LLM's answer, the name of the criterion, its rubrics, and a reference answer if present. +The output is expected to be a numerical score from the provided rubrics and a textual explanation. 
+ +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +For testing data we employed the [POLLUX dataset](https://huggingface.co/datasets/ai-forever/POLLUX). +Note this provides both in- and out-of-domain evaluation as some of the tasks and criteria are absent from the training data. + +#### Metrics + + + +We employed **RMSE**, **macro F1**, and **Spearman’s rank correlation** with expert judgements to assess the performance of Pollux-4B-Judge and compare it with that of the reference models. + +RMSE offers a high degree of interpretability, as it is measured on the same scale as the annotation – specifically, in points. +Macro F1 captures score classification quality across labels, while Spearman’s rank correlation allows us to quantify the degree of monotonic association between the two rankings of model outputs and +to demonstrate how consistently the LLM-as-Judge reproduces the relative ordering of output quality as established by human experts. + + +### Results + +We report aggregate evaluation results in the table below. 
+ +| Model | RMSE↓ | macro F1↑ | Spearman’s rank correlation↑ | +| --- | --- | --- | --- | +| Pollux-4B-Judge | 0.568 | 0.705 | 0.744 | +| Pollux-7B-Judge-Base | 0.703 | 0.406 | 0.572 | +| Pollux-32B-Judge-Base | 0.700 | 0.546 | 0.578 | +| Qwen/Qwen3.5-4B | 0.722 | 0.436 | 0.582 | +| Qwen/Qwen3.5-35B-A3B | 0.620 | 0.473 | 0.664 | +| Qwen/Qwen3.5-122B-A10B | 0.613 | 0.475 | 0.669 | +| gpt-oss-120b | 0.654 | 0.462 | 0.635 | +| Minimax-M2.5 (229B) | 0.651 | 0.617 | 0.635 | +| Qwen/Qwen3.5-397B-A17B | 0.600 | 0.481 | 0.684 | +| Kimi-K2.5 (1.1T) | 0.612 | 0.478 | 0.673 | +| GLM-4.7 (358B) | 0.626 | 0.479 | 0.678 | +| Gemma4-31b | 0.632 | 0.479 | 0.680 | + + +## Citation + + + +**BibTeX:** + +``` +@misc{martynov2025eyejudgementdissectingevaluation, + title={Eye of Judgement: Dissecting the Evaluation of Russian-speaking LLMs with POLLUX}, + author={Nikita Martynov and Anastasia Mordasheva and Dmitriy Gorbetskiy and Danil Astafurov and Ulyana Isaeva and Elina Basyrova and Sergey Skachkov and Victoria Berestova and Nikolay Ivanov and Valeriia Zanina and Alena Fenogenova}, + year={2025}, + eprint={2505.24616}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2505.24616}, +} +``` diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..01be9b3 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,89 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + 
messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if 
tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..716648e --- /dev/null +++ b/config.json @@ -0,0 +1,71 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + 
"model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.5.4", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..52bc634 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "5.5.4" +} diff --git a/logo_pollux_horiz_short_WHITEBG.png b/logo_pollux_horiz_short_WHITEBG.png new file mode 100644 index 0000000..0cb8cb8 Binary files /dev/null and b/logo_pollux_horiz_short_WHITEBG.png differ diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..38adffa --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94e852052b5170ac440838cd3e18477b14f3004e3ae70346c2787917a69f4036 +size 8044982080 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..c7afbed --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..be8885e --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "is_local": true, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": 
"Qwen2Tokenizer", + "unk_token": null +}