初始化项目,由ModelHub XC社区提供模型
Model: divelab/DAPO_E2H-countdown-gaussian_0p5_0p5 Source: Original Platform
This commit is contained in:
36
.gitattributes
vendored
Normal file
36
.gitattributes
vendored
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||||
89
.hydra/config.yaml
Normal file
89
.hydra/config.yaml
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
mode: train
|
||||||
|
experiment:
|
||||||
|
dataset_size: 6000
|
||||||
|
dataset_seed: 1234
|
||||||
|
test_size: 0.1
|
||||||
|
hf_token: ${oc.env:HF_TOKEN,null}
|
||||||
|
output:
|
||||||
|
root_path: ${oc.env:ROOT_PATH}
|
||||||
|
run_name: ${model.trim}_${task.name}_${algorithm.name}_${algorithm.training.curriculum_schedule}_${algorithm.training.scheduler_params.mu_exp}_${algorithm.training.scheduler_params.sigma}_SEC${algorithm.training.scheduler_params.vrex_adds.sec}DRO${algorithm.training.scheduler_params.vrex_adds.groupdro}G${algorithm.training.scheduler_params.vrex_adds.gaussian}_minp${algorithm.training.scheduler_params.min_prob}${ckpt2short:${algorithm.training.resume_from_checkpoint}}_${algorithm.training.max_steps}
|
||||||
|
lora:
|
||||||
|
r: 32
|
||||||
|
alpha: 64
|
||||||
|
dropout: 0.1
|
||||||
|
target_modules:
|
||||||
|
- q_proj
|
||||||
|
- v_proj
|
||||||
|
task_type: CAUSAL_LM
|
||||||
|
occupy_gpu_memory: false
|
||||||
|
occupy_gpu_memory_gb: 50
|
||||||
|
gpu_device: cuda:0
|
||||||
|
model:
|
||||||
|
family: Qwen
|
||||||
|
trim: Qwen2.5-1.5B-Instruct
|
||||||
|
name: ${model.family}/${model.trim}
|
||||||
|
trust_remote_code: true
|
||||||
|
torch_dtype: bfloat16
|
||||||
|
attn_implementation: flash_attention_2
|
||||||
|
task:
|
||||||
|
name: countdown2345
|
||||||
|
data_files:
|
||||||
|
- citrinegui/countdown_n2t100_1-100
|
||||||
|
- citrinegui/countdown_n3t100_1-100
|
||||||
|
- citrinegui/countdown_n4t100_1-100
|
||||||
|
- citrinegui/countdown_n5t100_1-100
|
||||||
|
test_file: citrinegui/countdown_n6t100_1-100
|
||||||
|
force_redownload: false
|
||||||
|
train_size: 327680
|
||||||
|
test_size: 1024
|
||||||
|
training:
|
||||||
|
max_prompt_length: 1000
|
||||||
|
max_completion_length: 512
|
||||||
|
inference:
|
||||||
|
checkpoint: outputs/Qwen2.5-1.5B-Instruct_countdown2345_grpo_balanced_0.5_0.5_SEC0.3DRO1.0G0.0_minpTrue_1600/checkpoint-1600/
|
||||||
|
temperature: 0.0
|
||||||
|
sc_num: 1
|
||||||
|
pass_at_k: 1
|
||||||
|
resume: 0
|
||||||
|
max_new_tokens: 512
|
||||||
|
batch_size: 32
|
||||||
|
algorithm:
|
||||||
|
name: grpo
|
||||||
|
training:
|
||||||
|
resume_from_checkpoint: null
|
||||||
|
learning_rate: 1.0e-06
|
||||||
|
lr_scheduler_type: cosine
|
||||||
|
logging_steps: 10
|
||||||
|
max_steps: 1600
|
||||||
|
per_device_train_batch_size: 16
|
||||||
|
generation_batch_size: null
|
||||||
|
steps_per_generation: 1
|
||||||
|
gradient_accumulation_steps: 4
|
||||||
|
gradient_checkpointing: true
|
||||||
|
bf16: true
|
||||||
|
report_to:
|
||||||
|
- wandb
|
||||||
|
push_to_hub: true
|
||||||
|
save_strategy: steps
|
||||||
|
save_steps: ${algorithm.training.max_steps}
|
||||||
|
tf32: true
|
||||||
|
num_generations: 8
|
||||||
|
beta: 0.001
|
||||||
|
use_vllm: true
|
||||||
|
vllm_mode: colocate
|
||||||
|
vllm_gpu_memory_utilization: 0.3
|
||||||
|
vllm_server_port: 8000
|
||||||
|
curriculum: false
|
||||||
|
curriculum_schedule: gaussian
|
||||||
|
scheduler_params:
|
||||||
|
mu_exp: 0.5
|
||||||
|
sigma: 0.5
|
||||||
|
vrex_adds:
|
||||||
|
groupdro: 1.0
|
||||||
|
gaussian: 0.0
|
||||||
|
sec: 0.3
|
||||||
|
beta: 1.0
|
||||||
|
min_prob: true
|
||||||
|
td_alpha: 0.5
|
||||||
|
sec_temperature: 0.3
|
||||||
|
max_dapo_iter: 2
|
||||||
164
.hydra/hydra.yaml
Normal file
164
.hydra/hydra.yaml
Normal file
@@ -0,0 +1,164 @@
|
|||||||
|
hydra:
|
||||||
|
run:
|
||||||
|
dir: ${output.root_path}/outputs/${mode2name:${mode},${output.run_name},${model.trim}}
|
||||||
|
sweep:
|
||||||
|
dir: ${output.root_path}/multirun/${now:%Y%m%d}
|
||||||
|
subdir: ${hydra.job.override_dirname}
|
||||||
|
launcher:
|
||||||
|
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
||||||
|
sweeper:
|
||||||
|
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
||||||
|
max_batch_size: null
|
||||||
|
params: null
|
||||||
|
help:
|
||||||
|
app_name: ${hydra.job.name}
|
||||||
|
header: '${hydra.help.app_name} is powered by Hydra.
|
||||||
|
|
||||||
|
'
|
||||||
|
footer: 'Powered by Hydra (https://hydra.cc)
|
||||||
|
|
||||||
|
Use --hydra-help to view Hydra specific help
|
||||||
|
|
||||||
|
'
|
||||||
|
template: '${hydra.help.header}
|
||||||
|
|
||||||
|
== Configuration groups ==
|
||||||
|
|
||||||
|
Compose your configuration from those groups (group=option)
|
||||||
|
|
||||||
|
|
||||||
|
$APP_CONFIG_GROUPS
|
||||||
|
|
||||||
|
|
||||||
|
== Config ==
|
||||||
|
|
||||||
|
Override anything in the config (foo.bar=value)
|
||||||
|
|
||||||
|
|
||||||
|
$CONFIG
|
||||||
|
|
||||||
|
|
||||||
|
${hydra.help.footer}
|
||||||
|
|
||||||
|
'
|
||||||
|
hydra_help:
|
||||||
|
template: 'Hydra (${hydra.runtime.version})
|
||||||
|
|
||||||
|
See https://hydra.cc for more info.
|
||||||
|
|
||||||
|
|
||||||
|
== Flags ==
|
||||||
|
|
||||||
|
$FLAGS_HELP
|
||||||
|
|
||||||
|
|
||||||
|
== Configuration groups ==
|
||||||
|
|
||||||
|
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
||||||
|
to command line)
|
||||||
|
|
||||||
|
|
||||||
|
$HYDRA_CONFIG_GROUPS
|
||||||
|
|
||||||
|
|
||||||
|
Use ''--cfg hydra'' to Show the Hydra config.
|
||||||
|
|
||||||
|
'
|
||||||
|
hydra_help: ???
|
||||||
|
hydra_logging:
|
||||||
|
version: 1
|
||||||
|
formatters:
|
||||||
|
simple:
|
||||||
|
format: '[%(asctime)s][HYDRA] %(message)s'
|
||||||
|
handlers:
|
||||||
|
console:
|
||||||
|
class: logging.StreamHandler
|
||||||
|
formatter: simple
|
||||||
|
stream: ext://sys.stdout
|
||||||
|
root:
|
||||||
|
level: INFO
|
||||||
|
handlers:
|
||||||
|
- console
|
||||||
|
loggers:
|
||||||
|
logging_example:
|
||||||
|
level: DEBUG
|
||||||
|
disable_existing_loggers: false
|
||||||
|
job_logging:
|
||||||
|
version: 1
|
||||||
|
formatters:
|
||||||
|
simple:
|
||||||
|
format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
|
||||||
|
handlers:
|
||||||
|
console:
|
||||||
|
class: logging.StreamHandler
|
||||||
|
formatter: simple
|
||||||
|
stream: ext://sys.stdout
|
||||||
|
file:
|
||||||
|
class: logging.FileHandler
|
||||||
|
formatter: simple
|
||||||
|
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
|
||||||
|
root:
|
||||||
|
level: INFO
|
||||||
|
handlers:
|
||||||
|
- console
|
||||||
|
- file
|
||||||
|
disable_existing_loggers: false
|
||||||
|
env: {}
|
||||||
|
mode: RUN
|
||||||
|
searchpath: []
|
||||||
|
callbacks: {}
|
||||||
|
output_subdir: .hydra
|
||||||
|
overrides:
|
||||||
|
hydra:
|
||||||
|
- hydra.mode=RUN
|
||||||
|
task:
|
||||||
|
- mode=train
|
||||||
|
- task=countdown2345
|
||||||
|
- algorithm=grpo
|
||||||
|
- algorithm.training.curriculum_schedule=gaussian
|
||||||
|
- model=qwen15
|
||||||
|
- algorithm.training.max_steps=1600
|
||||||
|
- algorithm.training.vllm_mode=colocate
|
||||||
|
job:
|
||||||
|
name: main
|
||||||
|
chdir: false
|
||||||
|
override_dirname: algorithm.training.curriculum_schedule=gaussian,algorithm.training.max_steps=1600,algorithm.training.vllm_mode=colocate,algorithm=grpo,mode=train,model=qwen15,task=countdown2345
|
||||||
|
id: ???
|
||||||
|
num: ???
|
||||||
|
config_name: config
|
||||||
|
env_set: {}
|
||||||
|
env_copy: []
|
||||||
|
config:
|
||||||
|
override_dirname:
|
||||||
|
kv_sep: '='
|
||||||
|
item_sep: ','
|
||||||
|
exclude_keys: []
|
||||||
|
runtime:
|
||||||
|
version: 1.3.2
|
||||||
|
version_base: '1.3'
|
||||||
|
cwd: /mnt/data/shared/shparashar/Sys2Bench
|
||||||
|
config_sources:
|
||||||
|
- path: hydra.conf
|
||||||
|
schema: pkg
|
||||||
|
provider: hydra
|
||||||
|
- path: /mnt/data/shared/shparashar/Sys2Bench/methods/RL/conf
|
||||||
|
schema: file
|
||||||
|
provider: main
|
||||||
|
- path: ''
|
||||||
|
schema: structured
|
||||||
|
provider: schema
|
||||||
|
output_dir: /mnt/data/shared/shparashar/Sys2Bench/outputs/Qwen2.5-1.5B-Instruct_countdown2345_grpo_gaussian_0.5_0.5_SEC0.3DRO1.0G0.0_minpTrue_1600
|
||||||
|
choices:
|
||||||
|
algorithm: grpo
|
||||||
|
task: countdown2345
|
||||||
|
model: qwen15
|
||||||
|
hydra/env: default
|
||||||
|
hydra/callbacks: null
|
||||||
|
hydra/job_logging: default
|
||||||
|
hydra/hydra_logging: default
|
||||||
|
hydra/hydra_help: default
|
||||||
|
hydra/help: default
|
||||||
|
hydra/sweeper: basic
|
||||||
|
hydra/launcher: basic
|
||||||
|
hydra/output: default
|
||||||
|
verbose: false
|
||||||
7
.hydra/overrides.yaml
Normal file
7
.hydra/overrides.yaml
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
- mode=train
|
||||||
|
- task=countdown2345
|
||||||
|
- algorithm=grpo
|
||||||
|
- algorithm.training.curriculum_schedule=gaussian
|
||||||
|
- model=qwen15
|
||||||
|
- algorithm.training.max_steps=1600
|
||||||
|
- algorithm.training.vllm_mode=colocate
|
||||||
56
README.md
Normal file
56
README.md
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
---
|
||||||
|
base_model: Qwen/Qwen2.5-1.5B-Instruct
|
||||||
|
datasets: gsm8k-dataset
|
||||||
|
library_name: transformers
|
||||||
|
model_name: Qwen2.5-1.5B-Instruct_math_grpo_cosine_0.5_0.5_SEC0.3DRO1.0G0.0_minpTrue_1600
|
||||||
|
tags:
|
||||||
|
- generated_from_trainer
|
||||||
|
- trl
|
||||||
|
- grpo
|
||||||
|
licence: license
|
||||||
|
---
|
||||||
|
|
||||||
|
# Model Card for Qwen2.5-1.5B-Instruct_math_grpo_cosine_0.5_0.5_SEC0.3DRO1.0G0.0_minpTrue_1600
|
||||||
|
|
||||||
|
This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) on the Countdown dataset.
|
||||||
|
It has been trained using [E2H](https://github.com/divelab/E2H-Reasoning) on top of [TRL](https://github.com/huggingface/trl).
|
||||||
|
|
||||||
|
## Quick start
|
||||||
|
|
||||||
|
```python
|
||||||
|
from transformers import pipeline
|
||||||
|
|
||||||
|
question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
|
||||||
|
generator = pipeline("text-generation", model="shubhamprshr/Qwen2.5-1.5B-Instruct_math_grpo_cosine_0.5_0.5_SEC0.3DRO1.0G0.0_minpTrue_1600", device="cuda")
|
||||||
|
output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
|
||||||
|
print(output["generated_text"])
|
||||||
|
```
|
||||||
|
|
||||||
|
## Training procedure
|
||||||
|
|
||||||
|
[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/shubhamprshr27-tamu/dapo_e2h/runs/upy1drqf)
|
||||||
|
|
||||||
|
|
||||||
|
This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
|
||||||
|
|
||||||
|
### Framework versions
|
||||||
|
|
||||||
|
- TRL: 0.19.1
|
||||||
|
- Transformers: 4.53.1
|
||||||
|
- Pytorch: 2.7.0
|
||||||
|
- Datasets: 3.6.0
|
||||||
|
- Tokenizers: 0.21.4
|
||||||
|
|
||||||
|
## Citations
|
||||||
|
|
||||||
|
Cite E2H as:
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
@inproceedings{parashar2026curriculum,
|
||||||
|
title = {Curriculum Reinforcement Learning from Easy to Hard Tasks Improves {LLM} Reasoning},
|
||||||
|
author = {Parashar, Shubham and Gui, Shurui and Li, Xiner and Ling, Hongyi and Vemuri, Sushil and Olson, Blake and Li, Eric and Zhang, Yu and Caverlee, James and Kalathil, Dileep and Ji, Shuiwang},
|
||||||
|
booktitle = {The Fourteenth International Conference on Learning Representations},
|
||||||
|
year = {2026},
|
||||||
|
url = {https://openreview.net/forum?id=KJvHnl3kUv}
|
||||||
|
}
|
||||||
|
```
|
||||||
24
added_tokens.json
Normal file
24
added_tokens.json
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
{
|
||||||
|
"</tool_call>": 151658,
|
||||||
|
"<tool_call>": 151657,
|
||||||
|
"<|box_end|>": 151649,
|
||||||
|
"<|box_start|>": 151648,
|
||||||
|
"<|endoftext|>": 151643,
|
||||||
|
"<|file_sep|>": 151664,
|
||||||
|
"<|fim_middle|>": 151660,
|
||||||
|
"<|fim_pad|>": 151662,
|
||||||
|
"<|fim_prefix|>": 151659,
|
||||||
|
"<|fim_suffix|>": 151661,
|
||||||
|
"<|im_end|>": 151645,
|
||||||
|
"<|im_start|>": 151644,
|
||||||
|
"<|image_pad|>": 151655,
|
||||||
|
"<|object_ref_end|>": 151647,
|
||||||
|
"<|object_ref_start|>": 151646,
|
||||||
|
"<|quad_end|>": 151651,
|
||||||
|
"<|quad_start|>": 151650,
|
||||||
|
"<|repo_name|>": 151663,
|
||||||
|
"<|video_pad|>": 151656,
|
||||||
|
"<|vision_end|>": 151653,
|
||||||
|
"<|vision_pad|>": 151654,
|
||||||
|
"<|vision_start|>": 151652
|
||||||
|
}
|
||||||
54
chat_template.jinja
Normal file
54
chat_template.jinja
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
{%- if tools %}
|
||||||
|
{{- '<|im_start|>system\n' }}
|
||||||
|
{%- if messages[0]['role'] == 'system' %}
|
||||||
|
{{- messages[0]['content'] }}
|
||||||
|
{%- else %}
|
||||||
|
{{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
|
||||||
|
{%- endif %}
|
||||||
|
{{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
||||||
|
{%- for tool in tools %}
|
||||||
|
{{- "\n" }}
|
||||||
|
{{- tool | tojson }}
|
||||||
|
{%- endfor %}
|
||||||
|
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
||||||
|
{%- else %}
|
||||||
|
{%- if messages[0]['role'] == 'system' %}
|
||||||
|
{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
|
||||||
|
{%- else %}
|
||||||
|
{{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- for message in messages %}
|
||||||
|
{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
|
||||||
|
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
||||||
|
{%- elif message.role == "assistant" %}
|
||||||
|
{{- '<|im_start|>' + message.role }}
|
||||||
|
{%- if message.content %}
|
||||||
|
{{- '\n' + message.content }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- for tool_call in message.tool_calls %}
|
||||||
|
{%- if tool_call.function is defined %}
|
||||||
|
{%- set tool_call = tool_call.function %}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '\n<tool_call>\n{"name": "' }}
|
||||||
|
{{- tool_call.name }}
|
||||||
|
{{- '", "arguments": ' }}
|
||||||
|
{{- tool_call.arguments | tojson }}
|
||||||
|
{{- '}\n</tool_call>' }}
|
||||||
|
{%- endfor %}
|
||||||
|
{{- '<|im_end|>\n' }}
|
||||||
|
{%- elif message.role == "tool" %}
|
||||||
|
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
|
||||||
|
{{- '<|im_start|>user' }}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '\n<tool_response>\n' }}
|
||||||
|
{{- message.content }}
|
||||||
|
{{- '\n</tool_response>' }}
|
||||||
|
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
||||||
|
{{- '<|im_end|>\n' }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endfor %}
|
||||||
|
{%- if add_generation_prompt %}
|
||||||
|
{{- '<|im_start|>assistant\n' }}
|
||||||
|
{%- endif %}
|
||||||
58
config.json
Normal file
58
config.json
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
{
|
||||||
|
"architectures": [
|
||||||
|
"Qwen2ForCausalLM"
|
||||||
|
],
|
||||||
|
"attention_dropout": 0.0,
|
||||||
|
"bos_token_id": 151643,
|
||||||
|
"eos_token_id": 151645,
|
||||||
|
"hidden_act": "silu",
|
||||||
|
"hidden_size": 1536,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"intermediate_size": 8960,
|
||||||
|
"layer_types": [
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention"
|
||||||
|
],
|
||||||
|
"max_position_embeddings": 32768,
|
||||||
|
"max_window_layers": 21,
|
||||||
|
"model_type": "qwen2",
|
||||||
|
"num_attention_heads": 12,
|
||||||
|
"num_hidden_layers": 28,
|
||||||
|
"num_key_value_heads": 2,
|
||||||
|
"rms_norm_eps": 1e-06,
|
||||||
|
"rope_scaling": null,
|
||||||
|
"rope_theta": 1000000.0,
|
||||||
|
"sliding_window": null,
|
||||||
|
"tie_word_embeddings": true,
|
||||||
|
"torch_dtype": "bfloat16",
|
||||||
|
"transformers_version": "4.53.1",
|
||||||
|
"use_cache": false,
|
||||||
|
"use_sliding_window": false,
|
||||||
|
"vocab_size": 151936
|
||||||
|
}
|
||||||
14
generation_config.json
Normal file
14
generation_config.json
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
{
|
||||||
|
"bos_token_id": 151643,
|
||||||
|
"do_sample": true,
|
||||||
|
"eos_token_id": [
|
||||||
|
151645,
|
||||||
|
151643
|
||||||
|
],
|
||||||
|
"pad_token_id": 151643,
|
||||||
|
"repetition_penalty": 1.1,
|
||||||
|
"temperature": 0.7,
|
||||||
|
"top_k": 20,
|
||||||
|
"top_p": 0.8,
|
||||||
|
"transformers_version": "4.53.1"
|
||||||
|
}
|
||||||
151388
merges.txt
Normal file
151388
merges.txt
Normal file
File diff suppressed because it is too large
Load Diff
3
model.safetensors
Normal file
3
model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:866f11b2754e87780fbd0b2e9e673e754fff1c7759f9f57e78ed7600de3c49bb
|
||||||
|
size 3087467144
|
||||||
31
special_tokens_map.json
Normal file
31
special_tokens_map.json
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
{
|
||||||
|
"additional_special_tokens": [
|
||||||
|
"<|im_start|>",
|
||||||
|
"<|im_end|>",
|
||||||
|
"<|object_ref_start|>",
|
||||||
|
"<|object_ref_end|>",
|
||||||
|
"<|box_start|>",
|
||||||
|
"<|box_end|>",
|
||||||
|
"<|quad_start|>",
|
||||||
|
"<|quad_end|>",
|
||||||
|
"<|vision_start|>",
|
||||||
|
"<|vision_end|>",
|
||||||
|
"<|vision_pad|>",
|
||||||
|
"<|image_pad|>",
|
||||||
|
"<|video_pad|>"
|
||||||
|
],
|
||||||
|
"eos_token": {
|
||||||
|
"content": "<|im_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"pad_token": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
}
|
||||||
|
}
|
||||||
3
tokenizer.json
Normal file
3
tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:5eee858c5123a4279c3e1f7b81247343f356ac767940b2692a928ad929543214
|
||||||
|
size 11422063
|
||||||
207
tokenizer_config.json
Normal file
207
tokenizer_config.json
Normal file
@@ -0,0 +1,207 @@
|
|||||||
|
{
|
||||||
|
"add_bos_token": false,
|
||||||
|
"add_prefix_space": false,
|
||||||
|
"added_tokens_decoder": {
|
||||||
|
"151643": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151644": {
|
||||||
|
"content": "<|im_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151645": {
|
||||||
|
"content": "<|im_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151646": {
|
||||||
|
"content": "<|object_ref_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151647": {
|
||||||
|
"content": "<|object_ref_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151648": {
|
||||||
|
"content": "<|box_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151649": {
|
||||||
|
"content": "<|box_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151650": {
|
||||||
|
"content": "<|quad_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151651": {
|
||||||
|
"content": "<|quad_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151652": {
|
||||||
|
"content": "<|vision_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151653": {
|
||||||
|
"content": "<|vision_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151654": {
|
||||||
|
"content": "<|vision_pad|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151655": {
|
||||||
|
"content": "<|image_pad|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151656": {
|
||||||
|
"content": "<|video_pad|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151657": {
|
||||||
|
"content": "<tool_call>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151658": {
|
||||||
|
"content": "</tool_call>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151659": {
|
||||||
|
"content": "<|fim_prefix|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151660": {
|
||||||
|
"content": "<|fim_middle|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151661": {
|
||||||
|
"content": "<|fim_suffix|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151662": {
|
||||||
|
"content": "<|fim_pad|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151663": {
|
||||||
|
"content": "<|repo_name|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151664": {
|
||||||
|
"content": "<|file_sep|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additional_special_tokens": [
|
||||||
|
"<|im_start|>",
|
||||||
|
"<|im_end|>",
|
||||||
|
"<|object_ref_start|>",
|
||||||
|
"<|object_ref_end|>",
|
||||||
|
"<|box_start|>",
|
||||||
|
"<|box_end|>",
|
||||||
|
"<|quad_start|>",
|
||||||
|
"<|quad_end|>",
|
||||||
|
"<|vision_start|>",
|
||||||
|
"<|vision_end|>",
|
||||||
|
"<|vision_pad|>",
|
||||||
|
"<|image_pad|>",
|
||||||
|
"<|video_pad|>"
|
||||||
|
],
|
||||||
|
"bos_token": null,
|
||||||
|
"clean_up_tokenization_spaces": false,
|
||||||
|
"eos_token": "<|im_end|>",
|
||||||
|
"errors": "replace",
|
||||||
|
"extra_special_tokens": {},
|
||||||
|
"model_max_length": 131072,
|
||||||
|
"pad_token": "<|endoftext|>",
|
||||||
|
"split_special_tokens": false,
|
||||||
|
"tokenizer_class": "Qwen2Tokenizer",
|
||||||
|
"unk_token": null
|
||||||
|
}
|
||||||
1
vocab.json
Normal file
1
vocab.json
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user