LongTraceRL-4B is a 4-billion-parameter reasoning model trained with reinforcement learning on long-context multi-hop QA tasks, using trajectory-based tiered distractors and entity-level rubric rewards.
% arXiv preprint (cs.CL) introducing LongTraceRL.
% The braces around {LongTraceRL} keep its capitalisation under styles that sentence-case titles.
@misc{lin2026longtracerllearninglongcontextreasoning,
  author        = {Lin, Nianyi and Zhang, Jiajie and Hou, Lei and Li, Juanzi},
  title         = {{LongTraceRL}: Learning Long-Context Reasoning from Search Agent Trajectories with Rubric Rewards},
  year          = {2026},
  eprint        = {2605.31584},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2605.31584},
}