Sync from upstream llama.cpp repository

2026-01-16 10:43:34 +08:00
parent 3bc369a6f7
commit f4ae4cc7da
2053 changed files with 956010 additions and 1 deletions
--- a/tests/peg-parser/simple-tokenize.cpp
+++ b/tests/peg-parser/simple-tokenize.cpp
@@ -0,0 +1,37 @@
+#include "simple-tokenize.h"
+
+std::vector<std::string> simple_tokenize(const std::string & input) {
+    // Splits `input` into consecutive pieces for the PEG parser tests. A piece ends right
+    // before any boundary character, and that character becomes the first character of
+    // the next piece, so concatenating the returned pieces reproduces `input` exactly.
+    // Example: "a b" -> {"a", " b"}. An empty input yields an empty vector.
+
+    // Every character that forces a split in front of itself.
+    static const std::string boundaries = " \n\t{},[\"].<>=/";
+
+    std::vector<std::string> pieces;
+    std::string              pending;
+
+    // Appends the piece collected so far (if any) to the output and starts a new one.
+    const auto flush = [&pieces, &pending]() {
+        if (pending.empty()) {
+            return;
+        }
+        pieces.push_back(pending);
+        pending.clear();
+    };
+
+    for (const char ch : input) {
+        const bool starts_new_piece = boundaries.find(ch) != std::string::npos;
+        if (starts_new_piece) {
+            flush();
+        }
+        // The boundary character itself is kept: it opens the next piece.
+        pending.push_back(ch);
+    }
+
+    // Whatever is left after the last character is the final piece.
+    flush();
+
+    return pieces;
+}
--- a/tests/peg-parser/simple-tokenize.h
+++ b/tests/peg-parser/simple-tokenize.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include <string>
+#include <vector>
+
+std::vector<std::string> simple_tokenize(const std::string &);  // splits before each delimiter character; the pieces concatenate back to the input
--- a/tests/peg-parser/test-basic.cpp
+++ b/tests/peg-parser/test-basic.cpp
@@ -0,0 +1,454 @@
+#include "tests.h"
+
+void test_basic(testing & t) {
+    // Smoke tests for the PEG parser builder: character classes, optional(), partial input, recursion.
+    // NOTE(review): the bool given to common_peg_parse_context is assumed to mark the input as partial.
+    t.test("chars", [](testing & t) {
+        // Class listing the escapes for newline, tab and backslash: a newline is expected to match.
+        t.test("escape_sequence_newline", [](testing &t) {
+            auto char_class = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.chars("[\\n\\t\\\\]");
+            });
+
+            common_peg_parse_context input("\n");
+            common_peg_parse_result  outcome = char_class.parse(input);
+
+            t.assert_equal("escape_sequence_newline", true, outcome.success());
+        });
+
+        // Same class: a tab character is expected to match.
+        t.test("escape_sequence_tab", [](testing &t) {
+            auto char_class = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.chars("[\\n\\t\\\\]");
+            });
+
+            common_peg_parse_context input("\t");
+            common_peg_parse_result  outcome = char_class.parse(input);
+
+            t.assert_equal("escape_sequence_tab", true, outcome.success());
+        });
+
+        // Same class: a single backslash character is expected to match.
+        t.test("escape_sequence_backslash", [](testing &t) {
+            auto char_class = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.chars("[\\n\\t\\\\]");
+            });
+
+            common_peg_parse_context input("\\");
+            common_peg_parse_result  outcome = char_class.parse(input);
+
+            t.assert_equal("escape_sequence_backslash", true, outcome.success());
+        });
+
+        // Same class: a space is not listed in it, so the parse is expected to fail.
+        t.test("escape_sequence_space_fail", [](testing &t) {
+            auto char_class = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.chars("[\\n\\t\\\\]");
+            });
+
+            common_peg_parse_context input(" ");
+            common_peg_parse_result  outcome = char_class.parse(input);
+
+            t.assert_equal("escape_sequence_space_fail", true, outcome.fail());
+        });
+
+        // Class 'a', escaped dash, 'z': the letter 'a' is expected to match.
+        t.test("escaped_dash_a", [](testing &t) {
+            auto char_class = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.chars("[a\\-z]");
+            });
+
+            common_peg_parse_context input("a");
+            common_peg_parse_result  outcome = char_class.parse(input);
+
+            t.assert_equal("escaped_dash_a", true, outcome.success());
+        });
+
+        // Same class: the escaped dash stands for a literal '-', which is expected to match.
+        t.test("escaped_dash_literal", [](testing &t) {
+            auto char_class = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.chars("[a\\-z]");
+            });
+
+            common_peg_parse_context input("-");
+            common_peg_parse_result  outcome = char_class.parse(input);
+
+            t.assert_equal("escaped_dash_literal", true, outcome.success());
+        });
+
+        // Same class: the letter 'z' is expected to match.
+        t.test("escaped_dash_z", [](testing &t) {
+            auto char_class = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.chars("[a\\-z]");
+            });
+
+            common_peg_parse_context input("z");
+            common_peg_parse_result  outcome = char_class.parse(input);
+
+            t.assert_equal("escaped_dash_z", true, outcome.success());
+        });
+
+        // Same class: the escaped dash does not form a range, so 'b' is expected to fail.
+        t.test("escaped_dash_b_fail", [](testing &t) {
+            auto char_class = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.chars("[a\\-z]");
+            });
+
+            common_peg_parse_context input("b");
+            common_peg_parse_result  outcome = char_class.parse(input);
+
+            t.assert_equal("escaped_dash_b_fail", true, outcome.fail());
+        });
+    });
+
+    t.test("optional", [](testing & t) {
+        // "hello" followed by an optional " world": with the suffix present the parse is
+        // expected to succeed and to end at offset 11 (the full input length).
+        t.test("optional_present", [](testing &t) {
+            auto greeting = build_peg_parser(
+                [](common_peg_parser_builder & p) { return p.literal("hello") + p.optional(p.literal(" world")); });
+            common_peg_parse_context input("hello world");
+            auto outcome = greeting.parse(input);
+
+            t.assert_equal("optional_present", true, outcome.success());
+            t.assert_equal("optional_present_end", 11u, outcome.end);
+        });
+
+        // Same grammar without the optional suffix: the parse is expected to succeed
+        // and to end at offset 5, right after "hello".
+        t.test("optional_absent", [](testing &t) {
+            auto greeting = build_peg_parser(
+                [](common_peg_parser_builder & p) { return p.literal("hello") + p.optional(p.literal(" world")); });
+            common_peg_parse_context input("hello", false);
+            auto outcome = greeting.parse(input);
+
+            t.assert_equal("optional_absent", true, outcome.success());
+            t.assert_equal("optional_absent_end", 5u, outcome.end);
+        });
+
+        // The input ends with a space that could still grow into " world", so the
+        // parser is expected to report need_more_input() rather than a result.
+        t.test("partial_match_need_more", [](testing &t) {
+            auto greeting = build_peg_parser(
+                [](common_peg_parser_builder & p) { return p.literal("hello") + p.optional(p.literal(" world")); });
+            common_peg_parse_context input("hello ", true);
+            auto outcome = greeting.parse(input);
+
+            t.assert_equal("partial_match_need_more", true, outcome.need_more_input());
+        });
+    });
+
+    t.test("partial parsing", [](testing & t) {
+        // Literal: the exact text "hello" is expected to succeed.
+        t.test("literal_success", [&](testing & t) {
+            auto grammar = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.literal("hello");
+            });
+
+            common_peg_parse_context input("hello");
+            common_peg_parse_result  outcome = grammar.parse(input);
+
+            t.assert_equal("literal_success", true, outcome.success());
+        });
+
+        // Char class "a-z": a lowercase letter is expected to succeed.
+        t.test("char_class_lowercase_success", [&](testing & t) {
+            auto grammar = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.chars("a-z");
+            });
+
+            common_peg_parse_context input("a");
+            common_peg_parse_result  outcome = grammar.parse(input);
+
+            t.assert_equal("char_class_lowercase_success", true, outcome.success());
+        });
+
+        // Char class "a-z": an uppercase letter is expected to fail.
+        t.test("char_class_uppercase_fail", [&](testing & t) {
+            auto grammar = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.chars("a-z");
+            });
+
+            common_peg_parse_context input("A");
+            common_peg_parse_result  outcome = grammar.parse(input);
+
+            t.assert_equal("char_class_uppercase_fail", true, outcome.fail());
+        });
+
+        // Char class "a-z-" (range plus trailing dash): a lowercase letter is expected to succeed.
+        t.test("char_class_with_dash_lowercase", [&](testing & t) {
+            auto grammar = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.chars("a-z-");
+            });
+
+            common_peg_parse_context input("f");
+            common_peg_parse_result  outcome = grammar.parse(input);
+
+            t.assert_equal("char_class_with_dash_lowercase", true, outcome.success());
+        });
+
+        // Char class "a-z-": the trailing dash is a literal '-', which is expected to succeed.
+        t.test("char_class_with_dash_literal_dash", [&](testing & t) {
+            auto grammar = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.chars("a-z-");
+            });
+
+            common_peg_parse_context input("-");
+            common_peg_parse_result  outcome = grammar.parse(input);
+
+            t.assert_equal("char_class_with_dash_literal_dash", true, outcome.success());
+        });
+
+        // Char class "a-z-": an uppercase letter is expected to fail.
+        t.test("char_class_with_dash_uppercase_fail", [&](testing & t) {
+            auto grammar = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.chars("a-z-");
+            });
+
+            common_peg_parse_context input("A");
+            common_peg_parse_result  outcome = grammar.parse(input);
+
+            t.assert_equal("char_class_with_dash_uppercase_fail", true, outcome.fail());
+        });
+
+        // Sequence: the input stops inside the first literal, so more input is expected to be requested.
+        t.test("sequence_partial_match_1", [&](testing & t) {
+            auto grammar = build_peg_parser(
+                [](common_peg_parser_builder & p) { return p.literal("<think>") + p.literal("</think>"); });
+            common_peg_parse_context input("<thi", true);
+            auto outcome = grammar.parse(input);
+            t.assert_equal("sequence_partial_match_1", true, outcome.need_more_input());
+        });
+
+        // Sequence: the first literal is complete and the second has not started yet.
+        t.test("sequence_partial_match_2", [&](testing & t) {
+            auto grammar = build_peg_parser(
+                [](common_peg_parser_builder & p) { return p.literal("begin") + p.literal("end"); });
+            common_peg_parse_context input("begin", true);
+            auto outcome = grammar.parse(input);
+            t.assert_equal("sequence_partial_match_2", true, outcome.need_more_input());
+        });
+
+        // Sequence: the input stops inside the second literal.
+        t.test("sequence_partial_match_3", [&](testing & t) {
+            auto grammar = build_peg_parser(
+                [](common_peg_parser_builder & p) { return p.literal("<think>") + p.literal("</think>"); });
+            common_peg_parse_context input("<think></", true);
+            auto outcome = grammar.parse(input);
+            t.assert_equal("sequence_partial_match_3", true, outcome.need_more_input());
+        });
+
+        // Sequence: both literals are present in a complete input, so success is expected.
+        t.test("sequence_full_match", [&](testing & t) {
+            auto grammar = build_peg_parser(
+                [](common_peg_parser_builder & p) { return p.literal("hello") + p.literal("world"); });
+            common_peg_parse_context input("helloworld", false);
+            auto outcome = grammar.parse(input);
+            t.assert_equal("sequence_full_match", true, outcome.success());
+        });
+
+        // Sequence: text after the first literal cannot start the second one, so fail() is expected.
+        t.test("sequence_no_match", [&](testing & t) {
+            auto grammar = build_peg_parser(
+                [](common_peg_parser_builder & p) { return p.literal("<think>") + p.literal("</think>"); });
+            common_peg_parse_context input("<think>I am common_chat_combinator_parser", true);
+            auto outcome = grammar.parse(input);
+            t.assert_equal("sequence_no_match", true, outcome.fail());
+        });
+
+        // Choice: "opt" is a prefix of both alternatives, so more input is expected to be requested.
+        t.test("choices_partial_match_1", [&](testing & t) {
+            auto grammar = build_peg_parser(
+                [](common_peg_parser_builder & p) { return p.literal("option1") | p.literal("option2"); });
+            common_peg_parse_context input("opt", true);
+            auto outcome = grammar.parse(input);
+            t.assert_equal("choices_partial_match_1", true, outcome.need_more_input());
+        });
+
+        // Choice: "choice" is the shared prefix of both alternatives.
+        t.test("choices_partial_match_2", [&](testing & t) {
+            auto grammar = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.literal("choice_a") | p.literal("choice_b");
+            });
+            common_peg_parse_context input("choice", true);
+            auto outcome = grammar.parse(input);
+            t.assert_equal("choices_partial_match_2", true, outcome.need_more_input());
+        });
+
+        // Choice: the first alternative matches a complete input.
+        t.test("choices_full_match_1", [&](testing & t) {
+            auto grammar = build_peg_parser(
+                [](common_peg_parser_builder & p) { return p.literal("first") | p.literal("second"); });
+            common_peg_parse_context input("first", false);
+            auto outcome = grammar.parse(input);
+            t.assert_equal("choices_full_match_1", true, outcome.success());
+        });
+
+        // Choice: the second alternative matches a complete input.
+        t.test("choices_full_match_2", [&](testing & t) {
+            auto grammar = build_peg_parser(
+                [](common_peg_parser_builder & p) { return p.literal("alpha") | p.literal("beta"); });
+            common_peg_parse_context input("beta", false);
+            auto outcome = grammar.parse(input);
+            t.assert_equal("choices_full_match_2", true, outcome.success());
+        });
+
+        // Choice: neither alternative matches "best", so fail() is expected.
+        t.test("choices_no_match", [&](testing & t) {
+            auto grammar = build_peg_parser(
+                [](common_peg_parser_builder & p) { return p.literal("good") | p.literal("better"); });
+            common_peg_parse_context input("best", false);
+            auto outcome = grammar.parse(input);
+            t.assert_equal("choices_no_match", true, outcome.fail());
+        });
+
+        // zero_or_more: the input stops inside the first repetition.
+        t.test("zero_or_more_partial_match_1", [&](testing & t) {
+            auto grammar = build_peg_parser(
+                [](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("ab")); });
+            common_peg_parse_context input("a", true);
+            auto outcome = grammar.parse(input);
+            t.assert_equal("zero_or_more_partial_match_1", true, outcome.need_more_input());
+        });
+
+        // zero_or_more: one full repetition followed by the start of another.
+        t.test("zero_or_more_partial_match_2", [&](testing & t) {
+            auto grammar = build_peg_parser(
+                [](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("xy")); });
+            common_peg_parse_context input("xyx", true);
+            auto outcome = grammar.parse(input);
+            t.assert_equal("zero_or_more_partial_match_2", true, outcome.need_more_input());
+        });
+
+        // zero_or_more: a single complete repetition in a complete input succeeds.
+        t.test("zero_or_more_full_match", [&](testing & t) {
+            auto grammar = build_peg_parser(
+                [](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("test")); });
+            common_peg_parse_context input("test", false);
+            auto outcome = grammar.parse(input);
+            t.assert_equal("zero_or_more_full_match", true, outcome.success());
+        });
+
+        // one_or_more: the input stops inside the first repetition.
+        t.test("one_or_more_partial_match_1", [&](testing & t) {
+            auto grammar = build_peg_parser(
+                [](common_peg_parser_builder & p) { return p.one_or_more(p.literal("repeat")); });
+            common_peg_parse_context input("rep", true);
+            auto outcome = grammar.parse(input);
+            t.assert_equal("one_or_more_partial_match_1", true, outcome.need_more_input());
+        });
+
+        // one_or_more: one full repetition followed by the start of another.
+        t.test("one_or_more_partial_match_2", [&](testing & t) {
+            auto grammar = build_peg_parser(
+                [](common_peg_parser_builder & p) { return p.one_or_more(p.literal("ab")); });
+            common_peg_parse_context input("aba", true);
+            auto outcome = grammar.parse(input);
+            t.assert_equal("one_or_more_partial_match_2", true, outcome.need_more_input());
+        });
+
+        // one_or_more: a single complete repetition in a complete input succeeds.
+        t.test("one_or_more_full_match", [&](testing & t) {
+            auto grammar = build_peg_parser(
+                [](common_peg_parser_builder & p) { return p.one_or_more(p.literal("single")); });
+            common_peg_parse_context input("single", false);
+            auto outcome = grammar.parse(input);
+            t.assert_equal("one_or_more_full_match", true, outcome.success());
+        });
+
+        // one_or_more: the literal never occurs in the input, so fail() is expected.
+        t.test("one_or_more_no_match", [&](testing & t) {
+            auto grammar = build_peg_parser(
+                [](common_peg_parser_builder & p) { return p.one_or_more(p.literal("()")); });
+            common_peg_parse_context input("success", false);
+            auto outcome = grammar.parse(input);
+            t.assert_equal("one_or_more_no_match", true, outcome.fail());
+        });
+    });
+
+    t.test("recursive rules", [](testing &t) {
+        // Grammar: value := number | list, list := "[" value "]". A bare digit is a value.
+        t.test("simple_number", [](testing &t) {
+            auto nested_lists = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("number", p.chars("0-9"));
+                p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+                return p.rule("value", p.ref("number") | p.ref("list"));
+            });
+
+            auto input   = common_peg_parse_context("1", false);
+            auto outcome = nested_lists.parse(input);
+
+            t.assert_equal("result_is_success", true, outcome.success());
+        });
+
+        // Same grammar: one level of brackets around a digit.
+        t.test("simple_list", [](testing &t) {
+            auto nested_lists = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("number", p.chars("0-9"));
+                p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+                return p.rule("value", p.ref("number") | p.ref("list"));
+            });
+
+            auto input   = common_peg_parse_context("[1]", false);
+            auto outcome = nested_lists.parse(input);
+
+            t.assert_equal("result_is_success", true, outcome.success());
+        });
+
+        // Same grammar: two levels of brackets, exercising the list -> value -> list recursion.
+        t.test("nested_list", [](testing &t) {
+            auto nested_lists = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("number", p.chars("0-9"));
+                p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+                return p.rule("value", p.ref("number") | p.ref("list"));
+            });
+
+            auto input   = common_peg_parse_context("[[2]]", false);
+            auto outcome = nested_lists.parse(input);
+
+            t.assert_equal("result_is_success", true, outcome.success());
+        });
+
+        // Same grammar: three levels of brackets.
+        t.test("deeply_nested_list", [](testing &t) {
+            auto nested_lists = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("number", p.chars("0-9"));
+                p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+                return p.rule("value", p.ref("number") | p.ref("list"));
+            });
+
+            auto input   = common_peg_parse_context("[[[3]]]", false);
+            auto outcome = nested_lists.parse(input);
+
+            t.assert_equal("result_is_success", true, outcome.success());
+        });
+
+        // Same grammar: two opening brackets and nothing else, so more input is expected to be requested.
+        t.test("need_more_input_match", [](testing &t) {
+            auto nested_lists = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("number", p.chars("0-9"));
+                p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+                return p.rule("value", p.ref("number") | p.ref("list"));
+            });
+
+            auto input   = common_peg_parse_context("[[", true);
+            auto outcome = nested_lists.parse(input);
+
+            t.assert_equal("result_is_need_more_input", true, outcome.need_more_input());
+        });
+
+        // Same grammar: a letter is neither a number nor a list, so fail() is expected.
+        t.test("no_match", [](testing &t) {
+            auto nested_lists = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("number", p.chars("0-9"));
+                p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+                return p.rule("value", p.ref("number") | p.ref("list"));
+            });
+
+            auto input   = common_peg_parse_context("[a]", false);
+            auto outcome = nested_lists.parse(input);
+
+            t.assert_equal("result_is_fail", true, outcome.fail());
+        });
+    });
+}
--- a/tests/peg-parser/test-gbnf-generation.cpp
+++ b/tests/peg-parser/test-gbnf-generation.cpp
@@ -0,0 +1,250 @@
+#include "tests.h"
+
+#include "json-schema-to-grammar.h"
+
+#include <regex>
+
+static std::string trim_leading_space(const std::string & s) {
+    static const std::regex indent_re(R"((^|\n)\s+)");  // whitespace run at the text start or right after a newline
+    return std::regex_replace(s, indent_re, "$1");      // "$1" re-emits the captured anchor, so only the run is dropped
+}
+
+static void assert_gbnf_equal(testing & t, const std::string & expected, const std::string & actual) {
+    t.assert_equal("gbnf are equal", trim_leading_space(expected), trim_leading_space(actual));  // both sides are compared after trim_leading_space()
+}
+
+void test_gbnf_generation(testing &t) {
+    // Each case builds a PEG parser, renders it as GBNF text and compares that with the expected grammar.
+
+    // A literal is expected to appear as a quoted terminal in the root rule.
+    t.test("literal grammar generation", [](testing &t) {
+        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("hello");
+        });
+        auto emitted = build_grammar([&peg](const common_grammar_builder & out) {
+            peg.build_grammar(out);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "hello"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", emitted);
+    });
+
+    // chars("[a-z]", 1, 1) is expected to render as the bare class, without a repetition suffix.
+    t.test("char class grammar", [](testing &t) {
+        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.chars("[a-z]", 1, 1);
+        });
+        auto emitted = build_grammar([&peg](const common_grammar_builder & out) {
+            peg.build_grammar(out);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= [a-z]
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", emitted);
+    });
+
+    // Parsers joined with '+' are expected to render as terminals separated by spaces.
+    t.test("sequence grammar", [](testing &t) {
+        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("hello") + p.literal(" ") + p.literal("world");
+        });
+        auto emitted = build_grammar([&peg](const common_grammar_builder & out) {
+            peg.build_grammar(out);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "hello" " " "world"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", emitted);
+    });
+
+    // Parsers joined with '|' are expected to render as alternatives separated by " | ".
+    t.test("choice grammar", [](testing &t) {
+        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("cat") | p.literal("dog");
+        });
+        auto emitted = build_grammar([&peg](const common_grammar_builder & out) {
+            peg.build_grammar(out);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "cat" | "dog"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", emitted);
+    });
+
+    // one_or_more() is expected to render as a postfix '+'.
+    t.test("one_or_more grammar", [](testing &t) {
+        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.one_or_more(p.literal("a"));
+        });
+        auto emitted = build_grammar([&peg](const common_grammar_builder & out) {
+            peg.build_grammar(out);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "a"+
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", emitted);
+    });
+
+    // zero_or_more() is expected to render as a postfix '*'.
+    t.test("zero_or_more grammar", [](testing &t) {
+        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.zero_or_more(p.literal("a"));
+        });
+        auto emitted = build_grammar([&peg](const common_grammar_builder & out) {
+            peg.build_grammar(out);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "a"*
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", emitted);
+    });
+
+    // optional() is expected to render as a postfix '?'.
+    t.test("optional grammar", [](testing &t) {
+        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("hello") + p.optional(p.literal(" world"));
+        });
+        auto emitted = build_grammar([&peg](const common_grammar_builder & out) {
+            peg.build_grammar(out);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "hello" " world"?
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", emitted);
+    });
+
+    // until("</tag>") is expected to render as a repetition of alternatives, one per delimiter prefix.
+    t.test("until grammar", [](testing &t) {
+        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.until("</tag>");
+        });
+        auto emitted = build_grammar([&peg](const common_grammar_builder & out) {
+            peg.build_grammar(out);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= ([^<] | "<" [^/] | "</" [^t] | "</t" [^a] | "</ta" [^g] | "</tag" [^>])*
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", emitted);
+    });
+
+    // A choice nested inside a repetition is expected to be wrapped in parentheses.
+    t.test("complex expressions with parentheses", [](testing &t) {
+        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.one_or_more(p.literal("a") | p.literal("b"));
+        });
+        auto emitted = build_grammar([&peg](const common_grammar_builder & out) {
+            peg.build_grammar(out);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= ("a" | "b")+
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", emitted);
+    });
+
+    // A named rule is expected to get its own production and to be referenced by name from root.
+    t.test("rule references", [](testing &t) {
+        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
+            auto single_digit = p.rule("digit", p.chars("[0-9]", 1, 1));
+            return p.one_or_more(single_digit);
+        });
+        auto emitted = build_grammar([&peg](const common_grammar_builder & out) {
+            peg.build_grammar(out);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            digit ::= [0-9]
+            root ::= digit+
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", emitted);
+    });
+
+    // Newline characters inside a literal are expected to be written as the two-character escape.
+    t.test("escaping in literals", [](testing &t) {
+        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("hello\nworld\n!");
+        });
+        auto emitted = build_grammar([&peg](const common_grammar_builder & out) {
+            peg.build_grammar(out);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "hello\nworld\n!"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", emitted);
+    });
+
+    // operator<< is expected to place a reference to the `space` rule between its operands.
+    t.test("operator<< (whitespace insertion)", [](testing &t) {
+        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("hello") << p.literal("world");
+        });
+        auto emitted = build_grammar([&peg](const common_grammar_builder & out) {
+            peg.build_grammar(out);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "hello" space "world"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", emitted);
+    });
+
+    // The "orphan" rule is defined but never referenced, so the output is expected to omit it.
+    t.test("emit only reachable rules", [](testing &t) {
+        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
+            p.rule("orphan", p.literal("orphan"));
+            return p.literal("hello") + p.rule("child", p.literal(" world"));
+        });
+        auto emitted = build_grammar([&peg](const common_grammar_builder & out) {
+            peg.build_grammar(out);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            child ::= " world"
+            root ::= "hello" child
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", emitted);
+    });
+
+    // With build_grammar(out, true) only rules registered with `true` (rule-2, rule-4) are expected as root.
+    t.test("emit only trigger rules (and references)", [](testing &t) {
+        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
+            auto entry = p.rule("rule-1", p.literal("a") + p.ref("rule-2"));
+            p.rule("rule-2", p.literal("b") + p.ref("rule-3"), true);
+            p.rule("rule-3", p.literal("c") + p.ref("rule-4"));
+            p.rule("rule-4", p.literal("d"), true);
+            return entry;
+        });
+        auto emitted_full = build_grammar([&peg](const common_grammar_builder & out) {
+            peg.build_grammar(out);
+        });
+        assert_gbnf_equal(t, R"""(
+            root ::= rule-1
+            rule-1 ::= "a" rule-2
+            rule-2 ::= "b" rule-3
+            rule-3 ::= "c" rule-4
+            rule-4 ::= "d"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", emitted_full);
+
+        auto emitted_lazy = build_grammar([&peg](const common_grammar_builder & out) {
+            peg.build_grammar(out, true);
+        });
+        assert_gbnf_equal(t, R"""(
+            root ::= rule-2 | rule-4
+            rule-2 ::= "b" rule-3
+            rule-3 ::= "c" rule-4
+            rule-4 ::= "d"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", emitted_lazy);
+    });
+}
--- a/tests/peg-parser/test-json-parser.cpp
+++ b/tests/peg-parser/test-json-parser.cpp
@@ -0,0 +1,109 @@
+#include "tests.h"
+
+void test_json_parser(testing &t) {
+    // Each scenario builds its own parser from the builder's json() rule.
+    const auto make_json_parser = [] {
+        return build_peg_parser([](common_peg_parser_builder & p) { return p.json(); });
+    };
+
+    // Registers a test that parses `text` with a context built from the input
+    // alone and expects success with the result ending at the input's size.
+    const auto expect_full_match = [&](const std::string & name, const std::string & text) {
+        t.test(name, [&](testing &t) {
+            auto json = make_json_parser();
+
+            common_peg_parse_context ctx(text);
+            auto result = json.parse(ctx);
+
+            t.assert_equal("result_is_success", true, result.success());
+            t.assert_equal("result_end", text.size(), result.end);
+        });
+    };
+
+    // Registers a test that parses `text` with the context's second argument set
+    // to true and expects the parser to report need_more_input().
+    const auto expect_need_more = [&](const std::string & name, const std::string & text) {
+        t.test(name, [&](testing &t) {
+            auto json = make_json_parser();
+
+            common_peg_parse_context ctx(text, true);
+            auto result = json.parse(ctx);
+
+            t.assert_equal("result_is_need_more_input", true, result.need_more_input());
+        });
+    };
+
+    // Complete documents: a flat object, a mixed-type array, nested containers.
+    expect_full_match("simple JSON object parsing",
+                      R"({"name": "test", "value": 42, "flag": true})");
+    expect_full_match("JSON array with mixed types",
+                      R"([1, "hello", true, null, 3.14])");
+    expect_full_match("nested JSON with objects and arrays",
+                      R"({"users": [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}], "count": 2, "metadata": {"version": "1.0", "tags": ["admin", "user"]}})");
+
+    // Truncated documents: each input stops right after a separator.
+    expect_need_more("need_more_input() parsing - incomplete object",
+                     R"({"name": "test", "value": )");
+    expect_need_more("need_more_input() parsing - incomplete array",
+                     R"([1, 2, 3, )");
+    expect_need_more("need_more_input() parsing - incomplete nested structure",
+                     R"({"data": {"nested": )");
+
+    // json_member("name", ...) whose value is a quoted [a-z] character class.
+    t.test("object member", [](testing &t) {
+        auto member = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.json_member("name", "\"" + p.chars("[a-z]") + "\"");
+        });
+
+        // Whole member present.
+        t.test("success", [&](testing &t) {
+            const std::string text = R"("name": "bob")";
+            common_peg_parse_context ctx(text, false);
+
+            const auto outcome = member.parse(ctx);
+            t.assert_true("success", outcome.success());
+        });
+
+        // Value cut off before its closing quote.
+        t.test("partial", [&](testing &t) {
+            const std::string text = R"("name": "bo)";
+            common_peg_parse_context ctx(text, true);
+
+            const auto outcome = member.parse(ctx);
+            t.assert_true("need more input", outcome.need_more_input());
+        });
+
+        // Input that is not an object member at all.
+        t.test("failed", [&](testing &t) {
+            const std::string text = R"([])";
+            common_peg_parse_context ctx(text, false);
+
+            const auto outcome = member.parse(ctx);
+            t.assert_true("fail", outcome.fail());
+        });
+    });
+}
--- a/tests/peg-parser/test-json-serialization.cpp
+++ b/tests/peg-parser/test-json-serialization.cpp
@@ -0,0 +1,28 @@
+#include "tests.h"
+
+void test_json_serialization(testing &t) {
+    // Parser under test: the builder's json() rule between <tool_call> tags.
+    auto original = build_peg_parser([](common_peg_parser_builder & p) {
+        return "<tool_call>" + p.json() + "</tool_call>";
+    });
+
+    // Serialized once up front; the comparison test and the benchmark both
+    // rebuild an arena from this same text.
+    const auto serialized = original.to_json().dump();
+
+    // Parses the serialized text back into JSON and rebuilds a parser from it.
+    const auto restore = [&] {
+        return common_peg_arena::from_json(nlohmann::json::parse(serialized));
+    };
+
+    t.test("compare before/after", [&](testing &t) {
+        auto deserialized = restore();
+
+        // Test complex JSON
+        // NOTE(review): the input carries no <tool_call> wrapper, so presumably
+        // both parses fail and only the failure path is compared - confirm that
+        // this is intended.
+        const std::string input = R"({"name": "test", "values": [1, 2, 3], "nested": {"a": true}})";
+        common_peg_parse_context original_ctx(input);
+        common_peg_parse_context restored_ctx(input);
+
+        const auto original_result = original.parse(original_ctx);
+        const auto restored_result = deserialized.parse(restored_ctx);
+
+        t.assert_equal("both_succeed", original_result.success(), restored_result.success());
+        t.assert_equal("same_end_pos", original_result.end, restored_result.end);
+    });
+
+    // Times 100 rounds of JSON parsing plus arena reconstruction.
+    t.bench("deserialize", [&]() {
+        auto deserialized = restore();
+    }, 100);
+}
--- a/tests/peg-parser/test-unicode.cpp
+++ b/tests/peg-parser/test-unicode.cpp
@@ -0,0 +1,449 @@
+#include "tests.h"
+
+#include "peg-parser.h"
+
+#include <string>
+#include <sstream>
+#include <iomanip>
+#include <cctype>
+
+// Asserts that two parse result types are equal by comparing the values
+// returned by common_peg_parse_result_type_name() for each of them.
+static void assert_result_equal(testing & t, common_peg_parse_result_type expected, common_peg_parse_result_type actual) {
+    const auto expected_name = common_peg_parse_result_type_name(expected);
+    const auto actual_name   = common_peg_parse_result_type_name(actual);
+    t.assert_equal(expected_name, actual_name);
+}
+
+// Renders `str` for use in test names: bytes for which std::isprint() holds are
+// copied verbatim, every other byte becomes "\x" followed by two lowercase hex
+// digits.
+static std::string hex_dump(const std::string& str) {
+    std::ostringstream rendered;
+    // Base and fill character are sticky and only affect the integer insertions
+    // below, so they are configured once.
+    rendered << std::hex << std::setfill('0');
+
+    for (const unsigned char byte : str) {
+        if (std::isprint(byte)) {
+            rendered << byte;
+            continue;
+        }
+        // std::setw() applies to the next insertion only, hence once per byte.
+        rendered << "\\x" << std::setw(2) << static_cast<int>(byte);
+    }
+
+    return rendered.str();
+}
+
+// UTF-8 handling tests for the any(), chars(), until() and
+// json_string_content() parsers. Every group is table driven: each row is fed
+// to the group's parser and the result type (and, depending on the group, the
+// matched slice of the input) is asserted.
+void test_unicode(testing &t) {
+    // One table row: the raw bytes to parse, the slice of the input expected to
+    // be reported as matched, and the expected result type.
+    struct test_case {
+        std::string input;
+        std::string expected_text;
+        common_peg_parse_result_type expected_result;
+    };
+
+    // How a group of rows is parsed and which outcomes get their text compared.
+    struct case_policy {
+        bool   partial_input;       // second argument of common_peg_parse_context
+        bool   check_on_success;    // compare matched text when the parse succeeds
+        bool   check_on_need_more;  // compare matched text when more input is needed
+        size_t trailing_skip;       // bytes dropped from the end of the match before comparing
+    };
+
+    // Partial input; text compared on success or need-more-input.
+    const case_policy streaming   {true,  true,  true,  0};
+    // Complete input; text compared on success.
+    const case_policy complete    {false, true,  false, 0};
+    // Complete input; only the result type is asserted.
+    const case_policy result_only {false, false, false, 0};
+    // Complete input; text compared on success, minus the closing quote.
+    const case_policy quoted      {false, true,  false, 1};
+    // Partial input; text compared only when more input is needed.
+    const case_policy pending     {true,  false, true,  0};
+
+    // Runs every row as its own sub-test named "case <index>: <hex dump>".
+    // The result type is always asserted; the matched text is compared only for
+    // the outcomes selected by `policy`.
+    const auto run_cases = [](testing & t, const std::vector<test_case> & cases, auto & parser, const case_policy & policy) {
+        for (size_t i = 0; i < cases.size(); i++) {
+            const auto & tc = cases[i];
+            std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+            t.test(test_name, [&](testing &t) {
+                common_peg_parse_context ctx(tc.input, policy.partial_input);
+                auto result = parser.parse(ctx);
+
+                // Assert result type matches
+                assert_result_equal(t, tc.expected_result, result.type);
+
+                // Assert matched text for the outcomes this group cares about
+                const bool compare_text = (policy.check_on_success && result.success()) ||
+                                          (policy.check_on_need_more && result.need_more_input());
+                if (compare_text) {
+                    std::string matched = tc.input.substr(result.start, result.end - result.start - policy.trailing_skip);
+                    t.assert_equal(tc.expected_text, matched);
+                }
+            });
+        }
+    };
+
+    t.test("any", [&](testing &t) {
+        std::vector<test_case> test_cases {
+            // Valid UTF-8 sequences
+            {"Hello", "Hello", COMMON_PEG_PARSE_RESULT_SUCCESS},
+            {std::string("Caf\xC3\xA9"), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+            {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+            {std::string("\xF0\x9F\x9A\x80"), std::string("\xF0\x9F\x9A\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+            // Incomplete UTF-8 sequences (partial bytes at end)
+            {std::string("Caf\xC3"), "Caf", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+            {std::string("\xE4\xBD"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+            {std::string("\xF0\x9F\x9A"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+            // Invalid/malformed UTF-8 sequences
+            {std::string("\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+            {std::string("Hello\x80World"), "Hello", COMMON_PEG_PARSE_RESULT_FAIL},
+            {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+        };
+
+        auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+            return p.sequence({p.one_or_more(p.any()), p.end()});
+        });
+
+        run_cases(t, test_cases, parser, streaming);
+    });
+
+    t.test("char classes", [&](testing &t) {
+        t.test("unicode range U+4E00-U+9FFF (CJK)", [&](testing &t) {
+            std::vector<test_case> test_cases {
+                // Within range - CJK Unified Ideographs
+                {std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00
+                {std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60
+                {std::string("\xE5\xA5\xBD"), std::string("\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+597D
+                {std::string("\xE9\xBF\xBF"), std::string("\xE9\xBF\xBF"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+9FFF
+
+                // Outside range - should fail
+                {"a", "", COMMON_PEG_PARSE_RESULT_FAIL},                                                     // ASCII
+                {std::string("\xE4\xB7\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL},                            // U+4DFF (before range)
+                {std::string("\xEA\x80\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL},                            // U+A000 (after range)
+
+                // Incomplete sequences in range
+                {std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},                     // Incomplete U+4E00
+                {std::string("\xE5\xA5"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},                     // Incomplete U+597D
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.sequence({p.chars(R"([\u4E00-\u9FFF])"), p.end()});
+            });
+
+            run_cases(t, test_cases, parser, streaming);
+        });
+
+        t.test("unicode range U+1F600-U+1F64F (emoticons)", [&](testing &t) {
+            std::vector<test_case> test_cases {
+                // Within range - Emoticons (all 4-byte UTF-8)
+                {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600
+                {std::string("\xF0\x9F\x98\x81"), std::string("\xF0\x9F\x98\x81"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F601
+                {std::string("\xF0\x9F\x99\x8F"), std::string("\xF0\x9F\x99\x8F"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F64F
+
+                // Outside range
+                {std::string("\xF0\x9F\x97\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F5FF (before range)
+                {std::string("\xF0\x9F\x99\x90"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F650 (after range)
+                {std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680 (outside range)
+
+                // Incomplete sequences
+                {std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete emoji
+                {std::string("\xF0\x9F"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},     // Very incomplete
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.sequence({p.chars(R"([\U0001F600-\U0001F64F])"), p.end()});
+            });
+
+            run_cases(t, test_cases, parser, streaming);
+        });
+
+        t.test("mixed unicode ranges", [&](testing &t) {
+            std::vector<test_case> test_cases {
+                // Match CJK
+                {std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00
+                {std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60
+
+                // Match emoticons
+                {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600
+
+                // Match ASCII digits
+                {"5", "5", COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Don't match outside any range
+                {"a", "", COMMON_PEG_PARSE_RESULT_FAIL},
+                {std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680
+
+                // Incomplete
+                {std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+                {std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.sequence({p.chars(R"([\u4E00-\u9FFF\U0001F600-\U0001F64F0-9])"), p.end()});
+            });
+
+            run_cases(t, test_cases, parser, streaming);
+        });
+    });
+
+    t.test("until parser", [&](testing &t) {
+        t.test("ASCII delimiter with Unicode content", [&](testing &t) {
+            std::vector<test_case> test_cases {
+                // CJK characters before delimiter
+                {std::string("\xE4\xBD\xA0\xE5\xA5\xBD</tag>"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Emoji before delimiter
+                {std::string("\xF0\x9F\x98\x80</tag>"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Mixed content
+                {std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!</tag>"), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.until("</tag>");
+            });
+
+            run_cases(t, test_cases, parser, complete);
+        });
+
+        t.test("incomplete UTF-8 at end", [&](testing &t) {
+            std::vector<test_case> test_cases {
+                // Incomplete emoji at end, no delimiter
+                {std::string("content\xF0\x9F\x98"), std::string("content"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+                // Incomplete CJK at end, no delimiter
+                {std::string("hello\xE4\xB8"), std::string("hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+                // Complete content, no delimiter (should consume all valid UTF-8)
+                {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.until("</tag>");
+            });
+
+            run_cases(t, test_cases, parser, streaming);
+        });
+
+        t.test("malformed UTF-8", [&](testing &t) {
+            std::vector<test_case> test_cases {
+                // Invalid UTF-8 bytes
+                {std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+
+                // Continuation byte without lead byte
+                {std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+
+                // Invalid continuation byte
+                {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.until("</tag>");
+            });
+
+            run_cases(t, test_cases, parser, result_only);
+        });
+    });
+
+    t.test("json_string parser", [&](testing &t) {
+        t.test("valid UTF-8 characters", [&](testing &t) {
+            std::vector<test_case> test_cases {
+                // ASCII only
+                {"Hello World\"", "Hello World", COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // 2-byte UTF-8 (accented characters)
+                {std::string("Caf\xC3\xA9\""), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // 3-byte UTF-8 (CJK)
+                {std::string("\xE4\xBD\xA0\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // 4-byte UTF-8 (emoji)
+                {std::string("\xF0\x9F\x98\x80\""), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Mixed content
+                {std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!\""), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+            };
+
+            // The parser does not depend on the row, so it is built once per group.
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.sequence({p.json_string_content(), p.literal("\"")});
+            });
+
+            run_cases(t, test_cases, parser, quoted);
+        });
+
+        t.test("incomplete UTF-8", [&](testing &t) {
+            std::vector<test_case> test_cases {
+                // Incomplete 2-byte sequence
+                {std::string("Caf\xC3"), std::string("Caf"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+                // Incomplete 3-byte sequence
+                {std::string("Hello\xE4\xB8"), std::string("Hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+                // Incomplete 4-byte sequence
+                {std::string("Text\xF0\x9F\x98"), std::string("Text"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+                // Incomplete at very start
+                {std::string("\xE4\xBD"), std::string(""), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.json_string_content();
+            });
+
+            run_cases(t, test_cases, parser, pending);
+        });
+
+        t.test("malformed UTF-8", [&](testing &t) {
+            std::vector<test_case> test_cases {
+                // Invalid UTF-8 bytes
+                {std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+
+                // Continuation byte without lead byte
+                {std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+
+                // Invalid continuation byte
+                {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+
+                // Overlong encoding (security issue)
+                {std::string("\xC0\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.json_string_content();
+            });
+
+            run_cases(t, test_cases, parser, result_only);
+        });
+
+        t.test("escape sequences with UTF-8", [&](testing &t) {
+            std::vector<test_case> test_cases {
+                // Unicode escape sequence
+                {"Hello\\u0041\"", "Hello\\u0041", COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Mix of UTF-8 and escape sequences
+                {std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Escaped quote in UTF-8 string
+                {std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.sequence({p.json_string_content(), p.literal("\"")});
+            });
+
+            run_cases(t, test_cases, parser, quoted);
+        });
+    });
+}
--- a/tests/peg-parser/testing.h
+++ b/tests/peg-parser/testing.h
@@ -0,0 +1,243 @@
+#pragma once
+
+#include "common.h"
+
+#include <chrono>
+#include <exception>
+#include <iostream>
+#include <string>
+#include <regex>
+#include <vector>
+
+// Minimal hierarchical test harness. Tests and benchmarks are registered and
+// run immediately through test()/bench(); they may nest, and the names of the
+// currently running tests are kept in `stack`. Assertions update the counters
+// and report failures on `out`.
+struct testing {
+    std::ostream &out;               // destination for all harness output
+    std::vector<std::string> stack;  // names of the tests currently running, outermost first
+    std::regex filter;               // only consulted when filter_tests is set
+    bool filter_tests = false;
+    bool throw_exception = false;    // rethrow exceptions after they have been recorded
+    bool verbose = false;            // enables log()
+    int tests = 0;
+    int assertions = 0;
+    int failures = 0;
+    int unnamed = 0;                 // counter used to name anonymous tests/benchmarks
+    int exceptions = 0;
+
+    // Column at which the [PASS]/[FAIL] marker is aligned.
+    static constexpr std::size_t status_column = 80;
+
+    explicit testing(std::ostream &os = std::cout) : out(os) {}
+
+    // Two spaces per nesting level below the outermost test.
+    std::string indent() const {
+        if (stack.empty()) {
+            return "";
+        }
+        return std::string((stack.size() - 1) * 2, ' ');
+    }
+
+    // Names on the stack joined with '.'.
+    std::string full_name() const {
+        return string_join(stack, ".");
+    }
+
+    // Prints `msg` one level deeper than the current test; no-op unless verbose.
+    void log(const std::string & msg) {
+        if (verbose) {
+            out << indent() << "  " << msg << "\n";
+        }
+    }
+
+    // Restricts execution to tests whose full name matches the regex `re`.
+    void set_filter(const std::string & re) {
+        filter = std::regex(re);
+        filter_tests = true;
+    }
+
+    // True when no filter is set or when the whole full_name() matches it.
+    // Because nested tests run inside their parent, a parent that does not
+    // match also prevents its children from running.
+    bool should_run() const {
+        if (filter_tests) {
+            if (!std::regex_match(full_name(), filter)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    // Invokes `f`, recording any exception as one failure and one exception and
+    // reporting it with the label `ctx`. The exception is rethrown only when
+    // throw_exception is set.
+    template <typename F>
+    void run_with_exceptions(F &&f, const char *ctx) {
+        try {
+            f();
+        } catch (const std::exception &e) {
+            ++failures;
+            ++exceptions;
+            out << indent() << "UNHANDLED EXCEPTION (" << ctx << "): " << e.what() << "\n";
+            if (throw_exception) {
+                throw;
+            }
+        } catch (...) {
+            ++failures;
+            ++exceptions;
+            out << indent() << "UNHANDLED EXCEPTION (" << ctx << "): unknown\n";
+            if (throw_exception) {
+                throw;
+            }
+        }
+    }
+
+    // Prints the summary line of a finished test: the indented label, optional
+    // details in parentheses (assertion counts and/or `extra`), then [PASS] or
+    // [FAIL] depending on whether `new_failures` is zero.
+    void print_result(const std::string &label, int new_failures, int new_assertions, const std::string &extra = "") const {
+        std::string line = indent() + label;
+
+        std::string details;
+        if (new_assertions > 0) {
+            if (new_failures == 0) {
+                details = std::to_string(new_assertions) + " assertion(s)";
+            } else {
+                details = std::to_string(new_failures) + " of " +
+                          std::to_string(new_assertions) + " assertion(s) failed";
+            }
+        }
+        if (!extra.empty()) {
+            if (!details.empty()) {
+                details += ", ";
+            }
+            details += extra;
+        }
+
+        if (!details.empty()) {
+            line += " (" + details + ")";
+        }
+
+        std::string status = (new_failures == 0) ? "[PASS]" : "[FAIL]";
+
+        // Pad short lines up to status_column; longer lines get a single space.
+        if (line.size() + 1 < status_column) {
+            line.append(status_column - line.size(), ' ');
+        } else {
+            line.push_back(' ');
+        }
+
+        out << line << status << "\n";
+    }
+
+    // Runs `f(*this)` as a test called `name`, nested under the tests already on
+    // the stack. Skipped silently when the filter rejects its full name.
+    template <typename F>
+    void test(const std::string &name, F f) {
+        stack.push_back(name);
+        if (!should_run()) {
+            stack.pop_back();
+            return;
+        }
+
+        ++tests;
+        out << indent() << name << "\n";
+
+        // Snapshot the counters so only this test's own results are reported.
+        int before_failures   = failures;
+        int before_assertions = assertions;
+
+        run_with_exceptions([&] { f(*this); }, "test");
+
+        int new_failures   = failures   - before_failures;
+        int new_assertions = assertions - before_assertions;
+
+        print_result(name, new_failures, new_assertions);
+
+        stack.pop_back();
+    }
+
+    // Runs an anonymous test named "test #<n>".
+    template <typename F>
+    void test(F f) {
+        test("test #" + std::to_string(++unnamed), f);
+    }
+
+    // Calls `f()` `iterations` times, timing each call, and reports the
+    // iteration count, the average duration in microseconds and the resulting
+    // calls per second. Counted as a test and subject to the same filter.
+    template <typename F>
+    void bench(const std::string &name, F f, int iterations = 100) {
+        stack.push_back(name);
+        if (!should_run()) {
+            stack.pop_back();
+            return;
+        }
+
+        ++tests;
+        out << indent() << "[bench] " << name << "\n";
+
+        int before_failures   = failures;
+        int before_assertions = assertions;
+
+        using clock = std::chrono::high_resolution_clock;
+
+        std::chrono::microseconds duration(0);
+
+        run_with_exceptions([&] {
+            for (auto i = 0; i < iterations; i++) {
+                auto start = clock::now();
+                f();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(clock::now() - start);
+            }
+        }, "bench");
+
+        // A zero iteration count would make the integer average below divide by
+        // zero, so the averages are reported as 0 when nothing was requested.
+        const bool has_iterations = iterations > 0;
+
+        auto avg_elapsed   = has_iterations ? duration.count() / iterations : 0;
+        auto avg_elapsed_s = has_iterations
+            ? std::chrono::duration_cast<std::chrono::duration<double>>(duration).count() / iterations
+            : 0.0;
+        auto rate = (avg_elapsed_s > 0.0) ? (1.0 / avg_elapsed_s) : 0.0;
+
+        int new_failures   = failures   - before_failures;
+        int new_assertions = assertions - before_assertions;
+
+        // The rate is truncated through long long: with many fast iterations it
+        // can exceed the range of int, and that conversion would be undefined.
+        std::string extra =
+            "n=" + std::to_string(iterations) +
+            " avg=" + std::to_string(avg_elapsed) + "us" +
+            " rate=" + std::to_string(static_cast<long long>(rate)) + "/s";
+
+        print_result("[bench] " + name, new_failures, new_assertions, extra);
+
+        stack.pop_back();
+    }
+
+    // Runs an anonymous benchmark named "bench #<n>".
+    template <typename F>
+    void bench(F f, int iterations = 100) {
+        bench("bench #" + std::to_string(++unnamed), f, iterations);
+    }
+
+    // Assertions
+    // Each assertion increments `assertions`, increments `failures` and prints a
+    // report when it does not hold, and returns whether it held.
+    bool assert_true(bool cond) {
+        return assert_true("", cond);
+    }
+
+    bool assert_true(const std::string &msg, bool cond) {
+        ++assertions;
+        if (!cond) {
+            ++failures;
+            out << indent() << "ASSERT TRUE FAILED";
+            if (!msg.empty()) {
+                out << " : " << msg;
+            }
+            out << "\n";
+            return false;
+        }
+        return true;
+    }
+
+    template <typename A, typename B>
+    bool assert_equal(const A &expected, const B &actual) {
+        return assert_equal("", expected, actual);
+    }
+
+    // Compares with `actual == expected`; both values are streamed to `out` when
+    // the comparison fails.
+    template <typename A, typename B>
+    bool assert_equal(const std::string &msg, const A &expected, const B &actual) {
+        ++assertions;
+        if (!(actual == expected)) {
+            ++failures;
+            out << indent() << "ASSERT EQUAL FAILED";
+            if (!msg.empty()) {
+                out << " : " << msg;
+            }
+            out << "\n";
+
+            out << indent() << "  expected: " << expected << "\n";
+            out << indent() << "  actual  : " << actual << "\n";
+            return false;
+        }
+        return true;
+    }
+
+    // Print summary and return an exit code
+    // (0 when there were no failures, 1 otherwise).
+    int summary() const {
+        out << "\n";
+        out << "tests      : " << tests << "\n";
+        out << "assertions : " << assertions << "\n";
+        out << "failures   : " << failures << "\n";
+        out << "exceptions : " << exceptions << "\n";
+        return failures == 0 ? 0 : 1;
+    }
+};
--- a/tests/peg-parser/tests.h
+++ b/tests/peg-parser/tests.h
@@ -0,0 +1,24 @@
+#pragma once
+
+// Common includes for all test files
+#include <nlohmann/json.hpp>
+#include <string>
+#include <vector>
+
+#include "testing.h"
+#include "peg-parser.h"
+#include "chat-peg-parser.h"
+#include "simple-tokenize.h"
+
+// Plain aggregate holding an id, a name and an ordered-JSON `args` value.
+// NOTE(review): presumably describes one tool call for benchmark code; no user
+// of this type is visible in this header - confirm.
+struct bench_tool_call {
+    std::string            id;
+    std::string            name;
+    nlohmann::ordered_json args;
+};
+
+// Test function declarations. Every entry point receives the `testing` harness
+// by reference and registers its cases on it.
+void test_basic(testing &t);
+void test_json_parser(testing &t);
+void test_gbnf_generation(testing &t);
+void test_unicode(testing &t);
+void test_json_serialization(testing &t);