sync from b7516

This commit is contained in:
2026-01-16 11:16:14 +08:00
parent f4ae4cc7da
commit 6ee41dd9e3
380 changed files with 18435 additions and 38806 deletions

View File

@@ -434,8 +434,8 @@ def test_context_size_exceeded_stream():
@pytest.mark.parametrize(
"n_batch,batch_count,reuse_cache",
[
(64, 4, False),
(64, 2, True),
(64, 3, False),
(64, 1, True),
]
)
def test_return_progress(n_batch, batch_count, reuse_cache):
@@ -462,18 +462,10 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
res = make_cmpl_request()
last_progress = None
total_batch_count = 0
for data in res:
cur_progress = data.get("prompt_progress", None)
if cur_progress is None:
continue
if total_batch_count == 0:
# first progress report must have n_cache == n_processed
assert cur_progress["total"] > 0
assert cur_progress["cache"] == cur_progress["processed"]
if reuse_cache:
# when reusing cache, we expect some cached tokens
assert cur_progress["cache"] > 0
if last_progress is not None:
assert cur_progress["total"] == last_progress["total"]
assert cur_progress["cache"] == last_progress["cache"]
@@ -481,7 +473,6 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
total_batch_count += 1
last_progress = cur_progress
# last progress should indicate completion (all tokens processed)
assert last_progress is not None
assert last_progress["total"] > 0
assert last_progress["processed"] == last_progress["total"]
@@ -491,22 +482,17 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
def test_chat_completions_multiple_choices():
    """Request chat completions with n=2 and verify both returned choices.

    NOTE(review): this text was recovered from a rendered diff page with the
    +/- markers and indentation stripped; the two request blocks below appear
    to be the post-change variant (looped, forcing "id_slot") and the
    pre-change variant of the same test concatenated — confirm the intended
    final form against the repository history before relying on this code.
    """
    global server
    server.start()
    # make sure cache can be reused across multiple choices and multiple requests
    # ref: https://github.com/ggml-org/llama.cpp/pull/18663
    for _ in range(2):
        res = server.make_request("POST", "/chat/completions", data={
            "max_tokens": 8,
            "n": 2,
            "messages": [
                {"role": "system", "content": "Book"},
                {"role": "user", "content": "What is the best book"},
            ],
            # test forcing the same slot to be used
            # the scheduler should not be locked up in this case
            "id_slot": 0,
        })
        assert res.status_code == 200
        # n=2 must yield exactly two choices
        assert len(res.body["choices"]) == 2
        # every choice is an assistant message truncated at max_tokens
        for choice in res.body["choices"]:
            assert "assistant" == choice["message"]["role"]
            assert choice["finish_reason"] == "length"
    # NOTE(review): second request block — presumably the pre-change variant
    # of the same check (no "id_slot", plus an exact-content regex match).
    res = server.make_request("POST", "/chat/completions", data={
        "max_tokens": 8,
        "n": 2,
        "messages": [
            {"role": "system", "content": "Book"},
            {"role": "user", "content": "What is the best book"},
        ],
    })
    assert res.status_code == 200
    assert len(res.body["choices"]) == 2
    for choice in res.body["choices"]:
        assert "assistant" == choice["message"]["role"]
        # content match — assumes deterministic sampling for this prompt;
        # verify against the server's test fixture configuration
        assert match_regex("Suddenly", choice["message"]["content"])
        assert choice["finish_reason"] == "length"

View File

@@ -805,92 +805,3 @@ def test_anthropic_vs_openai_different_response_format():
assert "input_tokens" in anthropic_res.body["usage"]
assert "completion_tokens" in openai_res.body["usage"]
assert "output_tokens" in anthropic_res.body["usage"]
# Extended thinking tests with reasoning models
@pytest.mark.slow
@pytest.mark.parametrize("stream", [False, True])
def test_anthropic_thinking_with_reasoning_model(stream):
    """Test that thinking content blocks are properly returned for reasoning models.

    Downloads a DeepSeek-R1-Distill model, enables the "deepseek" reasoning
    format, and exercises the Anthropic-style /v1/messages endpoint once in
    streaming mode and once non-streaming.

    NOTE(review): indentation was reconstructed from a flattened diff view —
    verify block nesting against the original file.
    """
    global server
    server = ServerProcess()
    server.model_hf_repo = "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF"
    server.model_hf_file = "DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf"
    # route <think> output through the deepseek reasoning parser
    server.reasoning_format = "deepseek"
    server.jinja = True
    server.n_ctx = 8192
    server.n_predict = 1024
    server.server_port = 8084
    server.start(timeout_seconds=600)  # large model needs time to download
    if stream:
        res = server.make_stream_request("POST", "/v1/messages", data={
            "model": "test",
            "max_tokens": 1024,
            "thinking": {
                "type": "enabled",
                "budget_tokens": 500
            },
            "messages": [
                {"role": "user", "content": "What is 2+2?"}
            ],
            "stream": True
        })
        # materialize all SSE events so each block type can be inspected
        events = list(res)
        # should have thinking content block events
        thinking_starts = [e for e in events if
                           e.get("type") == "content_block_start" and
                           e.get("content_block", {}).get("type") == "thinking"]
        assert len(thinking_starts) > 0, "Should have thinking content_block_start event"
        assert thinking_starts[0]["index"] == 0, "Thinking block should be at index 0"
        # should have thinking_delta events
        thinking_deltas = [e for e in events if
                           e.get("type") == "content_block_delta" and
                           e.get("delta", {}).get("type") == "thinking_delta"]
        assert len(thinking_deltas) > 0, "Should have thinking_delta events"
        # should have signature_delta event before thinking block closes (Anthropic API requirement)
        signature_deltas = [e for e in events if
                            e.get("type") == "content_block_delta" and
                            e.get("delta", {}).get("type") == "signature_delta"]
        assert len(signature_deltas) > 0, "Should have signature_delta event for thinking block"
        # should have text block after thinking
        text_starts = [e for e in events if
                       e.get("type") == "content_block_start" and
                       e.get("content_block", {}).get("type") == "text"]
        assert len(text_starts) > 0, "Should have text content_block_start event"
        assert text_starts[0]["index"] == 1, "Text block should be at index 1 (after thinking)"
    else:
        res = server.make_request("POST", "/v1/messages", data={
            "model": "test",
            "max_tokens": 1024,
            "thinking": {
                "type": "enabled",
                "budget_tokens": 500
            },
            "messages": [
                {"role": "user", "content": "What is 2+2?"}
            ]
        })
        assert res.status_code == 200
        assert res.body["type"] == "message"
        content = res.body["content"]
        assert len(content) >= 2, "Should have at least thinking and text blocks"
        # first block should be thinking
        thinking_blocks = [b for b in content if b.get("type") == "thinking"]
        assert len(thinking_blocks) > 0, "Should have thinking content block"
        assert "thinking" in thinking_blocks[0], "Thinking block should have 'thinking' field"
        assert len(thinking_blocks[0]["thinking"]) > 0, "Thinking content should not be empty"
        assert "signature" in thinking_blocks[0], "Thinking block should have 'signature' field (Anthropic API requirement)"
        # should also have text block
        text_blocks = [b for b in content if b.get("type") == "text"]
        assert len(text_blocks) > 0, "Should have text content block"

View File

@@ -393,12 +393,12 @@ def test_completion_unified(n_ctx, n_slots, n_predict_vals, expected_success):
for res, n_predict, expect_ok in zip(results, n_predict_vals, expected_success):
if expect_ok:
assert res.status_code == 200
# note: https://github.com/ggml-org/llama.cpp/pull/18700#issuecomment-3728695581
if res.status_code == 200:
assert "content" in res.body
if "timings" in res.body:
assert res.body["timings"]["predicted_n"] == n_predict
else:
assert res.status_code == 500
assert "content" not in res.body
@pytest.mark.parametrize(