sync from b7516
@@ -434,8 +434,8 @@ def test_context_size_exceeded_stream():
@pytest.mark.parametrize(
    "n_batch,batch_count,reuse_cache",
    [
        (64, 4, False),
        (64, 2, True),
        (64, 3, False),
        (64, 1, True),
    ]
)
def test_return_progress(n_batch, batch_count, reuse_cache):
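For context (not part of the diff): a minimal sketch of the relationship these parameters appear to encode, assuming the prompt is sized to span batch_count batches of n_batch tokens and the server emits one prompt_progress report per prompt-processing batch -- both assumptions, not confirmed by this hunk:

    # illustration only; prompt sizing and one-report-per-batch are assumptions
    import math

    def expected_progress_reports(n_prompt: int, n_batch: int) -> int:
        return math.ceil(n_prompt / n_batch)

    assert expected_progress_reports(64 * 4, 64) == 4  # matches (64, 4, ...)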
@@ -462,18 +462,10 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
    res = make_cmpl_request()
    last_progress = None
    total_batch_count = 0

    for data in res:
        cur_progress = data.get("prompt_progress", None)
        if cur_progress is None:
            continue
        if total_batch_count == 0:
            # first progress report must have n_cache == n_processed
            assert cur_progress["total"] > 0
            assert cur_progress["cache"] == cur_progress["processed"]
            if reuse_cache:
                # when reusing cache, we expect some cached tokens
                assert cur_progress["cache"] > 0
        if last_progress is not None:
            assert cur_progress["total"] == last_progress["total"]
            assert cur_progress["cache"] == last_progress["cache"]
@@ -481,7 +473,6 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
        total_batch_count += 1
        last_progress = cur_progress

    # last progress should indicate completion (all tokens processed)
    assert last_progress is not None
    assert last_progress["total"] > 0
    assert last_progress["processed"] == last_progress["total"]
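For reference, a hedged sketch of the chunk shape the assertions above imply; the field names ("total", "cache", "processed") come from the test, while the concrete values and the cumulative growth of "processed" are assumptions:

    # hypothetical streamed chunk, consistent with the assertions above
    data = {
        "prompt_progress": {
            "total": 256,      # constant across all reports
            "cache": 64,       # constant; > 0 when reuse_cache is set
            "processed": 128,  # assumed to grow until it equals "total"
        }
    }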
@@ -491,22 +482,17 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
def test_chat_completions_multiple_choices():
    global server
    server.start()
    # make sure cache can be reused across multiple choices and multiple requests
    # ref: https://github.com/ggml-org/llama.cpp/pull/18663
    for _ in range(2):
        res = server.make_request("POST", "/chat/completions", data={
            "max_tokens": 8,
            "n": 2,
            "messages": [
                {"role": "system", "content": "Book"},
                {"role": "user", "content": "What is the best book"},
            ],
            # test forcing the same slot to be used
            # the scheduler should not be locked up in this case
            "id_slot": 0,
        })
        assert res.status_code == 200
        assert len(res.body["choices"]) == 2
        for choice in res.body["choices"]:
            assert "assistant" == choice["message"]["role"]
            assert choice["finish_reason"] == "length"
    res = server.make_request("POST", "/chat/completions", data={
        "max_tokens": 8,
        "n": 2,
        "messages": [
            {"role": "system", "content": "Book"},
            {"role": "user", "content": "What is the best book"},
        ],
    })
    assert res.status_code == 200
    assert len(res.body["choices"]) == 2
    for choice in res.body["choices"]:
        assert "assistant" == choice["message"]["role"]
        assert match_regex("Suddenly", choice["message"]["content"])
        assert choice["finish_reason"] == "length"

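A hedged sketch of the response shape these assertions check: two choices from "n": 2, both truncated at max_tokens; the content strings here are placeholders:

    # hypothetical body; only the asserted fields are meaningful
    res_body = {
        "choices": [
            {"message": {"role": "assistant", "content": "..."}, "finish_reason": "length"},
            {"message": {"role": "assistant", "content": "..."}, "finish_reason": "length"},
        ]
    }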
@@ -805,92 +805,3 @@ def test_anthropic_vs_openai_different_response_format():
    assert "input_tokens" in anthropic_res.body["usage"]
    assert "completion_tokens" in openai_res.body["usage"]
    assert "output_tokens" in anthropic_res.body["usage"]


# Extended thinking tests with reasoning models

@pytest.mark.slow
@pytest.mark.parametrize("stream", [False, True])
def test_anthropic_thinking_with_reasoning_model(stream):
    """Test that thinking content blocks are properly returned for reasoning models"""
    global server
    server = ServerProcess()
    server.model_hf_repo = "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF"
    server.model_hf_file = "DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf"
    server.reasoning_format = "deepseek"
    server.jinja = True
    server.n_ctx = 8192
    server.n_predict = 1024
    server.server_port = 8084
    server.start(timeout_seconds=600)  # large model needs time to download

    if stream:
        res = server.make_stream_request("POST", "/v1/messages", data={
            "model": "test",
            "max_tokens": 1024,
            "thinking": {
                "type": "enabled",
                "budget_tokens": 500
            },
            "messages": [
                {"role": "user", "content": "What is 2+2?"}
            ],
            "stream": True
        })

        events = list(res)

        # should have thinking content block events
        thinking_starts = [e for e in events if
                           e.get("type") == "content_block_start" and
                           e.get("content_block", {}).get("type") == "thinking"]
        assert len(thinking_starts) > 0, "Should have thinking content_block_start event"
        assert thinking_starts[0]["index"] == 0, "Thinking block should be at index 0"

        # should have thinking_delta events
        thinking_deltas = [e for e in events if
                           e.get("type") == "content_block_delta" and
                           e.get("delta", {}).get("type") == "thinking_delta"]
        assert len(thinking_deltas) > 0, "Should have thinking_delta events"

        # should have signature_delta event before thinking block closes (Anthropic API requirement)
        signature_deltas = [e for e in events if
                            e.get("type") == "content_block_delta" and
                            e.get("delta", {}).get("type") == "signature_delta"]
        assert len(signature_deltas) > 0, "Should have signature_delta event for thinking block"

        # should have text block after thinking
        text_starts = [e for e in events if
                       e.get("type") == "content_block_start" and
                       e.get("content_block", {}).get("type") == "text"]
        assert len(text_starts) > 0, "Should have text content_block_start event"
        assert text_starts[0]["index"] == 1, "Text block should be at index 1 (after thinking)"
    else:
        res = server.make_request("POST", "/v1/messages", data={
            "model": "test",
            "max_tokens": 1024,
            "thinking": {
                "type": "enabled",
                "budget_tokens": 500
            },
            "messages": [
                {"role": "user", "content": "What is 2+2?"}
            ]
        })

        assert res.status_code == 200
        assert res.body["type"] == "message"

        content = res.body["content"]
        assert len(content) >= 2, "Should have at least thinking and text blocks"

        # first block should be thinking
        thinking_blocks = [b for b in content if b.get("type") == "thinking"]
        assert len(thinking_blocks) > 0, "Should have thinking content block"
        assert "thinking" in thinking_blocks[0], "Thinking block should have 'thinking' field"
        assert len(thinking_blocks[0]["thinking"]) > 0, "Thinking content should not be empty"
        assert "signature" in thinking_blocks[0], "Thinking block should have 'signature' field (Anthropic API requirement)"

        # should also have text block
        text_blocks = [b for b in content if b.get("type") == "text"]
        assert len(text_blocks) > 0, "Should have text content block"

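For orientation, a hedged sketch of the streaming event order the assertions above encode. The event and field names follow the Anthropic Messages API; the concrete payloads, and the content_block_stop event (which the test does not assert), are illustrative:

    # hypothetical event sequence: one thinking block, then a text block
    events = [
        {"type": "content_block_start", "index": 0, "content_block": {"type": "thinking"}},
        {"type": "content_block_delta", "index": 0, "delta": {"type": "thinking_delta", "thinking": "..."}},
        {"type": "content_block_delta", "index": 0, "delta": {"type": "signature_delta", "signature": "..."}},
        {"type": "content_block_stop", "index": 0},
        {"type": "content_block_start", "index": 1, "content_block": {"type": "text"}},
    ]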
@@ -393,12 +393,12 @@ def test_completion_unified(n_ctx, n_slots, n_predict_vals, expected_success):
    for res, n_predict, expect_ok in zip(results, n_predict_vals, expected_success):
        if expect_ok:
            assert res.status_code == 200

            # note: https://github.com/ggml-org/llama.cpp/pull/18700#issuecomment-3728695581
            if res.status_code == 200:
                assert "content" in res.body
                if "timings" in res.body:
                    assert res.body["timings"]["predicted_n"] == n_predict
        else:
            assert res.status_code == 500
            assert "content" not in res.body

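A hedged sketch of how the three parallel lists line up in the loop above; the names come from the test, while the values and list length are placeholders:

    # hypothetical inputs: one entry per parallel completion request
    results = [res_a, res_b]           # responses, in request order
    n_predict_vals = [16, 16]          # per-request n_predict
    expected_success = [True, False]   # whether each request is expected to succeed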
@pytest.mark.parametrize(