Improve test cases for eagle infer (#7173)
This commit is contained in:
@@ -31,8 +31,8 @@ suites = {
|
|||||||
TestFile("test_block_int8.py", 22),
|
TestFile("test_block_int8.py", 22),
|
||||||
TestFile("test_create_kvindices.py", 2),
|
TestFile("test_create_kvindices.py", 2),
|
||||||
TestFile("test_chunked_prefill.py", 313),
|
TestFile("test_chunked_prefill.py", 313),
|
||||||
TestFile("test_eagle_infer_a.py", 300),
|
TestFile("test_eagle_infer_a.py", 370),
|
||||||
TestFile("test_eagle_infer_b.py", 300),
|
TestFile("test_eagle_infer_b.py", 270),
|
||||||
TestFile("test_ebnf_constrained.py", 108),
|
TestFile("test_ebnf_constrained.py", 108),
|
||||||
TestFile("test_enable_thinking.py", 70),
|
TestFile("test_enable_thinking.py", 70),
|
||||||
TestFile("test_embedding_openai_server.py", 141),
|
TestFile("test_embedding_openai_server.py", 141),
|
||||||
|
|||||||
@@ -129,7 +129,7 @@ class TestEAGLEEngine(CustomTestCase):
|
|||||||
output["meta_info"]["completion_tokens"]
|
output["meta_info"]["completion_tokens"]
|
||||||
/ output["meta_info"]["e2e_latency"]
|
/ output["meta_info"]["e2e_latency"]
|
||||||
)
|
)
|
||||||
print(f"{acc_length=}")
|
print(f"{acc_length=:.4f}, {speed=}")
|
||||||
|
|
||||||
if engine.server_args.model_path == DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST:
|
if engine.server_args.model_path == DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST:
|
||||||
self.assertGreater(acc_length, 3.6)
|
self.assertGreater(acc_length, 3.6)
|
||||||
|
|||||||
@@ -10,7 +10,6 @@ from types import SimpleNamespace
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import requests
|
import requests
|
||||||
import torch
|
|
||||||
|
|
||||||
from sglang.srt.utils import kill_process_tree
|
from sglang.srt.utils import kill_process_tree
|
||||||
from sglang.test.few_shot_gsm8k import run_eval
|
from sglang.test.few_shot_gsm8k import run_eval
|
||||||
@@ -24,10 +23,6 @@ from sglang.test.test_utils import (
|
|||||||
run_logprob_check,
|
run_logprob_check,
|
||||||
)
|
)
|
||||||
|
|
||||||
torch_dtype = torch.float16
|
|
||||||
prefill_tolerance = 5e-2
|
|
||||||
decode_tolerance: float = 5e-2
|
|
||||||
|
|
||||||
|
|
||||||
class TestEAGLEServer(CustomTestCase):
|
class TestEAGLEServer(CustomTestCase):
|
||||||
PROMPTS = [
|
PROMPTS = [
|
||||||
@@ -202,7 +197,11 @@ class TestEAGLEServer(CustomTestCase):
|
|||||||
"""Test the output logprobs are close to the input logprobs if we run a prefill again."""
|
"""Test the output logprobs are close to the input logprobs if we run a prefill again."""
|
||||||
|
|
||||||
def run_generate(
|
def run_generate(
|
||||||
prompt, return_logprob=False, max_new_tokens=512, logprob_start_len=-1
|
prompt,
|
||||||
|
return_logprob=False,
|
||||||
|
max_new_tokens=512,
|
||||||
|
logprob_start_len=-1,
|
||||||
|
temperature=1.0,
|
||||||
):
|
):
|
||||||
|
|
||||||
if isinstance(prompt, str):
|
if isinstance(prompt, str):
|
||||||
@@ -215,20 +214,27 @@ class TestEAGLEServer(CustomTestCase):
|
|||||||
json={
|
json={
|
||||||
**prompt_kwargs,
|
**prompt_kwargs,
|
||||||
"sampling_params": {
|
"sampling_params": {
|
||||||
"temperature": 1.0,
|
"temperature": temperature,
|
||||||
"max_new_tokens": max_new_tokens,
|
"max_new_tokens": max_new_tokens,
|
||||||
"ignore_eos": True,
|
"ignore_eos": True,
|
||||||
},
|
},
|
||||||
"return_logprob": return_logprob,
|
"return_logprob": return_logprob,
|
||||||
"return_text_in_logprobs": True,
|
"return_text_in_logprobs": True,
|
||||||
"logprob_start_len": logprob_start_len,
|
"logprob_start_len": logprob_start_len,
|
||||||
|
"temp_scaled_logprobs": True,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
return response.json()
|
return response.json()
|
||||||
|
|
||||||
prompt = "I have a very good idea on how to"
|
prompt = "I have a very good idea on how to"
|
||||||
|
|
||||||
gen = run_generate(prompt, return_logprob=True, logprob_start_len=0)
|
for temperature in [1.0]:
|
||||||
|
gen = run_generate(
|
||||||
|
prompt,
|
||||||
|
return_logprob=True,
|
||||||
|
logprob_start_len=0,
|
||||||
|
temperature=temperature,
|
||||||
|
)
|
||||||
output_logprobs = np.array(
|
output_logprobs = np.array(
|
||||||
[x[0] for x in gen["meta_info"]["output_token_logprobs"]]
|
[x[0] for x in gen["meta_info"]["output_token_logprobs"]]
|
||||||
)
|
)
|
||||||
@@ -239,12 +245,18 @@ class TestEAGLEServer(CustomTestCase):
|
|||||||
|
|
||||||
new_prompt = input_tokens + output_tokens
|
new_prompt = input_tokens + output_tokens
|
||||||
score = run_generate(
|
score = run_generate(
|
||||||
new_prompt, return_logprob=True, logprob_start_len=0, max_new_tokens=0
|
new_prompt,
|
||||||
|
return_logprob=True,
|
||||||
|
logprob_start_len=0,
|
||||||
|
max_new_tokens=0,
|
||||||
|
temperature=temperature,
|
||||||
)
|
)
|
||||||
output_logprobs_score = np.array(
|
output_logprobs_score = np.array(
|
||||||
[
|
[
|
||||||
x[0]
|
x[0]
|
||||||
for x in score["meta_info"]["input_token_logprobs"][num_prompts_tokens:]
|
for x in score["meta_info"]["input_token_logprobs"][
|
||||||
|
num_prompts_tokens:
|
||||||
|
]
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -253,7 +265,7 @@ class TestEAGLEServer(CustomTestCase):
|
|||||||
|
|
||||||
diff = np.abs(output_logprobs - output_logprobs_score)
|
diff = np.abs(output_logprobs - output_logprobs_score)
|
||||||
max_diff = np.max(diff)
|
max_diff = np.max(diff)
|
||||||
self.assertLess(max_diff, 0.25)
|
self.assertLess(max_diff, 0.255)
|
||||||
|
|
||||||
def test_logprob_mixed(self):
|
def test_logprob_mixed(self):
|
||||||
args = []
|
args = []
|
||||||
|
|||||||
Reference in New Issue
Block a user