[Eagle] Remove the greedy branch and some redundant code (#4363)
Co-authored-by: Sehoon Kim <sehoon@x.ai>
This commit is contained in:
@@ -122,8 +122,8 @@ class TestEAGLEEngine(unittest.TestCase):

 def _test_acc_length(self, engine):
 prompt = [
-"Human: Give me a fully functional FastAPI server. Show the python code.\n\nAssistant:"
-] * 5
+"Human: Give me a fully functional FastAPI server. Show the python code.\n\nAssistant:",
+] * 5 # test batched generation
 sampling_params = {"temperature": 0, "max_new_tokens": 512}
 output = engine.generate(prompt, sampling_params)
 output = output[0]
@@ -67,7 +67,7 @@ class TestFlashinferMLANoRagged(unittest.TestCase):
 "--enable-torch-compile",
 "--disable-cuda-graph",
 "--cuda-graph-max-bs",
-"2",
+"4",
 "--enable-flashinfer-mla",
 "--flashinfer-mla-disable-ragged",
 ]
@@ -109,7 +109,7 @@ class TestFlashinferMLAMTP(unittest.TestCase):
 other_args.extend(
 [
 "--cuda-graph-max-bs",
-"2",
+"4",
 "--disable-radix",
 "--enable-torch-compile",
 "--torch-compile-max-bs",
Reference in New Issue
Block a user