Flashinfer sample kernel (#617)

2024-07-17 13:24:43 -07:00
parent 4efcc59d4f
commit 3de2f30a27
4 changed files with 17 additions and 30 deletions
--- a/python/sglang/bench_latency.py
+++ b/python/sglang/bench_latency.py
@@ -156,14 +156,14 @@ def extend(reqs, model_runner):
    )
    batch.prepare_for_extend(model_runner.model_config.vocab_size, None)
    output = model_runner.forward(batch, ForwardMode.EXTEND)
-    next_token_ids, _ = batch.sample(output.next_token_logits)
+    next_token_ids = batch.sample(output.next_token_logits)
    return next_token_ids, output.next_token_logits, batch


 def decode(input_token_ids, batch, model_runner):
    batch.prepare_for_decode(input_token_ids.cpu().numpy())
    output = model_runner.forward(batch, ForwardMode.DECODE)
-    next_token_ids, _ = batch.sample(output.next_token_logits)
+    next_token_ids = batch.sample(output.next_token_logits)
    return next_token_ids, output.next_token_logits