router: Support parallel sampling num > 1 in grpc_server and non-stream handling (#10929)

This commit is contained in:
Chang Su
2025-09-25 20:03:35 -07:00
committed by GitHub
parent 3e95aa1a09
commit 37158f2018
8 changed files with 281 additions and 135 deletions

View File

@@ -122,6 +122,9 @@ message GenerateRequest {
// For load balancing
int32 dp_balance_id = 17;
// Whether the client wants a streaming response
bool stream = 18;
}
message TokenizedInput {
@@ -163,8 +166,8 @@ message GenerateResponse {
}
message GenerateStreamChunk {
// Generated token
int32 token_id = 1;
// Generated tokens (incremental chunk)
repeated int32 token_ids = 1;
// Cumulative counts
int32 prompt_tokens = 2;