[router] refactor generate to use new pipeline arch (#11323)

This commit is contained in:
Simo Lin
2025-10-08 12:38:50 -04:00
committed by GitHub
parent d6837aea4d
commit 01c9ee1ab4
7 changed files with 713 additions and 1181 deletions

View File

@@ -2066,39 +2066,64 @@ impl GenerationRequest for GenerateRequest {
}
}
// TODO(generate): Define GenerateResponse and GenerateChoice structs
//
// Required for pipeline generate response processing (see grpc/pipeline.rs:931-964)
//
// #[derive(Debug, Clone, Serialize, Deserialize)]
// pub struct GenerateResponse {
// pub id: String,
// pub object: String, // "text.completion"
// pub created: u64,
// pub model: String,
// pub choices: Vec<GenerateChoice>,
// #[serde(skip_serializing_if = "Option::is_none")]
// pub usage: Option<Usage>,
// #[serde(skip_serializing_if = "Option::is_none")]
// pub system_fingerprint: Option<String>,
// }
//
// #[derive(Debug, Clone, Serialize, Deserialize)]
// pub struct GenerateChoice {
// pub index: u32,
// pub text: String,
// #[serde(skip_serializing_if = "Option::is_none")]
// pub output_ids: Option<Vec<u32>>,
// #[serde(skip_serializing_if = "Option::is_none")]
// pub finish_reason: Option<String>,
// #[serde(skip_serializing_if = "Option::is_none")]
// pub logprobs: Option<TopLogprobs>,
// #[serde(skip_serializing_if = "Option::is_none")]
// pub matched_stop: Option<Value>,
// }
//
// Note: Verify if similar structs already exist elsewhere before implementing.
// May need streaming variant (GenerateStreamResponse) as well.
// ============================================================================
// SGLang Generate Response Types
// ============================================================================
/// One completion returned by the SGLang generate endpoint.
///
/// A request with `n = 1` is answered with a single object of this shape:
///
/// ```json
/// {
///   "text": "...",
///   "output_ids": [...],
///   "meta_info": { ... }
/// }
/// ```
///
/// A request with `n > 1` is answered with a JSON array of such objects,
/// one element per completion:
///
/// ```json
/// [
///   {"text": "...", "output_ids": [...], "meta_info": {...}},
///   {"text": "...", "output_ids": [...], "meta_info": {...}}
/// ]
/// ```
///
/// This struct models a single element; the array form is a sequence of
/// `GenerateResponse` values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GenerateResponse {
    /// The `text` member of the completion object.
    pub text: String,
    /// The `output_ids` member of the completion object.
    // NOTE(review): presumably the token ids of the generated text — confirm
    // against the producer in the pipeline.
    pub output_ids: Vec<u32>,
    /// Per-completion metadata; see [`GenerateMetaInfo`].
    pub meta_info: GenerateMetaInfo,
}
/// Metadata attached to a single generate completion (the `meta_info` object).
///
/// Every `Option` field is left out of the serialized output when it is
/// `None`; all other fields are always emitted.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GenerateMetaInfo {
    /// Identifier carried in the `id` member.
    pub id: String,
    /// Why the completion finished; see [`GenerateFinishReason`].
    pub finish_reason: GenerateFinishReason,
    /// Value of the `prompt_tokens` member.
    // NOTE(review): presumably the number of tokens in the prompt — confirm.
    pub prompt_tokens: u32,
    /// Value of the `weight_version` member.
    pub weight_version: String,
    /// Optional log-probability data for the input tokens.
    // NOTE(review): the meaning of the inner `Vec<Option<f64>>` entries is not
    // visible here — confirm the layout against the producer.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub input_token_logprobs: Option<Vec<Vec<Option<f64>>>>,
    /// Optional log-probability data for the output tokens, with the same
    /// nesting as `input_token_logprobs`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub output_token_logprobs: Option<Vec<Vec<Option<f64>>>>,
    /// Value of the `completion_tokens` member.
    // NOTE(review): presumably the number of generated tokens — confirm.
    pub completion_tokens: u32,
    /// Value of the `cached_tokens` member.
    pub cached_tokens: u32,
    /// Value of the `e2e_latency` member.
    // NOTE(review): unit (seconds vs. milliseconds) is not visible here — confirm.
    pub e2e_latency: f64,
    /// Optional `matched_stop` value, kept as arbitrary JSON.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub matched_stop: Option<Value>,
}
/// Reason a generate completion finished.
///
/// The enum is internally tagged on a `"type"` key with lowercase variant
/// names, so the tagged variants serialize as:
///
/// - `Length { length }` → `{"type": "length", "length": <u32>}`
/// - `Stop` → `{"type": "stop"}`
///
/// `Other` is marked `untagged`: it serializes as the wrapped JSON value
/// itself, with no `"type"` key added. It is declared last so the tagged
/// variants come first.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "lowercase")]
pub enum GenerateFinishReason {
    /// Carries a `length` value alongside the `"length"` tag.
    // NOTE(review): presumably the token limit that was reached — confirm.
    Length { length: u32 },
    /// Tagged `"stop"`; carries no additional data.
    Stop,
    /// Any other finish reason, preserved as raw JSON.
    #[serde(untagged)]
    Other(Value),
}
// Constants for rerank API
/// The literal model name `"default"`.
// NOTE(review): presumably the fallback used when a rerank request does not
// name a model — confirm against the call sites.
pub const DEFAULT_MODEL_NAME: &str = "default";