diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index e49303937..8d325791a 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -391,7 +391,7 @@ class Scheduler( self.forward_ct = 0 self.forward_ct_decode = 0 self.num_generated_tokens = 0 - self.num_prefill_tokens = 0 + self.last_prefill_tokens = 0 self.last_decode_stats_tic = time.perf_counter() self.last_prefill_stats_tic = time.perf_counter() self.return_health_check_ct = 0 @@ -1194,8 +1194,8 @@ class Scheduler( ): gap_latency = time.perf_counter() - self.last_prefill_stats_tic self.last_prefill_stats_tic = time.perf_counter() - self.last_input_throughput = self.num_prefill_tokens / gap_latency - self.num_prefill_tokens = 0 + self.last_input_throughput = self.last_prefill_tokens / gap_latency + self.last_prefill_tokens = adder.log_input_tokens num_used = self.max_total_num_tokens - ( self.token_to_kv_pool_allocator.available_size() diff --git a/sgl-pdlb/src/server.rs b/sgl-pdlb/src/server.rs index 03af2694a..b763c743b 100644 --- a/sgl-pdlb/src/server.rs +++ b/sgl-pdlb/src/server.rs @@ -60,6 +60,17 @@ pub async fn generate( .await } +#[post("/v1/completions")] +pub async fn completions( + _req: HttpRequest, + req: web::Json, + app_state: web::Data, +) -> Result { + app_state + .generate("/v1/completions", Box::new(req.into_inner())) + .await +} + #[post("/v1/chat/completions")] pub async fn chat_completions( _req: HttpRequest, @@ -162,6 +173,7 @@ pub async fn startup(lb_config: LBConfig, lb_state: LBState) -> std::io::Result< .service(get_loads) .service(generate) .service(chat_completions) + .service(completions) }) .bind((lb_config.host, lb_config.port))? .run()