[router][grpc] Cleanup debug logs in grpc_server and grpc_router (#11340)

2025-10-08 13:26:19 -07:00
parent 677aa0e25f
commit a65ca73911
5 changed files with 31 additions and 102 deletions
--- a/sgl-router/src/reasoning_parser/parsers/base.rs
+++ b/sgl-router/src/reasoning_parser/parsers/base.rs
@@ -2,7 +2,6 @@
 // for detecting and extracting reasoning blocks from text.

 use crate::reasoning_parser::traits::{ParseError, ParserConfig, ParserResult, ReasoningParser};
-use tracing as log;

 /// Base reasoning parser implementation.
 ///
@@ -46,18 +45,14 @@ impl BaseReasoningParser {

 impl ReasoningParser for BaseReasoningParser {
    fn detect_and_parse_reasoning(&mut self, text: &str) -> Result<ParserResult, ParseError> {
-        log::debug!("detect_and_parse_reasoning called with text: {:?}", text);
-
        // Check input size against buffer limit
        if text.len() > self.config.max_buffer_size {
            return Err(ParseError::BufferOverflow(text.len()));
        }

        let in_reasoning = self.in_reasoning || text.contains(&self.config.think_start_token);
-        log::debug!("in_reasoning: {}", in_reasoning);

        if !in_reasoning {
-            log::debug!("No reasoning detected, returning normal text.");
            return Ok(ParserResult::normal(text.to_string()));
        }

@@ -66,15 +61,8 @@ impl ReasoningParser for BaseReasoningParser {
            .replace(&self.config.think_start_token, "")
            .trim()
            .to_string();
-        log::debug!(
-            "Processed text after removing think_start_token: {:?}",
-            processed_text
-        );

        if !processed_text.contains(&self.config.think_end_token) {
-            log::debug!(
-                "Reasoning truncated, think_end_token not found. Returning reasoning text."
-            );
            // Assume reasoning was truncated before end token
            return Ok(ParserResult::reasoning(processed_text));
        }
@@ -89,9 +77,6 @@ impl ReasoningParser for BaseReasoningParser {
            .map(|s| s.trim().to_string())
            .unwrap_or_default();

-        log::debug!("Extracted reasoning_text: {:?}", reasoning_text);
-        log::debug!("Extracted normal_text: {:?}", normal_text);
-
        Ok(ParserResult::new(normal_text, reasoning_text))
    }

@@ -108,19 +93,6 @@ impl ReasoningParser for BaseReasoningParser {
        self.buffer.push_str(text);
        let mut current_text = self.buffer.clone();

-        log::debug!(
-            "parse_reasoning_streaming_incremental called with text: {:?}",
-            text
-        );
-        log::debug!("current buffer: {:?}", self.buffer);
-        log::debug!("current_text: {:?}", current_text);
-        log::debug!(
-            "in_reasoning: {}, stripped_think_start: {}, stream_reasoning: {}",
-            self.in_reasoning,
-            self.stripped_think_start,
-            self.config.stream_reasoning
-        );
-
        // If the current text is a prefix of a token, keep buffering
        if self.is_partial_token(&current_text) {
            return Ok(ParserResult::default());
--- a/sgl-router/src/routers/grpc/pipeline.rs
+++ b/sgl-router/src/routers/grpc/pipeline.rs
@@ -56,8 +56,6 @@ pub struct PreparationStage;
 #[async_trait]
 impl PipelineStage for PreparationStage {
    async fn execute(&self, ctx: &mut RequestContext) -> Result<Option<Response>, Response> {
-        debug!("Stage {}: Processing request", self.name());
-
        // Clone Arc before match to avoid borrow checker issues
        // (matching borrows ctx, but prepare_* methods need mutable borrow)
        // Arc clone is cheap (8 bytes) - avoids full request clone (15KB-200KB)
@@ -109,7 +107,6 @@ impl PreparationStage {
        };

        let token_ids = encoding.token_ids().to_vec();
-        debug!("Tokenized {} tokens from input", token_ids.len());

        // Step 4: Build tool constraints if needed
        let tool_call_constraint = body_ref.tools.as_ref().and_then(|tools| {
@@ -157,8 +154,6 @@ impl PreparationStage {
            }
        };

-        debug!("Resolved input with {} tokens", token_ids.len());
-
        // Create stop sequence decoder for generate requests
        let params = request.sampling_params.as_ref();
        let stop_decoder = utils::create_stop_decoder(
@@ -259,8 +254,6 @@ impl WorkerSelectionStage {
 #[async_trait]
 impl PipelineStage for WorkerSelectionStage {
    async fn execute(&self, ctx: &mut RequestContext) -> Result<Option<Response>, Response> {
-        debug!("Stage {}: Selecting workers", self.name());
-
        let prep = ctx
            .state
            .preparation
@@ -414,8 +407,6 @@ pub struct ClientAcquisitionStage;
 #[async_trait]
 impl PipelineStage for ClientAcquisitionStage {
    async fn execute(&self, ctx: &mut RequestContext) -> Result<Option<Response>, Response> {
-        debug!("Stage {}: Acquiring gRPC clients", self.name());
-
        let workers = ctx
            .state
            .workers
@@ -464,8 +455,6 @@ impl RequestBuildingStage {
 #[async_trait]
 impl PipelineStage for RequestBuildingStage {
    async fn execute(&self, ctx: &mut RequestContext) -> Result<Option<Response>, Response> {
-        debug!("Stage {}: Building proto request", self.name());
-
        let prep = ctx
            .state
            .preparation
@@ -578,8 +567,6 @@ pub struct DispatchMetadataStage;
 #[async_trait]
 impl PipelineStage for DispatchMetadataStage {
    async fn execute(&self, ctx: &mut RequestContext) -> Result<Option<Response>, Response> {
-        debug!("Stage {}: Preparing dispatch metadata", self.name());
-
        let proto_request = ctx
            .state
            .proto_request
@@ -656,8 +643,6 @@ impl RequestExecutionStage {
 #[async_trait]
 impl PipelineStage for RequestExecutionStage {
    async fn execute(&self, ctx: &mut RequestContext) -> Result<Option<Response>, Response> {
-        debug!("Stage {}: Executing gRPC request", self.name());
-
        let proto_request = ctx
            .state
            .proto_request
@@ -713,8 +698,6 @@ impl RequestExecutionStage {
            .dual_mut()
            .ok_or_else(|| utils::internal_error_static("Expected dual clients but got single"))?;

-        debug!("Sending concurrent requests to prefill and decode workers");
-
        let prefill_request = proto_request.clone();
        let decode_request = proto_request;

@@ -780,8 +763,6 @@ impl ResponseProcessingStage {
 #[async_trait]
 impl PipelineStage for ResponseProcessingStage {
    async fn execute(&self, ctx: &mut RequestContext) -> Result<Option<Response>, Response> {
-        debug!("Stage {}: Processing response", self.name());
-
        // Delegate to request-type specific processing
        match &ctx.input.request_type {
            RequestType::Chat(_) => return self.process_chat_response(ctx).await,
@@ -1199,15 +1180,9 @@ impl ChatCompletionPipeline {

        // Execute each stage in sequence
        for (idx, stage) in self.stages.iter().enumerate() {
-            debug!("Executing stage {}: {}", idx + 1, stage.name());
            match stage.execute(&mut ctx).await {
                Ok(Some(response)) => {
                    // Stage completed successfully with a response (e.g., streaming)
-                    debug!(
-                        "Stage {} ({}) completed with response",
-                        idx + 1,
-                        stage.name()
-                    );
                    return response;
                }
                Ok(None) => {
@@ -1249,15 +1224,9 @@ impl ChatCompletionPipeline {

        // Execute each stage in sequence
        for (idx, stage) in self.stages.iter().enumerate() {
-            debug!("Executing stage {}: {}", idx + 1, stage.name());
            match stage.execute(&mut ctx).await {
                Ok(Some(response)) => {
                    // Stage completed successfully with a response (e.g., streaming)
-                    debug!(
-                        "Stage {} ({}) completed with response",
-                        idx + 1,
-                        stage.name()
-                    );
                    return response;
                }
                Ok(None) => {
--- a/sgl-router/src/routers/grpc/utils.rs
+++ b/sgl-router/src/routers/grpc/utils.rs
@@ -21,7 +21,7 @@ use serde_json::{json, Map, Value};
 use std::collections::HashMap;
 use std::sync::Arc;
 use tonic::codec::Streaming;
-use tracing::{debug, error, warn};
+use tracing::{error, warn};
 use uuid::Uuid;

 /// Get gRPC client from worker, returning appropriate error response on failure
@@ -602,10 +602,6 @@ pub async fn collect_stream_responses(
            Ok(gen_response) => {
                match gen_response.response {
                    Some(Complete(complete)) => {
-                        debug!(
-                        "{} completed: prompt_tokens={}, completion_tokens={}, finish_reason={}",
-                        worker_name, complete.prompt_tokens, complete.completion_tokens, complete.finish_reason
-                    );
                        all_responses.push(complete);
                    }
                    Some(Error(err)) => {
@@ -615,11 +611,11 @@ pub async fn collect_stream_responses(
                            worker_name, err.message
                        )));
                    }
-                    Some(Chunk(chunk)) => {
-                        debug!("{} chunk: {} tokens", worker_name, chunk.token_ids.len());
+                    Some(Chunk(_chunk)) => {
+                        // Streaming chunk - no action needed
                    }
                    None => {
-                        debug!("{}: empty response", worker_name);
+                        // Empty response - no action needed
                    }
                }
            }
@@ -633,7 +629,6 @@ pub async fn collect_stream_responses(
        }
    }

-    debug!("{} stream closed", worker_name);
    Ok(all_responses)
 }