From 368fd20622a8055b53992602dc8ce6c994e8367e Mon Sep 17 00:00:00 2001 From: Simo Lin Date: Wed, 8 Oct 2025 22:23:08 -0400 Subject: [PATCH] [router][grpc] disable health check generation and increase timeout (#11353) --- python/sglang/srt/entrypoints/grpc_server.py | 76 +------------------ .../src/grpc_client/sglang_scheduler.rs | 2 +- 2 files changed, 5 insertions(+), 73 deletions(-) diff --git a/python/sglang/srt/entrypoints/grpc_server.py b/python/sglang/srt/entrypoints/grpc_server.py index 8a090da90..e94e0e813 100644 --- a/python/sglang/srt/entrypoints/grpc_server.py +++ b/python/sglang/srt/entrypoints/grpc_server.py @@ -313,78 +313,10 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer) request: sglang_scheduler_pb2.HealthCheckRequest, context: grpc.aio.ServicerContext, ) -> sglang_scheduler_pb2.HealthCheckResponse: - """Health check by generating from client input.""" - try: - # Check if request manager is shutting down - if self.request_manager.gracefully_exit: - return sglang_scheduler_pb2.HealthCheckResponse( - healthy=False, message="Server shutting down" - ) - - # Extract tokenized input from request - if not request.HasField("tokenized"): - return sglang_scheduler_pb2.HealthCheckResponse( - healthy=False, message="Tokenized input required for health check" - ) - - input_text = request.tokenized.original_text - input_ids = list(request.tokenized.input_ids) - - # Create health check request - rid = f"HEALTH_CHECK_GRPC_{time.time()}" - - health_request = TokenizedGenerateReqInput( - rid=rid, - input_text=input_text, - input_ids=input_ids, - sampling_params=SGLSamplingParams(max_new_tokens=1, temperature=0.0), - stream=False, - mm_inputs=None, - return_logprob=False, - logprob_start_len=-1, - top_logprobs_num=0, - token_ids_logprob=None, - ) - - if self.server_args.disaggregation_mode != DisaggregationMode.NULL: - health_request.bootstrap_host = FAKE_BOOTSTRAP_HOST - health_request.bootstrap_room = 0 - - logger.debug(f"Receive health check request: {rid}") - - # Submit and wait for response - output_generator = self.request_manager.generate_request( - health_request, request_id=rid - ) - - try: - # Get first response with timeout - response = await asyncio.wait_for( - output_generator.__anext__(), timeout=HEALTH_CHECK_TIMEOUT - ) - - # Clean up - if rid in self.request_manager.rid_to_state: - del self.request_manager.rid_to_state[rid] - - return sglang_scheduler_pb2.HealthCheckResponse( - healthy=True, message="Health check passed" - ) - - except asyncio.TimeoutError: - # Clean up on timeout - if rid in self.request_manager.rid_to_state: - del self.request_manager.rid_to_state[rid] - - return sglang_scheduler_pb2.HealthCheckResponse( - healthy=False, message="Health check timeout" - ) - - except Exception as e: - logger.error(f"Health check failed: {e}\n{get_exception_traceback()}") - return sglang_scheduler_pb2.HealthCheckResponse( - healthy=False, message=f"Health check error: {str(e)}" - ) + """Health check - always returns healthy after server started.""" + return sglang_scheduler_pb2.HealthCheckResponse( + healthy=True, message="Health check passed" + ) async def Abort( self, diff --git a/sgl-router/src/grpc_client/sglang_scheduler.rs b/sgl-router/src/grpc_client/sglang_scheduler.rs index ab94c4c07..a9d9a11f1 100644 --- a/sgl-router/src/grpc_client/sglang_scheduler.rs +++ b/sgl-router/src/grpc_client/sglang_scheduler.rs @@ -35,7 +35,7 @@ impl SglangSchedulerClient { }; let channel = Channel::from_shared(http_endpoint)? - .timeout(Duration::from_secs(30)) + .timeout(Duration::from_secs(3600)) .http2_keep_alive_interval(Duration::from_secs(30)) .keep_alive_timeout(Duration::from_secs(10)) .keep_alive_while_idle(true)