From 066cf44546960540a52b3f32f6840f9492147b50 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Sun, 18 May 2025 13:05:38 -0700 Subject: [PATCH] [OAI] Add rid tracing for v1/embeddings and fix rid type in Chat (#6397) --- python/sglang/srt/layers/attention/flashattention_backend.py | 4 ++-- python/sglang/srt/openai_api/adapter.py | 2 ++ python/sglang/srt/openai_api/protocol.py | 5 ++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/layers/attention/flashattention_backend.py b/python/sglang/srt/layers/attention/flashattention_backend.py index 4ce337580..9b47509b2 100644 --- a/python/sglang/srt/layers/attention/flashattention_backend.py +++ b/python/sglang/srt/layers/attention/flashattention_backend.py @@ -918,8 +918,8 @@ class FlashAttentionBackend(AttentionBackend): and local_attn_metadata is not None and (hasattr(layer, "use_irope") and layer.use_irope) ) - - # When Spec Decode enabled, forward_decode would be called with two mode:
 + + # When Spec Decode enabled, forward_decode would be called with two mode: # 1. DRAFT_DECODE: we enable cascade attention when top_k > 1 # 2. IDLE: we don’t need cascade attention, spec_info will be none in this case use_cascade_attn = forward_batch.spec_info is not None and self.topk > 1 diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index c24979ae8..1b37d0219 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -1827,8 +1827,10 @@ def v1_embedding_request(all_requests, tokenizer_manager): ) else: prompt_kwargs = {"input_ids": prompts} + request_ids = [req.request_id for req in all_requests] adapted_request = EmbeddingReqInput( + rid=request_ids, **prompt_kwargs, ) diff --git a/python/sglang/srt/openai_api/protocol.py b/python/sglang/srt/openai_api/protocol.py index 0cdab70cd..7c40a70dc 100644 --- a/python/sglang/srt/openai_api/protocol.py +++ b/python/sglang/srt/openai_api/protocol.py @@ -393,7 +393,7 @@ class ChatCompletionRequest(BaseModel): chat_template_kwargs: Optional[Dict] = None # The request id. - rid: Optional[Union[List[str], str]] = None + rid: Optional[str] = None # For PD disaggregation bootstrap_host: Optional[str] = None @@ -469,6 +469,9 @@ class EmbeddingRequest(BaseModel): dimensions: int = None user: Optional[str] = None + # The request id. + rid: Optional[str] = None + class EmbeddingObject(BaseModel): embedding: List[float]