[OAI] Add rid tracing for v1/embeddings and fix rid type in Chat (#6397)

2025-05-18 13:05:38 -07:00
parent 6dc6b30637
commit 066cf44546
3 changed files with 8 additions and 3 deletions
--- a/python/sglang/srt/layers/attention/flashattention_backend.py
+++ b/python/sglang/srt/layers/attention/flashattention_backend.py
@@ -918,8 +918,8 @@ class FlashAttentionBackend(AttentionBackend):
            and local_attn_metadata is not None
            and (hasattr(layer, "use_irope") and layer.use_irope)
        )
-        
-        # When Spec Decode enabled, forward_decode would be called with two mode: 
+
+        # When Spec Decode is enabled, forward_decode is called in one of two modes:
        # 1. DRAFT_DECODE: we enable cascade attention when top_k > 1
        # 2. IDLE: we don’t need cascade attention, spec_info will be none in this case
        use_cascade_attn = forward_batch.spec_info is not None and self.topk > 1
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -1827,8 +1827,10 @@ def v1_embedding_request(all_requests, tokenizer_manager):
            )
        else:
            prompt_kwargs = {"input_ids": prompts}
+    request_ids = [req.rid for req in all_requests]

    adapted_request = EmbeddingReqInput(
+        rid=request_ids,
        **prompt_kwargs,
    )

--- a/python/sglang/srt/openai_api/protocol.py
+++ b/python/sglang/srt/openai_api/protocol.py
@@ -393,7 +393,7 @@ class ChatCompletionRequest(BaseModel):
    chat_template_kwargs: Optional[Dict] = None

    # The request id.
-    rid: Optional[Union[List[str], str]] = None
+    rid: Optional[str] = None

    # For PD disaggregation
    bootstrap_host: Optional[str] = None
@@ -469,6 +469,9 @@ class EmbeddingRequest(BaseModel):
    dimensions: int = None
    user: Optional[str] = None

+    # The request id.
+    rid: Optional[str] = None
+

 class EmbeddingObject(BaseModel):
    embedding: List[float]