[OAI] Add rid tracing for v1/embeddings and fix rid type in Chat (#6397)
This commit is contained in:
@@ -918,8 +918,8 @@ class FlashAttentionBackend(AttentionBackend):
|
|||||||
and local_attn_metadata is not None
|
and local_attn_metadata is not None
|
||||||
and (hasattr(layer, "use_irope") and layer.use_irope)
|
and (hasattr(layer, "use_irope") and layer.use_irope)
|
||||||
)
|
)
|
||||||
|
|
||||||
# When spec decode is enabled, forward_decode is called in one of two modes:
|
# When spec decode is enabled, forward_decode is called in one of two modes:
|
||||||
# 1. DRAFT_DECODE: we enable cascade attention when top_k > 1
|
# 1. DRAFT_DECODE: we enable cascade attention when top_k > 1
|
||||||
# 2. IDLE: we don’t need cascade attention; spec_info will be None in this case
|
# 2. IDLE: we don’t need cascade attention; spec_info will be None in this case
|
||||||
use_cascade_attn = forward_batch.spec_info is not None and self.topk > 1
|
use_cascade_attn = forward_batch.spec_info is not None and self.topk > 1
|
||||||
|
|||||||
@@ -1827,8 +1827,10 @@ def v1_embedding_request(all_requests, tokenizer_manager):
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
prompt_kwargs = {"input_ids": prompts}
|
prompt_kwargs = {"input_ids": prompts}
|
||||||
|
request_ids = [req.request_id for req in all_requests]
|
||||||
|
|
||||||
adapted_request = EmbeddingReqInput(
|
adapted_request = EmbeddingReqInput(
|
||||||
|
rid=request_ids,
|
||||||
**prompt_kwargs,
|
**prompt_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -393,7 +393,7 @@ class ChatCompletionRequest(BaseModel):
|
|||||||
chat_template_kwargs: Optional[Dict] = None
|
chat_template_kwargs: Optional[Dict] = None
|
||||||
|
|
||||||
# The request id.
|
# The request id.
|
||||||
rid: Optional[Union[List[str], str]] = None
|
rid: Optional[str] = None
|
||||||
|
|
||||||
# For PD disaggregation
|
# For PD disaggregation
|
||||||
bootstrap_host: Optional[str] = None
|
bootstrap_host: Optional[str] = None
|
||||||
@@ -469,6 +469,9 @@ class EmbeddingRequest(BaseModel):
|
|||||||
dimensions: int = None
|
dimensions: int = None
|
||||||
user: Optional[str] = None
|
user: Optional[str] = None
|
||||||
|
|
||||||
|
# The request id.
|
||||||
|
rid: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
class EmbeddingObject(BaseModel):
|
class EmbeddingObject(BaseModel):
|
||||||
embedding: List[float]
|
embedding: List[float]
|
||||||
|
|||||||
Reference in New Issue
Block a user