Sync from v0.13
This commit is contained in:
311
examples/online_serving/streamlit_openai_chatbot_webserver.py
Normal file
311
examples/online_serving/streamlit_openai_chatbot_webserver.py
Normal file
@@ -0,0 +1,311 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
vLLM Chat Assistant - A Streamlit Web Interface
|
||||
|
||||
A streamlined chat interface that quickly integrates
|
||||
with vLLM API server.
|
||||
|
||||
Features:
|
||||
- Multiple chat sessions management
|
||||
- Streaming response display
|
||||
- Configurable API endpoint
|
||||
- Real-time chat history
|
||||
- Reasoning Display: Optional thinking process visualization
|
||||
|
||||
Requirements:
|
||||
pip install streamlit openai
|
||||
|
||||
Usage:
|
||||
# Start the app with default settings
|
||||
streamlit run streamlit_openai_chatbot_webserver.py
|
||||
|
||||
# Start with custom vLLM API endpoint
|
||||
VLLM_API_BASE="http://your-server:8000/v1" \
|
||||
streamlit run streamlit_openai_chatbot_webserver.py
|
||||
|
||||
# Enable debug mode
|
||||
streamlit run streamlit_openai_chatbot_webserver.py \
|
||||
--logger.level=debug
|
||||
"""
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
import streamlit as st
|
||||
from openai import OpenAI
|
||||
|
||||
# API connection settings come from environment variables (with sane defaults).
openai_api_key = os.getenv("VLLM_API_KEY", "EMPTY")
openai_api_base = os.getenv("VLLM_API_BASE", "http://localhost:8000/v1")

# Seed every session-state key this app relies on, once per browser session:
#   sessions        -> {session_id: message list} for all chats
#   current_session -> id of the chat whose history is loaded
#   messages        -> message list of the current chat
#   active_session  -> id highlighted in the sidebar
#   show_reasoning  -> {message index: reasoning text} for assistant turns
#   api_base_url    -> editable vLLM endpoint, initialised from the env var
_STATE_DEFAULTS = {
    "sessions": {},
    "current_session": None,
    "messages": [],
    "active_session": None,
    "show_reasoning": {},
    "api_base_url": openai_api_base,
}
for _key, _default in _STATE_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
def create_new_chat_session():
    """Start a fresh, empty chat session keyed by the current timestamp.

    The timestamp string doubles as the session's unique identifier. The
    new session becomes both the current and the active one, and the
    working message list is cleared.

    Session State Updates:
        - sessions: gains an empty message list under the new timestamp key
        - current_session / active_session: set to the new session id
        - messages: reset to an empty list

    Returns:
        None
    """
    state = st.session_state
    new_id = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    state.sessions[new_id] = []
    state.current_session = new_id
    state.active_session = new_id
    state.messages = []
def switch_to_chat_session(session_id):
    """Make *session_id* the active chat and load its message history.

    Args:
        session_id (str): Timestamp id of an existing entry in
            ``st.session_state.sessions``.

    Session State Updates:
        - current_session / active_session: set to ``session_id``
        - messages: rebound to ``sessions[session_id]``
    """
    state = st.session_state
    state.current_session = session_id
    state.active_session = session_id
    state.messages = state.sessions[session_id]
def get_llm_response(messages, model, reason, content_ph=None, reasoning_ph=None):
    """Stream a chat completion, optionally surfacing the reasoning trace.

    Args:
        messages (list): Conversation as dicts with 'role' and 'content'.
        model (str): Model identifier to request.
        reason (bool): Request and display the model's thinking process.
        content_ph (streamlit.empty): Live placeholder for the answer text.
        reasoning_ph (streamlit.empty): Live placeholder for the reasoning.

    Returns:
        tuple[str, str]: (complete answer text, complete reasoning text —
        empty string when reasoning is disabled or unavailable).

    Note:
        Uses Streamlit placeholders for live updates; when ``reason`` is
        True the thinking expander is rendered above the answer. API
        failures are reported via ``st.error`` and returned as an
        ``"Error: ..."`` string rather than raised.
    """
    answer_text = ""
    reasoning_text = ""
    reasoning_slot = None

    request = {"model": model, "messages": messages, "stream": True}
    if reason:
        request["extra_body"] = {"chat_template_kwargs": {"enable_thinking": True}}

    try:
        stream = client.chat.completions.create(**request)

        # Defensive: if the call somehow yields a bare string, show it as-is.
        if isinstance(stream, str):
            if content_ph:
                content_ph.markdown(stream)
            return stream, ""

        # Reasoning expander sits above the answer placeholder.
        if reason and reasoning_ph:
            expander = reasoning_ph.expander("💭 Thinking Process (live)", expanded=True)
            reasoning_slot = expander.empty()

        for chunk in stream:
            delta = chunk.choices[0].delta
            # Reasoning tokens first, with a cursor glyph while streaming.
            if reason and reasoning_slot and hasattr(delta, "reasoning"):
                piece = delta.reasoning
                if piece:
                    reasoning_text += piece
                    reasoning_slot.markdown(reasoning_text + "▌")
            # Then the answer tokens.
            if content_ph and hasattr(delta, "content") and delta.content:
                answer_text += delta.content
                content_ph.markdown(answer_text + "▌")

        # Re-render both without the cursor once the stream ends.
        if reason and reasoning_slot:
            reasoning_slot.markdown(reasoning_text)
        if content_ph:
            content_ph.markdown(answer_text)

        return answer_text, reasoning_text
    except Exception as e:
        st.error(f"Error details: {str(e)}")
        return f"Error: {str(e)}", ""
# Sidebar - API Settings first
st.sidebar.title("API Settings")
new_api_base = st.sidebar.text_input(
    "API Base URL:", value=st.session_state.api_base_url
)
# Persist an edited endpoint and rerun so the client below picks it up.
if new_api_base != st.session_state.api_base_url:
    st.session_state.api_base_url = new_api_base
    st.rerun()

st.sidebar.divider()

# Sidebar - Session Management
st.sidebar.title("Chat Sessions")
if st.sidebar.button("New Session"):
    create_new_chat_session()

# List sessions newest-first; the active one is pinned and highlighted.
for sid in sorted(st.session_state.sessions, reverse=True):
    is_active = sid == st.session_state.active_session
    button_kwargs = {
        "key": sid,
        "on_click": switch_to_chat_session,
        "args": (sid,),
    }
    if is_active:
        button_kwargs["type"] = "primary"
    label = f"📍 {sid}" if is_active else f"Session {sid}"
    st.sidebar.button(label, **button_kwargs)

# Main interface
st.title("vLLM Chat Assistant")

# Rebuilt on every rerun so it always reflects the sidebar's endpoint.
client = OpenAI(api_key=openai_api_key, base_url=st.session_state.api_base_url)

# Show the id of the first model the server advertises.
model = client.models.list().data[0].id
st.markdown(f"**Model**: {model}")

# Bootstrap the very first session on the initial run.
if st.session_state.current_session is None:
    create_new_chat_session()
    st.session_state.active_session = st.session_state.current_session

# Replay the chat history for the currently loaded session.
for idx, msg in enumerate(st.session_state.messages):
    if msg["role"] == "user":
        with st.chat_message("user"):
            st.write(msg["content"])
        continue
    # Assistant turn: stored reasoning (if any) is rendered above the answer.
    if idx in st.session_state.show_reasoning:
        with st.expander("💭 Thinking Process", expanded=False):
            st.markdown(st.session_state.show_reasoning[idx])
    with st.chat_message("assistant"):
        st.write(msg["content"])
# Probe once per session whether the served model emits reasoning output;
# the result is cached so the probe request is not repeated on every rerun.
@st.cache_data(show_spinner=False)
def server_supports_reasoning():
    """Check whether the current model returns a reasoning field.

    Sends a minimal non-streaming completion and inspects the reply.

    Returns:
        bool: True if the response message carries non-empty reasoning.
    """
    probe = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": "Hi"}],
        stream=False,
    )
    message = probe.choices[0].message
    # Equivalent to hasattr(...) and bool(...): missing or falsy -> False.
    return bool(getattr(message, "reasoning", None))
# Check support
supports_reasoning = server_supports_reasoning()

# Offer the reasoning toggle only when the model actually supports it.
reason = False
if supports_reasoning:
    reason = st.sidebar.checkbox("Enable Reasoning", value=False)
else:
    st.sidebar.markdown(
        "<span style='color:gray;'>Reasoning unavailable for this model.</span>",
        unsafe_allow_html=True,
    )

# Handle a newly submitted user prompt.
if prompt := st.chat_input("Type your message here..."):
    # Record and echo the user's message; keep the session dict in sync.
    st.session_state.messages.append({"role": "user", "content": prompt})
    st.session_state.sessions[st.session_state.current_session] = (
        st.session_state.messages
    )
    with st.chat_message("user"):
        st.write(prompt)

    # Send only role/content pairs to the model.
    msgs = [
        {"role": m["role"], "content": m["content"]}
        for m in st.session_state.messages
    ]

    # Stream the assistant reply with the reasoning placeholder above it.
    with st.chat_message("assistant"):
        reason_ph = st.empty()
        content_ph = st.empty()
        full, think = get_llm_response(msgs, model, reason, content_ph, reason_ph)
        # Index this assistant message will occupy once appended.
        message_index = len(st.session_state.messages)
        st.session_state.messages.append({"role": "assistant", "content": full})
        # NOTE(review): show_reasoning is keyed by message index only, so
        # indices can collide across different sessions — confirm intended.
        if reason and think:
            st.session_state.show_reasoning[message_index] = think
Reference in New Issue
Block a user