Sync from v0.13
This commit is contained in:
311
examples/online_serving/streamlit_openai_chatbot_webserver.py
Normal file
311
examples/online_serving/streamlit_openai_chatbot_webserver.py
Normal file
@@ -0,0 +1,311 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
vLLM Chat Assistant - A Streamlit Web Interface
|
||||
|
||||
A streamlined chat interface that quickly integrates
|
||||
with vLLM API server.
|
||||
|
||||
Features:
|
||||
- Multiple chat sessions management
|
||||
- Streaming response display
|
||||
- Configurable API endpoint
|
||||
- Real-time chat history
|
||||
- Reasoning Display: Optional thinking process visualization
|
||||
|
||||
Requirements:
|
||||
pip install streamlit openai
|
||||
|
||||
Usage:
|
||||
# Start the app with default settings
|
||||
streamlit run streamlit_openai_chatbot_webserver.py
|
||||
|
||||
# Start with custom vLLM API endpoint
|
||||
VLLM_API_BASE="http://your-server:8000/v1" \
|
||||
streamlit run streamlit_openai_chatbot_webserver.py
|
||||
|
||||
# Enable debug mode
|
||||
streamlit run streamlit_openai_chatbot_webserver.py \
|
||||
--logger.level=debug
|
||||
"""
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
import streamlit as st
|
||||
from openai import OpenAI
|
||||
|
||||
# API connection settings come from environment variables (with sane defaults).
openai_api_key = os.getenv("VLLM_API_KEY", "EMPTY")
openai_api_base = os.getenv("VLLM_API_BASE", "http://localhost:8000/v1")

# Seed every session-state key this app relies on, once per browser session:
#   sessions        -> {session_id: message list} for all chats
#   current_session -> id of the chat whose history is loaded
#   messages        -> message list of the current chat
#   active_session  -> id highlighted in the sidebar
#   show_reasoning  -> {message index: reasoning text} for assistant turns
#   api_base_url    -> editable vLLM endpoint, initialised from the env var
_STATE_DEFAULTS = {
    "sessions": {},
    "current_session": None,
    "messages": [],
    "active_session": None,
    "show_reasoning": {},
    "api_base_url": openai_api_base,
}
for _key, _default in _STATE_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
def create_new_chat_session():
    """Start a fresh, empty chat session keyed by the current timestamp.

    The timestamp string doubles as the session's unique identifier. The
    new session becomes both the current and the active one, and the
    working message list is cleared.

    Session State Updates:
        - sessions: gains an empty message list under the new timestamp key
        - current_session / active_session: set to the new session id
        - messages: reset to an empty list

    Returns:
        None
    """
    state = st.session_state
    new_id = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    state.sessions[new_id] = []
    state.current_session = new_id
    state.active_session = new_id
    state.messages = []
def switch_to_chat_session(session_id):
    """Make *session_id* the active chat and load its message history.

    Args:
        session_id (str): Timestamp id of an existing entry in
            ``st.session_state.sessions``.

    Session State Updates:
        - current_session / active_session: set to ``session_id``
        - messages: rebound to ``sessions[session_id]``
    """
    state = st.session_state
    state.current_session = session_id
    state.active_session = session_id
    state.messages = state.sessions[session_id]
def get_llm_response(messages, model, reason, content_ph=None, reasoning_ph=None):
    """Stream a chat completion, optionally surfacing the reasoning trace.

    Args:
        messages (list): Conversation as dicts with 'role' and 'content'.
        model (str): Model identifier to request.
        reason (bool): Request and display the model's thinking process.
        content_ph (streamlit.empty): Live placeholder for the answer text.
        reasoning_ph (streamlit.empty): Live placeholder for the reasoning.

    Returns:
        tuple[str, str]: (complete answer text, complete reasoning text —
        empty string when reasoning is disabled or unavailable).

    Note:
        Uses Streamlit placeholders for live updates; when ``reason`` is
        True the thinking expander is rendered above the answer. API
        failures are reported via ``st.error`` and returned as an
        ``"Error: ..."`` string rather than raised.
    """
    answer_text = ""
    reasoning_text = ""
    reasoning_slot = None

    request = {"model": model, "messages": messages, "stream": True}
    if reason:
        request["extra_body"] = {"chat_template_kwargs": {"enable_thinking": True}}

    try:
        stream = client.chat.completions.create(**request)

        # Defensive: if the call somehow yields a bare string, show it as-is.
        if isinstance(stream, str):
            if content_ph:
                content_ph.markdown(stream)
            return stream, ""

        # Reasoning expander sits above the answer placeholder.
        if reason and reasoning_ph:
            expander = reasoning_ph.expander("💭 Thinking Process (live)", expanded=True)
            reasoning_slot = expander.empty()

        for chunk in stream:
            delta = chunk.choices[0].delta
            # Reasoning tokens first, with a cursor glyph while streaming.
            if reason and reasoning_slot and hasattr(delta, "reasoning"):
                piece = delta.reasoning
                if piece:
                    reasoning_text += piece
                    reasoning_slot.markdown(reasoning_text + "▌")
            # Then the answer tokens.
            if content_ph and hasattr(delta, "content") and delta.content:
                answer_text += delta.content
                content_ph.markdown(answer_text + "▌")

        # Re-render both without the cursor once the stream ends.
        if reason and reasoning_slot:
            reasoning_slot.markdown(reasoning_text)
        if content_ph:
            content_ph.markdown(answer_text)

        return answer_text, reasoning_text
    except Exception as e:
        st.error(f"Error details: {str(e)}")
        return f"Error: {str(e)}", ""
# Sidebar - API Settings first
st.sidebar.title("API Settings")
new_api_base = st.sidebar.text_input(
    "API Base URL:", value=st.session_state.api_base_url
)
# Persist an edited endpoint and rerun so the client below picks it up.
if new_api_base != st.session_state.api_base_url:
    st.session_state.api_base_url = new_api_base
    st.rerun()

st.sidebar.divider()

# Sidebar - Session Management
st.sidebar.title("Chat Sessions")
if st.sidebar.button("New Session"):
    create_new_chat_session()

# List sessions newest-first; the active one is pinned and highlighted.
for sid in sorted(st.session_state.sessions, reverse=True):
    is_active = sid == st.session_state.active_session
    button_kwargs = {
        "key": sid,
        "on_click": switch_to_chat_session,
        "args": (sid,),
    }
    if is_active:
        button_kwargs["type"] = "primary"
    label = f"📍 {sid}" if is_active else f"Session {sid}"
    st.sidebar.button(label, **button_kwargs)

# Main interface
st.title("vLLM Chat Assistant")

# Rebuilt on every rerun so it always reflects the sidebar's endpoint.
client = OpenAI(api_key=openai_api_key, base_url=st.session_state.api_base_url)

# Show the id of the first model the server advertises.
model = client.models.list().data[0].id
st.markdown(f"**Model**: {model}")

# Bootstrap the very first session on the initial run.
if st.session_state.current_session is None:
    create_new_chat_session()
    st.session_state.active_session = st.session_state.current_session

# Replay the chat history for the currently loaded session.
for idx, msg in enumerate(st.session_state.messages):
    if msg["role"] == "user":
        with st.chat_message("user"):
            st.write(msg["content"])
        continue
    # Assistant turn: stored reasoning (if any) is rendered above the answer.
    if idx in st.session_state.show_reasoning:
        with st.expander("💭 Thinking Process", expanded=False):
            st.markdown(st.session_state.show_reasoning[idx])
    with st.chat_message("assistant"):
        st.write(msg["content"])
# Probe once per session whether the served model emits reasoning output;
# the result is cached so the probe request is not repeated on every rerun.
@st.cache_data(show_spinner=False)
def server_supports_reasoning():
    """Check whether the current model returns a reasoning field.

    Sends a minimal non-streaming completion and inspects the reply.

    Returns:
        bool: True if the response message carries non-empty reasoning.
    """
    probe = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": "Hi"}],
        stream=False,
    )
    message = probe.choices[0].message
    # Equivalent to hasattr(...) and bool(...): missing or falsy -> False.
    return bool(getattr(message, "reasoning", None))
# Check support
supports_reasoning = server_supports_reasoning()

# Offer the reasoning toggle only when the model actually supports it.
reason = False
if supports_reasoning:
    reason = st.sidebar.checkbox("Enable Reasoning", value=False)
else:
    st.sidebar.markdown(
        "<span style='color:gray;'>Reasoning unavailable for this model.</span>",
        unsafe_allow_html=True,
    )

# Handle a newly submitted user prompt.
if prompt := st.chat_input("Type your message here..."):
    # Record and echo the user's message; keep the session dict in sync.
    st.session_state.messages.append({"role": "user", "content": prompt})
    st.session_state.sessions[st.session_state.current_session] = (
        st.session_state.messages
    )
    with st.chat_message("user"):
        st.write(prompt)

    # Send only role/content pairs to the model.
    msgs = [
        {"role": m["role"], "content": m["content"]}
        for m in st.session_state.messages
    ]

    # Stream the assistant reply with the reasoning placeholder above it.
    with st.chat_message("assistant"):
        reason_ph = st.empty()
        content_ph = st.empty()
        full, think = get_llm_response(msgs, model, reason, content_ph, reason_ph)
        # Index this assistant message will occupy once appended.
        message_index = len(st.session_state.messages)
        st.session_state.messages.append({"role": "assistant", "content": full})
        # NOTE(review): show_reasoning is keyed by message index only, so
        # indices can collide across different sessions — confirm intended.
        if reason and think:
            st.session_state.show_reasoning[message_index] = think
Reference in New Issue
Block a user