SvelteKit-based WebUI (#14839)
This commit is contained in:
committed by
GitHub
parent
8f8f2274ee
commit
a7a98e0fff
715
tools/server/webui/src/lib/services/chat.ts
Normal file
715
tools/server/webui/src/lib/services/chat.ts
Normal file
@@ -0,0 +1,715 @@
|
||||
import { config } from '$lib/stores/settings.svelte';
|
||||
import { slotsService } from './slots';
|
||||
/**
|
||||
* ChatService - Low-level API communication layer for llama.cpp server interactions
|
||||
*
|
||||
* This service handles direct communication with the llama.cpp server's chat completion API.
|
||||
* It provides the network layer abstraction for AI model interactions while remaining
|
||||
* stateless and focused purely on API communication.
|
||||
*
|
||||
* **Architecture & Relationship with ChatStore:**
|
||||
* - **ChatService** (this class): Stateless API communication layer
|
||||
* - Handles HTTP requests/responses with llama.cpp server
|
||||
* - Manages streaming and non-streaming response parsing
|
||||
* - Provides request abortion capabilities
|
||||
* - Converts database messages to API format
|
||||
* - Handles error translation and context detection
|
||||
*
|
||||
* - **ChatStore**: Stateful orchestration and UI state management
|
||||
* - Uses ChatService for all AI model communication
|
||||
* - Manages conversation state, message history, and UI reactivity
|
||||
* - Coordinates with DatabaseStore for persistence
|
||||
* - Handles complex workflows like branching and regeneration
|
||||
*
|
||||
* **Key Responsibilities:**
|
||||
* - Message format conversion (DatabaseMessage → API format)
|
||||
* - Streaming response handling with real-time callbacks
|
||||
* - Reasoning content extraction and processing
|
||||
* - File attachment processing (images, PDFs, audio, text)
|
||||
* - Context error detection and reporting
|
||||
* - Request lifecycle management (abort, cleanup)
|
||||
*/
|
||||
export class ChatService {
|
||||
private abortController: AbortController | null = null;
|
||||
|
||||
/**
|
||||
* Sends a chat completion request to the llama.cpp server.
|
||||
* Supports both streaming and non-streaming responses with comprehensive parameter configuration.
|
||||
* Automatically converts database messages with attachments to the appropriate API format.
|
||||
*
|
||||
* @param messages - Array of chat messages to send to the API (supports both ApiChatMessageData and DatabaseMessage with attachments)
|
||||
* @param options - Configuration options for the chat completion request. See `SettingsChatServiceOptions` type for details.
|
||||
* @returns {Promise<string | void>} that resolves to the complete response string (non-streaming) or void (streaming)
|
||||
* @throws {Error} if the request fails or is aborted
|
||||
*/
|
||||
async sendMessage(
|
||||
messages: ApiChatMessageData[] | (DatabaseMessage & { extra?: DatabaseMessageExtra[] })[],
|
||||
options: SettingsChatServiceOptions = {}
|
||||
): Promise<string | void> {
|
||||
const {
|
||||
stream,
|
||||
onChunk,
|
||||
onComplete,
|
||||
onError,
|
||||
// Generation parameters
|
||||
temperature,
|
||||
max_tokens,
|
||||
// Sampling parameters
|
||||
dynatemp_range,
|
||||
dynatemp_exponent,
|
||||
top_k,
|
||||
top_p,
|
||||
min_p,
|
||||
xtc_probability,
|
||||
xtc_threshold,
|
||||
typ_p,
|
||||
// Penalty parameters
|
||||
repeat_last_n,
|
||||
repeat_penalty,
|
||||
presence_penalty,
|
||||
frequency_penalty,
|
||||
dry_multiplier,
|
||||
dry_base,
|
||||
dry_allowed_length,
|
||||
dry_penalty_last_n,
|
||||
// Other parameters
|
||||
samplers,
|
||||
custom,
|
||||
timings_per_token
|
||||
} = options;
|
||||
|
||||
// Cancel any ongoing request and create a new abort controller
|
||||
this.abort();
|
||||
this.abortController = new AbortController();
|
||||
|
||||
// Convert database messages with attachments to API format if needed
|
||||
const normalizedMessages: ApiChatMessageData[] = messages
|
||||
.map((msg) => {
|
||||
// Check if this is a DatabaseMessage by checking for DatabaseMessage-specific fields
|
||||
if ('id' in msg && 'convId' in msg && 'timestamp' in msg) {
|
||||
// This is a DatabaseMessage, convert it
|
||||
const dbMsg = msg as DatabaseMessage & { extra?: DatabaseMessageExtra[] };
|
||||
return ChatService.convertMessageToChatServiceData(dbMsg);
|
||||
} else {
|
||||
// This is already an ApiChatMessageData object
|
||||
return msg as ApiChatMessageData;
|
||||
}
|
||||
})
|
||||
.filter((msg) => {
|
||||
// Filter out empty system messages
|
||||
if (msg.role === 'system') {
|
||||
const content = typeof msg.content === 'string' ? msg.content : '';
|
||||
|
||||
return content.trim().length > 0;
|
||||
}
|
||||
|
||||
return true;
|
||||
});
|
||||
|
||||
// Build base request body with system message injection
|
||||
const processedMessages = this.injectSystemMessage(normalizedMessages);
|
||||
|
||||
const requestBody: ApiChatCompletionRequest = {
|
||||
messages: processedMessages.map((msg: ApiChatMessageData) => ({
|
||||
role: msg.role,
|
||||
content: msg.content
|
||||
})),
|
||||
stream
|
||||
};
|
||||
|
||||
requestBody.reasoning_format = 'auto';
|
||||
|
||||
if (temperature !== undefined) requestBody.temperature = temperature;
|
||||
// Set max_tokens to -1 (infinite) if not provided or empty
|
||||
requestBody.max_tokens =
|
||||
max_tokens !== undefined && max_tokens !== null && max_tokens !== 0 ? max_tokens : -1;
|
||||
|
||||
if (dynatemp_range !== undefined) requestBody.dynatemp_range = dynatemp_range;
|
||||
if (dynatemp_exponent !== undefined) requestBody.dynatemp_exponent = dynatemp_exponent;
|
||||
if (top_k !== undefined) requestBody.top_k = top_k;
|
||||
if (top_p !== undefined) requestBody.top_p = top_p;
|
||||
if (min_p !== undefined) requestBody.min_p = min_p;
|
||||
if (xtc_probability !== undefined) requestBody.xtc_probability = xtc_probability;
|
||||
if (xtc_threshold !== undefined) requestBody.xtc_threshold = xtc_threshold;
|
||||
if (typ_p !== undefined) requestBody.typ_p = typ_p;
|
||||
|
||||
if (repeat_last_n !== undefined) requestBody.repeat_last_n = repeat_last_n;
|
||||
if (repeat_penalty !== undefined) requestBody.repeat_penalty = repeat_penalty;
|
||||
if (presence_penalty !== undefined) requestBody.presence_penalty = presence_penalty;
|
||||
if (frequency_penalty !== undefined) requestBody.frequency_penalty = frequency_penalty;
|
||||
if (dry_multiplier !== undefined) requestBody.dry_multiplier = dry_multiplier;
|
||||
if (dry_base !== undefined) requestBody.dry_base = dry_base;
|
||||
if (dry_allowed_length !== undefined) requestBody.dry_allowed_length = dry_allowed_length;
|
||||
if (dry_penalty_last_n !== undefined) requestBody.dry_penalty_last_n = dry_penalty_last_n;
|
||||
|
||||
if (samplers !== undefined) {
|
||||
requestBody.samplers =
|
||||
typeof samplers === 'string'
|
||||
? samplers.split(';').filter((s: string) => s.trim())
|
||||
: samplers;
|
||||
}
|
||||
|
||||
if (timings_per_token !== undefined) requestBody.timings_per_token = timings_per_token;
|
||||
|
||||
if (custom) {
|
||||
try {
|
||||
const customParams = typeof custom === 'string' ? JSON.parse(custom) : custom;
|
||||
Object.assign(requestBody, customParams);
|
||||
} catch (error) {
|
||||
console.warn('Failed to parse custom parameters:', error);
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
const currentConfig = config();
|
||||
const apiKey = currentConfig.apiKey?.toString().trim();
|
||||
|
||||
const response = await fetch(`/v1/chat/completions`, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
...(apiKey ? { Authorization: `Bearer ${apiKey}` } : {})
|
||||
},
|
||||
body: JSON.stringify(requestBody),
|
||||
signal: this.abortController.signal
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
// Use the new parseErrorResponse method to handle structured errors
|
||||
const error = await this.parseErrorResponse(response);
|
||||
if (onError) {
|
||||
onError(error);
|
||||
}
|
||||
throw error;
|
||||
}
|
||||
|
||||
if (stream) {
|
||||
return this.handleStreamResponse(
|
||||
response,
|
||||
onChunk,
|
||||
onComplete,
|
||||
onError,
|
||||
options.onReasoningChunk
|
||||
);
|
||||
} else {
|
||||
return this.handleNonStreamResponse(response, onComplete, onError);
|
||||
}
|
||||
} catch (error) {
|
||||
if (error instanceof Error && error.name === 'AbortError') {
|
||||
console.log('Chat completion request was aborted');
|
||||
return;
|
||||
}
|
||||
|
||||
let userFriendlyError: Error;
|
||||
|
||||
if (error instanceof Error) {
|
||||
if (error.name === 'TypeError' && error.message.includes('fetch')) {
|
||||
userFriendlyError = new Error(
|
||||
'Unable to connect to server - please check if the server is running'
|
||||
);
|
||||
} else if (error.message.includes('ECONNREFUSED')) {
|
||||
userFriendlyError = new Error('Connection refused - server may be offline');
|
||||
} else if (error.message.includes('ETIMEDOUT')) {
|
||||
userFriendlyError = new Error('Request timeout - server may be overloaded');
|
||||
} else {
|
||||
userFriendlyError = error;
|
||||
}
|
||||
} else {
|
||||
userFriendlyError = new Error('Unknown error occurred while sending message');
|
||||
}
|
||||
|
||||
console.error('Error in sendMessage:', error);
|
||||
if (onError) {
|
||||
onError(userFriendlyError);
|
||||
}
|
||||
throw userFriendlyError;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handles streaming response from the chat completion API.
|
||||
* Processes server-sent events and extracts content chunks from the stream.
|
||||
*
|
||||
* @param response - The fetch Response object containing the streaming data
|
||||
* @param onChunk - Optional callback invoked for each content chunk received
|
||||
* @param onComplete - Optional callback invoked when the stream is complete with full response
|
||||
* @param onError - Optional callback invoked if an error occurs during streaming
|
||||
* @param onReasoningChunk - Optional callback invoked for each reasoning content chunk
|
||||
* @returns {Promise<void>} Promise that resolves when streaming is complete
|
||||
* @throws {Error} if the stream cannot be read or parsed
|
||||
*/
|
||||
private async handleStreamResponse(
|
||||
response: Response,
|
||||
onChunk?: (chunk: string) => void,
|
||||
onComplete?: (
|
||||
response: string,
|
||||
reasoningContent?: string,
|
||||
timings?: ChatMessageTimings
|
||||
) => void,
|
||||
onError?: (error: Error) => void,
|
||||
onReasoningChunk?: (chunk: string) => void
|
||||
): Promise<void> {
|
||||
const reader = response.body?.getReader();
|
||||
|
||||
if (!reader) {
|
||||
throw new Error('No response body');
|
||||
}
|
||||
|
||||
const decoder = new TextDecoder();
|
||||
let fullResponse = '';
|
||||
let fullReasoningContent = '';
|
||||
let regularContent = '';
|
||||
let insideThinkTag = false;
|
||||
let hasReceivedData = false;
|
||||
let lastTimings: ChatMessageTimings | undefined;
|
||||
|
||||
try {
|
||||
while (true) {
|
||||
const { done, value } = await reader.read();
|
||||
if (done) break;
|
||||
|
||||
const chunk = decoder.decode(value, { stream: true });
|
||||
const lines = chunk.split('\n');
|
||||
|
||||
for (const line of lines) {
|
||||
if (line.startsWith('data: ')) {
|
||||
const data = line.slice(6);
|
||||
if (data === '[DONE]') {
|
||||
if (!hasReceivedData && fullResponse.length === 0) {
|
||||
const contextError = new Error(
|
||||
'The request exceeds the available context size. Try increasing the context size or enable context shift.'
|
||||
);
|
||||
contextError.name = 'ContextError';
|
||||
onError?.(contextError);
|
||||
return;
|
||||
}
|
||||
|
||||
onComplete?.(regularContent, fullReasoningContent || undefined, lastTimings);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const parsed: ApiChatCompletionStreamChunk = JSON.parse(data);
|
||||
|
||||
const content = parsed.choices[0]?.delta?.content;
|
||||
const reasoningContent = parsed.choices[0]?.delta?.reasoning_content;
|
||||
const timings = parsed.timings;
|
||||
const promptProgress = parsed.prompt_progress;
|
||||
|
||||
if (timings || promptProgress) {
|
||||
this.updateProcessingState(timings, promptProgress);
|
||||
|
||||
// Store the latest timing data
|
||||
if (timings) {
|
||||
lastTimings = timings;
|
||||
}
|
||||
}
|
||||
|
||||
if (content) {
|
||||
hasReceivedData = true;
|
||||
fullResponse += content;
|
||||
|
||||
// Track the regular content before processing this chunk
|
||||
const regularContentBefore = regularContent;
|
||||
|
||||
// Process content character by character to handle think tags
|
||||
insideThinkTag = this.processContentForThinkTags(
|
||||
content,
|
||||
insideThinkTag,
|
||||
() => {
|
||||
// Think content is ignored - we don't include it in API requests
|
||||
},
|
||||
(regularChunk) => {
|
||||
regularContent += regularChunk;
|
||||
}
|
||||
);
|
||||
|
||||
const newRegularContent = regularContent.slice(regularContentBefore.length);
|
||||
if (newRegularContent) {
|
||||
onChunk?.(newRegularContent);
|
||||
}
|
||||
}
|
||||
|
||||
if (reasoningContent) {
|
||||
hasReceivedData = true;
|
||||
fullReasoningContent += reasoningContent;
|
||||
onReasoningChunk?.(reasoningContent);
|
||||
}
|
||||
} catch (e) {
|
||||
console.error('Error parsing JSON chunk:', e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasReceivedData && fullResponse.length === 0) {
|
||||
const contextError = new Error(
|
||||
'The request exceeds the available context size. Try increasing the context size or enable context shift.'
|
||||
);
|
||||
contextError.name = 'ContextError';
|
||||
onError?.(contextError);
|
||||
return;
|
||||
}
|
||||
} catch (error) {
|
||||
const err = error instanceof Error ? error : new Error('Stream error');
|
||||
|
||||
onError?.(err);
|
||||
|
||||
throw err;
|
||||
} finally {
|
||||
reader.releaseLock();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handles non-streaming response from the chat completion API.
|
||||
* Parses the JSON response and extracts the generated content.
|
||||
*
|
||||
* @param response - The fetch Response object containing the JSON data
|
||||
* @param onComplete - Optional callback invoked when response is successfully parsed
|
||||
* @param onError - Optional callback invoked if an error occurs during parsing
|
||||
* @returns {Promise<string>} Promise that resolves to the generated content string
|
||||
* @throws {Error} if the response cannot be parsed or is malformed
|
||||
*/
|
||||
private async handleNonStreamResponse(
|
||||
response: Response,
|
||||
onComplete?: (
|
||||
response: string,
|
||||
reasoningContent?: string,
|
||||
timings?: ChatMessageTimings
|
||||
) => void,
|
||||
onError?: (error: Error) => void
|
||||
): Promise<string> {
|
||||
try {
|
||||
const responseText = await response.text();
|
||||
|
||||
if (!responseText.trim()) {
|
||||
const contextError = new Error(
|
||||
'The request exceeds the available context size. Try increasing the context size or enable context shift.'
|
||||
);
|
||||
contextError.name = 'ContextError';
|
||||
onError?.(contextError);
|
||||
throw contextError;
|
||||
}
|
||||
|
||||
const data: ApiChatCompletionResponse = JSON.parse(responseText);
|
||||
const content = data.choices[0]?.message?.content || '';
|
||||
const reasoningContent = data.choices[0]?.message?.reasoning_content;
|
||||
|
||||
if (reasoningContent) {
|
||||
console.log('Full reasoning content:', reasoningContent);
|
||||
}
|
||||
|
||||
if (!content.trim()) {
|
||||
const contextError = new Error(
|
||||
'The request exceeds the available context size. Try increasing the context size or enable context shift.'
|
||||
);
|
||||
contextError.name = 'ContextError';
|
||||
onError?.(contextError);
|
||||
throw contextError;
|
||||
}
|
||||
|
||||
onComplete?.(content, reasoningContent);
|
||||
|
||||
return content;
|
||||
} catch (error) {
|
||||
if (error instanceof Error && error.name === 'ContextError') {
|
||||
throw error;
|
||||
}
|
||||
|
||||
const err = error instanceof Error ? error : new Error('Parse error');
|
||||
|
||||
onError?.(err);
|
||||
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a database message with attachments to API chat message format.
|
||||
* Processes various attachment types (images, text files, PDFs) and formats them
|
||||
* as content parts suitable for the chat completion API.
|
||||
*
|
||||
* @param message - Database message object with optional extra attachments
|
||||
* @param message.content - The text content of the message
|
||||
* @param message.role - The role of the message sender (user, assistant, system)
|
||||
* @param message.extra - Optional array of message attachments (images, files, etc.)
|
||||
* @returns {ApiChatMessageData} object formatted for the chat completion API
|
||||
* @static
|
||||
*/
|
||||
static convertMessageToChatServiceData(
|
||||
message: DatabaseMessage & { extra?: DatabaseMessageExtra[] }
|
||||
): ApiChatMessageData {
|
||||
if (!message.extra || message.extra.length === 0) {
|
||||
return {
|
||||
role: message.role as 'user' | 'assistant' | 'system',
|
||||
content: message.content
|
||||
};
|
||||
}
|
||||
|
||||
const contentParts: ApiChatMessageContentPart[] = [];
|
||||
|
||||
if (message.content) {
|
||||
contentParts.push({
|
||||
type: 'text',
|
||||
text: message.content
|
||||
});
|
||||
}
|
||||
|
||||
const imageFiles = message.extra.filter(
|
||||
(extra: DatabaseMessageExtra): extra is DatabaseMessageExtraImageFile =>
|
||||
extra.type === 'imageFile'
|
||||
);
|
||||
|
||||
for (const image of imageFiles) {
|
||||
contentParts.push({
|
||||
type: 'image_url',
|
||||
image_url: { url: image.base64Url }
|
||||
});
|
||||
}
|
||||
|
||||
const textFiles = message.extra.filter(
|
||||
(extra: DatabaseMessageExtra): extra is DatabaseMessageExtraTextFile =>
|
||||
extra.type === 'textFile'
|
||||
);
|
||||
|
||||
for (const textFile of textFiles) {
|
||||
contentParts.push({
|
||||
type: 'text',
|
||||
text: `\n\n--- File: ${textFile.name} ---\n${textFile.content}`
|
||||
});
|
||||
}
|
||||
|
||||
const audioFiles = message.extra.filter(
|
||||
(extra: DatabaseMessageExtra): extra is DatabaseMessageExtraAudioFile =>
|
||||
extra.type === 'audioFile'
|
||||
);
|
||||
|
||||
for (const audio of audioFiles) {
|
||||
contentParts.push({
|
||||
type: 'input_audio',
|
||||
input_audio: {
|
||||
data: audio.base64Data,
|
||||
format: audio.mimeType.includes('wav') ? 'wav' : 'mp3'
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
const pdfFiles = message.extra.filter(
|
||||
(extra: DatabaseMessageExtra): extra is DatabaseMessageExtraPdfFile =>
|
||||
extra.type === 'pdfFile'
|
||||
);
|
||||
|
||||
for (const pdfFile of pdfFiles) {
|
||||
if (pdfFile.processedAsImages && pdfFile.images) {
|
||||
for (let i = 0; i < pdfFile.images.length; i++) {
|
||||
contentParts.push({
|
||||
type: 'image_url',
|
||||
image_url: { url: pdfFile.images[i] }
|
||||
});
|
||||
}
|
||||
} else {
|
||||
contentParts.push({
|
||||
type: 'text',
|
||||
text: `\n\n--- PDF File: ${pdfFile.name} ---\n${pdfFile.content}`
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
role: message.role as 'user' | 'assistant' | 'system',
|
||||
content: contentParts
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Get server properties - static method for API compatibility
|
||||
*/
|
||||
static async getServerProps(): Promise<ApiLlamaCppServerProps> {
|
||||
try {
|
||||
const currentConfig = config();
|
||||
const apiKey = currentConfig.apiKey?.toString().trim();
|
||||
|
||||
const response = await fetch(`/props`, {
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
...(apiKey ? { Authorization: `Bearer ${apiKey}` } : {})
|
||||
}
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`Failed to fetch server props: ${response.status}`);
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
return data;
|
||||
} catch (error) {
|
||||
console.error('Error fetching server props:', error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Processes content to separate thinking tags from regular content.
|
||||
* Parses <think> and </think> tags to route content to appropriate handlers.
|
||||
*
|
||||
* @param content - The content string to process
|
||||
* @param currentInsideThinkTag - Current state of whether we're inside a think tag
|
||||
* @param addThinkContent - Callback to handle content inside think tags
|
||||
* @param addRegularContent - Callback to handle regular content outside think tags
|
||||
* @returns Boolean indicating if we're still inside a think tag after processing
|
||||
* @private
|
||||
*/
|
||||
private processContentForThinkTags(
|
||||
content: string,
|
||||
currentInsideThinkTag: boolean,
|
||||
addThinkContent: (chunk: string) => void,
|
||||
addRegularContent: (chunk: string) => void
|
||||
): boolean {
|
||||
let i = 0;
|
||||
let insideThinkTag = currentInsideThinkTag;
|
||||
|
||||
while (i < content.length) {
|
||||
if (!insideThinkTag && content.substring(i, i + 7) === '<think>') {
|
||||
insideThinkTag = true;
|
||||
i += 7; // Skip the <think> tag
|
||||
continue;
|
||||
}
|
||||
|
||||
if (insideThinkTag && content.substring(i, i + 8) === '</think>') {
|
||||
insideThinkTag = false;
|
||||
i += 8; // Skip the </think> tag
|
||||
continue;
|
||||
}
|
||||
|
||||
if (insideThinkTag) {
|
||||
addThinkContent(content[i]);
|
||||
} else {
|
||||
addRegularContent(content[i]);
|
||||
}
|
||||
|
||||
i++;
|
||||
}
|
||||
|
||||
return insideThinkTag;
|
||||
}
|
||||
|
||||
/**
|
||||
* Aborts any ongoing chat completion request.
|
||||
* Cancels the current request and cleans up the abort controller.
|
||||
*
|
||||
* @public
|
||||
*/
|
||||
public abort(): void {
|
||||
if (this.abortController) {
|
||||
this.abortController.abort();
|
||||
this.abortController = null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Injects a system message at the beginning of the conversation if configured in settings.
|
||||
* Checks for existing system messages to avoid duplication and retrieves the system message
|
||||
* from the current configuration settings.
|
||||
*
|
||||
* @param messages - Array of chat messages to process
|
||||
* @returns Array of messages with system message injected at the beginning if configured
|
||||
* @private
|
||||
*/
|
||||
private injectSystemMessage(messages: ApiChatMessageData[]): ApiChatMessageData[] {
|
||||
const currentConfig = config();
|
||||
const systemMessage = currentConfig.systemMessage?.toString().trim();
|
||||
|
||||
if (!systemMessage) {
|
||||
return messages;
|
||||
}
|
||||
|
||||
if (messages.length > 0 && messages[0].role === 'system') {
|
||||
if (messages[0].content !== systemMessage) {
|
||||
const updatedMessages = [...messages];
|
||||
updatedMessages[0] = {
|
||||
role: 'system',
|
||||
content: systemMessage
|
||||
};
|
||||
return updatedMessages;
|
||||
}
|
||||
|
||||
return messages;
|
||||
}
|
||||
|
||||
const systemMsg: ApiChatMessageData = {
|
||||
role: 'system',
|
||||
content: systemMessage
|
||||
};
|
||||
|
||||
return [systemMsg, ...messages];
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses error response and creates appropriate error with context information
|
||||
* @param response - HTTP response object
|
||||
* @returns Promise<Error> - Parsed error with context info if available
|
||||
*/
|
||||
private async parseErrorResponse(response: Response): Promise<Error> {
|
||||
try {
|
||||
const errorText = await response.text();
|
||||
const errorData: ApiErrorResponse = JSON.parse(errorText);
|
||||
|
||||
if (errorData.error?.type === 'exceed_context_size_error') {
|
||||
const contextError = errorData.error as ApiContextSizeError;
|
||||
const error = new Error(contextError.message);
|
||||
error.name = 'ContextError';
|
||||
// Attach structured context information
|
||||
(
|
||||
error as Error & {
|
||||
contextInfo?: { promptTokens: number; maxContext: number; estimatedTokens: number };
|
||||
}
|
||||
).contextInfo = {
|
||||
promptTokens: contextError.n_prompt_tokens,
|
||||
maxContext: contextError.n_ctx,
|
||||
estimatedTokens: contextError.n_prompt_tokens
|
||||
};
|
||||
return error;
|
||||
}
|
||||
|
||||
// Fallback for other error types
|
||||
const message = errorData.error?.message || 'Unknown server error';
|
||||
return new Error(message);
|
||||
} catch {
|
||||
// If we can't parse the error response, return a generic error
|
||||
return new Error(`Server error (${response.status}): ${response.statusText}`);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates the processing state with timing information from the server response
|
||||
* @param timings - Timing data from the API response
|
||||
* @param promptProgress - Progress data from the API response
|
||||
*/
|
||||
private updateProcessingState(
|
||||
timings?: ChatMessageTimings,
|
||||
promptProgress?: ChatMessagePromptProgress
|
||||
): void {
|
||||
// Calculate tokens per second from timing data
|
||||
const tokensPerSecond =
|
||||
timings?.predicted_ms && timings?.predicted_n
|
||||
? (timings.predicted_n / timings.predicted_ms) * 1000
|
||||
: 0;
|
||||
|
||||
// Update slots service with timing data (async but don't wait)
|
||||
slotsService
|
||||
.updateFromTimingData({
|
||||
prompt_n: timings?.prompt_n || 0,
|
||||
predicted_n: timings?.predicted_n || 0,
|
||||
predicted_per_second: tokensPerSecond,
|
||||
cache_n: timings?.cache_n || 0,
|
||||
prompt_progress: promptProgress
|
||||
})
|
||||
.catch((error) => {
|
||||
console.warn('Failed to update processing state:', error);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/** Shared singleton instance of ChatService used across the WebUI. */
export const chatService = new ChatService();
|
||||
102
tools/server/webui/src/lib/services/context.ts
Normal file
102
tools/server/webui/src/lib/services/context.ts
Normal file
@@ -0,0 +1,102 @@
|
||||
import { slotsService } from './slots';
|
||||
|
||||
/**
 * Result of a context-limit check based on live slot data.
 */
export interface ContextCheckResult {
	// True when no tokens remain after accounting for the reserve
	wouldExceed: boolean;
	// Tokens currently used in the context window
	currentUsage: number;
	// Total context window size in tokens
	maxContext: number;
	// Tokens still available for new content (clamped to >= 0)
	availableTokens: number;
	// Tokens reserved for the model's response
	reservedTokens: number;
}
|
||||
|
||||
/**
|
||||
* ContextService - Context window management and limit checking
|
||||
*
|
||||
* This service provides context window monitoring and limit checking using real-time
|
||||
* server data from the slots service. It helps prevent context overflow by tracking
|
||||
* current usage and calculating available space for new content.
|
||||
*
|
||||
* **Architecture & Relationships:**
|
||||
* - **ContextService** (this class): Context limit monitoring
|
||||
* - Uses SlotsService for real-time context usage data
|
||||
* - Calculates available tokens with configurable reserves
|
||||
* - Provides context limit checking and error messaging
|
||||
* - Helps prevent context window overflow
|
||||
*
|
||||
* - **SlotsService**: Provides current context usage from server slots
|
||||
* - **ChatStore**: Uses context checking before sending messages
|
||||
* - **UI Components**: Display context usage warnings and limits
|
||||
*
|
||||
* **Key Features:**
|
||||
* - **Real-time Context Checking**: Uses live server data for accuracy
|
||||
* - **Token Reservation**: Reserves tokens for response generation
|
||||
* - **Limit Detection**: Prevents context window overflow
|
||||
* - **Usage Reporting**: Detailed context usage statistics
|
||||
* - **Error Messaging**: User-friendly context limit messages
|
||||
* - **Configurable Reserves**: Adjustable token reservation for responses
|
||||
*
|
||||
* **Context Management:**
|
||||
* - Monitors current context usage from active slots
|
||||
* - Calculates available space considering reserved tokens
|
||||
* - Provides early warning before context limits are reached
|
||||
* - Helps optimize conversation length and content
|
||||
*/
|
||||
export class ContextService {
|
||||
private reserveTokens: number;
|
||||
|
||||
constructor(reserveTokens = 512) {
|
||||
this.reserveTokens = reserveTokens;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the context limit would be exceeded
|
||||
*
|
||||
* @returns {Promise<ContextCheckResult | null>} Promise that resolves to the context check result or null if an error occurs
|
||||
*/
|
||||
async checkContextLimit(): Promise<ContextCheckResult | null> {
|
||||
try {
|
||||
const currentState = await slotsService.getCurrentState();
|
||||
|
||||
if (!currentState) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const maxContext = currentState.contextTotal;
|
||||
const currentUsage = currentState.contextUsed;
|
||||
const availableTokens = maxContext - currentUsage - this.reserveTokens;
|
||||
const wouldExceed = availableTokens <= 0;
|
||||
|
||||
return {
|
||||
wouldExceed,
|
||||
currentUsage,
|
||||
maxContext,
|
||||
availableTokens: Math.max(0, availableTokens),
|
||||
reservedTokens: this.reserveTokens
|
||||
};
|
||||
} catch (error) {
|
||||
console.warn('Error checking context limit:', error);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a formatted error message for context limit exceeded
|
||||
*
|
||||
* @param {ContextCheckResult} result - Context check result
|
||||
* @returns {string} Formatted error message
|
||||
*/
|
||||
getContextErrorMessage(result: ContextCheckResult): string {
|
||||
const usagePercent = Math.round((result.currentUsage / result.maxContext) * 100);
|
||||
return `Context window is nearly full. Current usage: ${result.currentUsage.toLocaleString()}/${result.maxContext.toLocaleString()} tokens (${usagePercent}%). Available space: ${result.availableTokens.toLocaleString()} tokens (${result.reservedTokens} reserved for response).`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the number of tokens to reserve for response generation
|
||||
*
|
||||
* @param {number} tokens - Number of tokens to reserve
|
||||
*/
|
||||
setReserveTokens(tokens: number): void {
|
||||
this.reserveTokens = tokens;
|
||||
}
|
||||
}
|
||||
|
||||
/** Shared singleton instance of ContextService used across the WebUI. */
export const contextService = new ContextService();
|
||||
3
tools/server/webui/src/lib/services/index.ts
Normal file
3
tools/server/webui/src/lib/services/index.ts
Normal file
@@ -0,0 +1,3 @@
|
||||
export { chatService } from './chat';
|
||||
export { contextService } from './context';
|
||||
export { slotsService } from './slots';
|
||||
254
tools/server/webui/src/lib/services/slots.ts
Normal file
254
tools/server/webui/src/lib/services/slots.ts
Normal file
@@ -0,0 +1,254 @@
|
||||
import { config } from '$lib/stores/settings.svelte';
|
||||
|
||||
/**
|
||||
* SlotsService - Real-time processing state monitoring and token rate calculation
|
||||
*
|
||||
* This service provides real-time information about generation progress, token rates,
|
||||
* and context usage based on timing data from ChatService streaming responses.
|
||||
* It manages streaming session tracking and provides accurate processing state updates.
|
||||
*
|
||||
* **Architecture & Relationships:**
|
||||
* - **SlotsService** (this class): Processing state monitoring
|
||||
* - Receives timing data from ChatService streaming responses
|
||||
* - Calculates token generation rates and context usage
|
||||
* - Manages streaming session lifecycle
|
||||
* - Provides real-time updates to UI components
|
||||
*
|
||||
* - **ChatService**: Provides timing data from `/chat/completions` streaming
|
||||
* - **UI Components**: Subscribe to processing state for progress indicators
|
||||
*
|
||||
* **Key Features:**
|
||||
* - **Real-time Monitoring**: Live processing state during generation
|
||||
* - **Token Rate Calculation**: Accurate tokens/second from timing data
|
||||
* - **Context Tracking**: Current context usage and remaining capacity
|
||||
* - **Streaming Lifecycle**: Start/stop tracking for streaming sessions
|
||||
* - **Timing Data Processing**: Converts streaming timing data to structured state
|
||||
* - **Error Handling**: Graceful handling when timing data is unavailable
|
||||
*
|
||||
* **Processing States:**
|
||||
* - `idle`: No active processing
|
||||
* - `generating`: Actively generating tokens
|
||||
*
|
||||
* **Token Rate Calculation:**
|
||||
* Uses timing data from `/chat/completions` streaming response for accurate
|
||||
* real-time token generation rate measurement.
|
||||
*/
|
||||
export class SlotsService {
|
||||
private callbacks: Set<(state: ApiProcessingState | null) => void> = new Set();
|
||||
private isStreamingActive: boolean = false;
|
||||
private lastKnownState: ApiProcessingState | null = null;
|
||||
|
||||
/**
|
||||
* Start streaming session tracking
|
||||
*/
|
||||
startStreaming(): void {
|
||||
this.isStreamingActive = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Stop streaming session tracking
|
||||
*/
|
||||
stopStreaming(): void {
|
||||
this.isStreamingActive = false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear the current processing state
|
||||
* Used when switching to a conversation without timing data
|
||||
*/
|
||||
clearState(): void {
|
||||
this.lastKnownState = null;
|
||||
|
||||
for (const callback of this.callbacks) {
|
||||
try {
|
||||
callback(null);
|
||||
} catch (error) {
|
||||
console.error('Error in clearState callback:', error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if currently in a streaming session
|
||||
*/
|
||||
isStreaming(): boolean {
|
||||
return this.isStreamingActive;
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Polling is no longer used - timing data comes from ChatService streaming response
|
||||
* This method logs a warning if called to help identify outdated usage
|
||||
*/
|
||||
fetchAndNotify(): void {
|
||||
console.warn(
|
||||
'SlotsService.fetchAndNotify() is deprecated - use timing data from ChatService instead'
|
||||
);
|
||||
}
|
||||
|
||||
subscribe(callback: (state: ApiProcessingState | null) => void): () => void {
|
||||
this.callbacks.add(callback);
|
||||
|
||||
if (this.lastKnownState) {
|
||||
callback(this.lastKnownState);
|
||||
}
|
||||
|
||||
return () => {
|
||||
this.callbacks.delete(callback);
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates processing state with timing data from ChatService streaming response
|
||||
*/
|
||||
async updateFromTimingData(timingData: {
|
||||
prompt_n: number;
|
||||
predicted_n: number;
|
||||
predicted_per_second: number;
|
||||
cache_n: number;
|
||||
prompt_progress?: ChatMessagePromptProgress;
|
||||
}): Promise<void> {
|
||||
const processingState = await this.parseCompletionTimingData(timingData);
|
||||
|
||||
// Only update if we successfully parsed the state
|
||||
if (processingState === null) {
|
||||
console.warn('Failed to parse timing data - skipping update');
|
||||
return;
|
||||
}
|
||||
|
||||
this.lastKnownState = processingState;
|
||||
|
||||
for (const callback of this.callbacks) {
|
||||
try {
|
||||
callback(processingState);
|
||||
} catch (error) {
|
||||
console.error('Error in timing callback:', error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets context total from last known slots data or fetches from server
|
||||
*/
|
||||
private async getContextTotal(): Promise<number | null> {
|
||||
if (this.lastKnownState && this.lastKnownState.contextTotal > 0) {
|
||||
return this.lastKnownState.contextTotal;
|
||||
}
|
||||
|
||||
try {
|
||||
const currentConfig = config();
|
||||
const apiKey = currentConfig.apiKey?.toString().trim();
|
||||
|
||||
const response = await fetch('/slots', {
|
||||
headers: {
|
||||
...(apiKey ? { Authorization: `Bearer ${apiKey}` } : {})
|
||||
}
|
||||
});
|
||||
if (response.ok) {
|
||||
const slotsData = await response.json();
|
||||
if (Array.isArray(slotsData) && slotsData.length > 0) {
|
||||
const slot = slotsData[0];
|
||||
if (slot.n_ctx && slot.n_ctx > 0) {
|
||||
return slot.n_ctx;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.warn('Failed to fetch context total from /slots:', error);
|
||||
}
|
||||
|
||||
return 4096;
|
||||
}
|
||||
|
||||
private async parseCompletionTimingData(
|
||||
timingData: Record<string, unknown>
|
||||
): Promise<ApiProcessingState | null> {
|
||||
const promptTokens = (timingData.prompt_n as number) || 0;
|
||||
const predictedTokens = (timingData.predicted_n as number) || 0;
|
||||
const tokensPerSecond = (timingData.predicted_per_second as number) || 0;
|
||||
const cacheTokens = (timingData.cache_n as number) || 0;
|
||||
const promptProgress = timingData.prompt_progress as
|
||||
| {
|
||||
total: number;
|
||||
cache: number;
|
||||
processed: number;
|
||||
time_ms: number;
|
||||
}
|
||||
| undefined;
|
||||
|
||||
const contextTotal = await this.getContextTotal();
|
||||
|
||||
if (contextTotal === null) {
|
||||
console.warn('No context total available - cannot calculate processing state');
|
||||
return null;
|
||||
}
|
||||
|
||||
const currentConfig = config();
|
||||
const outputTokensMax = currentConfig.max_tokens || -1;
|
||||
|
||||
const contextUsed = promptTokens + cacheTokens + predictedTokens;
|
||||
const outputTokensUsed = predictedTokens;
|
||||
|
||||
const progressPercent = promptProgress
|
||||
? Math.round((promptProgress.processed / promptProgress.total) * 100)
|
||||
: undefined;
|
||||
|
||||
return {
|
||||
status: predictedTokens > 0 ? 'generating' : promptProgress ? 'preparing' : 'idle',
|
||||
tokensDecoded: predictedTokens,
|
||||
tokensRemaining: outputTokensMax - predictedTokens,
|
||||
contextUsed,
|
||||
contextTotal,
|
||||
outputTokensUsed,
|
||||
outputTokensMax,
|
||||
hasNextToken: predictedTokens > 0,
|
||||
tokensPerSecond,
|
||||
temperature: currentConfig.temperature ?? 0.8,
|
||||
topP: currentConfig.top_p ?? 0.95,
|
||||
speculative: false,
|
||||
progressPercent,
|
||||
promptTokens,
|
||||
cacheTokens
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Get current processing state
|
||||
* Returns the last known state from timing data, or null if no data available
|
||||
*/
|
||||
async getCurrentState(): Promise<ApiProcessingState | null> {
|
||||
if (this.lastKnownState) {
|
||||
return this.lastKnownState;
|
||||
}
|
||||
try {
|
||||
// Import dynamically to avoid circular dependency
|
||||
const { chatStore } = await import('$lib/stores/chat.svelte');
|
||||
const messages = chatStore.activeMessages;
|
||||
|
||||
for (let i = messages.length - 1; i >= 0; i--) {
|
||||
const message = messages[i];
|
||||
if (message.role === 'assistant' && message.timings) {
|
||||
const restoredState = await this.parseCompletionTimingData({
|
||||
prompt_n: message.timings.prompt_n || 0,
|
||||
predicted_n: message.timings.predicted_n || 0,
|
||||
predicted_per_second:
|
||||
message.timings.predicted_n && message.timings.predicted_ms
|
||||
? (message.timings.predicted_n / message.timings.predicted_ms) * 1000
|
||||
: 0,
|
||||
cache_n: message.timings.cache_n || 0
|
||||
});
|
||||
|
||||
if (restoredState) {
|
||||
this.lastKnownState = restoredState;
|
||||
return restoredState;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.warn('Failed to restore timing data from messages:', error);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/** Shared singleton instance; imported by ChatService and UI components. */
export const slotsService = new SlotsService();
|
||||
Reference in New Issue
Block a user