* Switched web UI to hash-based routing * Added hash to missed goto function call * Removed outdated SPA handling code * Fixed broken sidebar home link
255 lines
7.3 KiB
TypeScript
255 lines
7.3 KiB
TypeScript
import { config } from '$lib/stores/settings.svelte';
|
|
|
|
/**
|
|
* SlotsService - Real-time processing state monitoring and token rate calculation
|
|
*
|
|
* This service provides real-time information about generation progress, token rates,
|
|
* and context usage based on timing data from ChatService streaming responses.
|
|
* It manages streaming session tracking and provides accurate processing state updates.
|
|
*
|
|
* **Architecture & Relationships:**
|
|
* - **SlotsService** (this class): Processing state monitoring
|
|
* - Receives timing data from ChatService streaming responses
|
|
* - Calculates token generation rates and context usage
|
|
* - Manages streaming session lifecycle
|
|
* - Provides real-time updates to UI components
|
|
*
|
|
* - **ChatService**: Provides timing data from `/chat/completions` streaming
|
|
* - **UI Components**: Subscribe to processing state for progress indicators
|
|
*
|
|
* **Key Features:**
|
|
* - **Real-time Monitoring**: Live processing state during generation
|
|
* - **Token Rate Calculation**: Accurate tokens/second from timing data
|
|
* - **Context Tracking**: Current context usage and remaining capacity
|
|
* - **Streaming Lifecycle**: Start/stop tracking for streaming sessions
|
|
* - **Timing Data Processing**: Converts streaming timing data to structured state
|
|
* - **Error Handling**: Graceful handling when timing data is unavailable
|
|
*
|
|
* **Processing States:**
|
|
* - `idle`: No active processing
|
|
* - `generating`: Actively generating tokens
|
|
*
|
|
* **Token Rate Calculation:**
|
|
* Uses timing data from `/chat/completions` streaming response for accurate
|
|
* real-time token generation rate measurement.
|
|
*/
|
|
export class SlotsService {
|
|
private callbacks: Set<(state: ApiProcessingState | null) => void> = new Set();
|
|
private isStreamingActive: boolean = false;
|
|
private lastKnownState: ApiProcessingState | null = null;
|
|
|
|
/**
|
|
* Start streaming session tracking
|
|
*/
|
|
startStreaming(): void {
|
|
this.isStreamingActive = true;
|
|
}
|
|
|
|
/**
|
|
* Stop streaming session tracking
|
|
*/
|
|
stopStreaming(): void {
|
|
this.isStreamingActive = false;
|
|
}
|
|
|
|
/**
|
|
* Clear the current processing state
|
|
* Used when switching to a conversation without timing data
|
|
*/
|
|
clearState(): void {
|
|
this.lastKnownState = null;
|
|
|
|
for (const callback of this.callbacks) {
|
|
try {
|
|
callback(null);
|
|
} catch (error) {
|
|
console.error('Error in clearState callback:', error);
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Check if currently in a streaming session
|
|
*/
|
|
isStreaming(): boolean {
|
|
return this.isStreamingActive;
|
|
}
|
|
|
|
/**
|
|
* @deprecated Polling is no longer used - timing data comes from ChatService streaming response
|
|
* This method logs a warning if called to help identify outdated usage
|
|
*/
|
|
fetchAndNotify(): void {
|
|
console.warn(
|
|
'SlotsService.fetchAndNotify() is deprecated - use timing data from ChatService instead'
|
|
);
|
|
}
|
|
|
|
subscribe(callback: (state: ApiProcessingState | null) => void): () => void {
|
|
this.callbacks.add(callback);
|
|
|
|
if (this.lastKnownState) {
|
|
callback(this.lastKnownState);
|
|
}
|
|
|
|
return () => {
|
|
this.callbacks.delete(callback);
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Updates processing state with timing data from ChatService streaming response
|
|
*/
|
|
async updateFromTimingData(timingData: {
|
|
prompt_n: number;
|
|
predicted_n: number;
|
|
predicted_per_second: number;
|
|
cache_n: number;
|
|
prompt_progress?: ChatMessagePromptProgress;
|
|
}): Promise<void> {
|
|
const processingState = await this.parseCompletionTimingData(timingData);
|
|
|
|
// Only update if we successfully parsed the state
|
|
if (processingState === null) {
|
|
console.warn('Failed to parse timing data - skipping update');
|
|
return;
|
|
}
|
|
|
|
this.lastKnownState = processingState;
|
|
|
|
for (const callback of this.callbacks) {
|
|
try {
|
|
callback(processingState);
|
|
} catch (error) {
|
|
console.error('Error in timing callback:', error);
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Gets context total from last known slots data or fetches from server
|
|
*/
|
|
private async getContextTotal(): Promise<number | null> {
|
|
if (this.lastKnownState && this.lastKnownState.contextTotal > 0) {
|
|
return this.lastKnownState.contextTotal;
|
|
}
|
|
|
|
try {
|
|
const currentConfig = config();
|
|
const apiKey = currentConfig.apiKey?.toString().trim();
|
|
|
|
const response = await fetch(`./slots`, {
|
|
headers: {
|
|
...(apiKey ? { Authorization: `Bearer ${apiKey}` } : {})
|
|
}
|
|
});
|
|
if (response.ok) {
|
|
const slotsData = await response.json();
|
|
if (Array.isArray(slotsData) && slotsData.length > 0) {
|
|
const slot = slotsData[0];
|
|
if (slot.n_ctx && slot.n_ctx > 0) {
|
|
return slot.n_ctx;
|
|
}
|
|
}
|
|
}
|
|
} catch (error) {
|
|
console.warn('Failed to fetch context total from /slots:', error);
|
|
}
|
|
|
|
return 4096;
|
|
}
|
|
|
|
private async parseCompletionTimingData(
|
|
timingData: Record<string, unknown>
|
|
): Promise<ApiProcessingState | null> {
|
|
const promptTokens = (timingData.prompt_n as number) || 0;
|
|
const predictedTokens = (timingData.predicted_n as number) || 0;
|
|
const tokensPerSecond = (timingData.predicted_per_second as number) || 0;
|
|
const cacheTokens = (timingData.cache_n as number) || 0;
|
|
const promptProgress = timingData.prompt_progress as
|
|
| {
|
|
total: number;
|
|
cache: number;
|
|
processed: number;
|
|
time_ms: number;
|
|
}
|
|
| undefined;
|
|
|
|
const contextTotal = await this.getContextTotal();
|
|
|
|
if (contextTotal === null) {
|
|
console.warn('No context total available - cannot calculate processing state');
|
|
return null;
|
|
}
|
|
|
|
const currentConfig = config();
|
|
const outputTokensMax = currentConfig.max_tokens || -1;
|
|
|
|
const contextUsed = promptTokens + cacheTokens + predictedTokens;
|
|
const outputTokensUsed = predictedTokens;
|
|
|
|
const progressPercent = promptProgress
|
|
? Math.round((promptProgress.processed / promptProgress.total) * 100)
|
|
: undefined;
|
|
|
|
return {
|
|
status: predictedTokens > 0 ? 'generating' : promptProgress ? 'preparing' : 'idle',
|
|
tokensDecoded: predictedTokens,
|
|
tokensRemaining: outputTokensMax - predictedTokens,
|
|
contextUsed,
|
|
contextTotal,
|
|
outputTokensUsed,
|
|
outputTokensMax,
|
|
hasNextToken: predictedTokens > 0,
|
|
tokensPerSecond,
|
|
temperature: currentConfig.temperature ?? 0.8,
|
|
topP: currentConfig.top_p ?? 0.95,
|
|
speculative: false,
|
|
progressPercent,
|
|
promptTokens,
|
|
cacheTokens
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Get current processing state
|
|
* Returns the last known state from timing data, or null if no data available
|
|
*/
|
|
async getCurrentState(): Promise<ApiProcessingState | null> {
|
|
if (this.lastKnownState) {
|
|
return this.lastKnownState;
|
|
}
|
|
try {
|
|
// Import dynamically to avoid circular dependency
|
|
const { chatStore } = await import('$lib/stores/chat.svelte');
|
|
const messages = chatStore.activeMessages;
|
|
|
|
for (let i = messages.length - 1; i >= 0; i--) {
|
|
const message = messages[i];
|
|
if (message.role === 'assistant' && message.timings) {
|
|
const restoredState = await this.parseCompletionTimingData({
|
|
prompt_n: message.timings.prompt_n || 0,
|
|
predicted_n: message.timings.predicted_n || 0,
|
|
predicted_per_second:
|
|
message.timings.predicted_n && message.timings.predicted_ms
|
|
? (message.timings.predicted_n / message.timings.predicted_ms) * 1000
|
|
: 0,
|
|
cache_n: message.timings.cache_n || 0
|
|
});
|
|
|
|
if (restoredState) {
|
|
this.lastKnownState = restoredState;
|
|
return restoredState;
|
|
}
|
|
}
|
|
}
|
|
} catch (error) {
|
|
console.warn('Failed to restore timing data from messages:', error);
|
|
}
|
|
|
|
return null;
|
|
}
|
|
}
|
|
|
|
export const slotsService = new SlotsService();
|