[router] create worker removal step and clean up worker manager (#11921)
This commit is contained in:
@@ -15,11 +15,9 @@ use tracing::{debug, error, info, warn};
|
||||
|
||||
use crate::{
|
||||
config::{RouterConfig, RoutingMode},
|
||||
core::{
|
||||
workflow::{
|
||||
WorkflowContext, WorkflowEngine, WorkflowId, WorkflowInstanceId, WorkflowStatus,
|
||||
},
|
||||
WorkerManager,
|
||||
core::workflow::{
|
||||
steps::WorkerRemovalRequest, WorkflowContext, WorkflowEngine, WorkflowId,
|
||||
WorkflowInstanceId, WorkflowStatus,
|
||||
},
|
||||
metrics::RouterMetrics,
|
||||
protocols::worker_spec::{JobStatus, WorkerConfigRequest},
|
||||
@@ -320,11 +318,29 @@ impl JobQueue {
|
||||
.await
|
||||
}
|
||||
Job::RemoveWorker { url } => {
|
||||
let result = WorkerManager::remove_worker(url, context);
|
||||
let engine = context
|
||||
.workflow_engine
|
||||
.get()
|
||||
.ok_or_else(|| "Workflow engine not initialized".to_string())?;
|
||||
|
||||
let instance_id = Self::start_worker_removal_workflow(engine, url, context).await?;
|
||||
|
||||
debug!(
|
||||
"Started worker removal workflow for {} (instance: {})",
|
||||
url, instance_id
|
||||
);
|
||||
|
||||
let timeout_duration = Duration::from_secs(30);
|
||||
|
||||
let result =
|
||||
Self::wait_for_workflow_completion(engine, instance_id, url, timeout_duration)
|
||||
.await;
|
||||
|
||||
// Clean up job status when removing worker
|
||||
if let Some(queue) = context.worker_job_queue.get() {
|
||||
queue.remove_status(url);
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
Job::InitializeWorkersFromConfig { router_config } => {
|
||||
@@ -424,6 +440,27 @@ impl JobQueue {
|
||||
.map_err(|e| format!("Failed to start worker registration workflow: {:?}", e))
|
||||
}
|
||||
|
||||
/// Start worker removal workflow
|
||||
async fn start_worker_removal_workflow(
|
||||
engine: &Arc<WorkflowEngine>,
|
||||
url: &str,
|
||||
context: &Arc<AppContext>,
|
||||
) -> Result<WorkflowInstanceId, String> {
|
||||
let removal_request = WorkerRemovalRequest {
|
||||
url: url.to_string(),
|
||||
dp_aware: context.router_config.dp_aware,
|
||||
};
|
||||
|
||||
let mut workflow_context = WorkflowContext::new(WorkflowInstanceId::new());
|
||||
workflow_context.set("removal_request", removal_request);
|
||||
workflow_context.set_arc("app_context", Arc::clone(context));
|
||||
|
||||
engine
|
||||
.start_workflow(WorkflowId::new("worker_removal"), workflow_context)
|
||||
.await
|
||||
.map_err(|e| format!("Failed to start worker removal workflow: {:?}", e))
|
||||
}
|
||||
|
||||
/// Wait for workflow completion with adaptive polling
|
||||
async fn wait_for_workflow_completion(
|
||||
engine: &Arc<WorkflowEngine>,
|
||||
|
||||
@@ -29,5 +29,5 @@ pub use worker::{
|
||||
Worker, WorkerFactory, WorkerLoadGuard, WorkerType,
|
||||
};
|
||||
pub use worker_builder::{BasicWorkerBuilder, DPAwareWorkerBuilder};
|
||||
pub use worker_manager::{DpInfo, LoadMonitor, ServerInfo, WorkerManager};
|
||||
pub use worker_manager::{LoadMonitor, WorkerManager};
|
||||
pub use worker_registry::{WorkerId, WorkerRegistry, WorkerRegistryStats};
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -14,5 +14,5 @@ pub use engine::WorkflowEngine;
|
||||
pub use event::{EventBus, EventSubscriber, LoggingSubscriber, WorkflowEvent};
|
||||
pub use executor::{FunctionStep, StepExecutor};
|
||||
pub use state::WorkflowStateStore;
|
||||
pub use steps::create_worker_registration_workflow;
|
||||
pub use steps::{create_worker_registration_workflow, create_worker_removal_workflow};
|
||||
pub use types::*;
|
||||
|
||||
@@ -2,11 +2,17 @@
|
||||
//!
|
||||
//! This module contains concrete step implementations for various workflows:
|
||||
//! - Worker registration and activation
|
||||
//! - Worker removal
|
||||
//! - Future: Tokenizer fetching, LoRA updates, etc.
|
||||
|
||||
pub mod worker_registration;
|
||||
pub mod worker_removal;
|
||||
|
||||
pub use worker_registration::{
|
||||
create_worker_registration_workflow, ActivateWorkerStep, CreateWorkerStep,
|
||||
DetectConnectionModeStep, DiscoverMetadataStep, RegisterWorkerStep, UpdatePoliciesStep,
|
||||
};
|
||||
pub use worker_removal::{
|
||||
create_worker_removal_workflow, FindWorkersToRemoveStep, RemoveFromPolicyRegistryStep,
|
||||
RemoveFromWorkerRegistryStep, UpdateRemainingPoliciesStep, WorkerRemovalRequest,
|
||||
};
|
||||
|
||||
@@ -16,13 +16,14 @@ use std::{collections::HashMap, sync::Arc, time::Duration};
|
||||
use async_trait::async_trait;
|
||||
use once_cell::sync::Lazy;
|
||||
use reqwest::Client;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
use crate::{
|
||||
core::{
|
||||
workflow::*, BasicWorkerBuilder, CircuitBreakerConfig, ConnectionMode,
|
||||
DPAwareWorkerBuilder, DpInfo, HealthConfig, Worker, WorkerManager, WorkerType,
|
||||
DPAwareWorkerBuilder, HealthConfig, Worker, WorkerType,
|
||||
},
|
||||
grpc_client::SglangSchedulerClient,
|
||||
protocols::worker_spec::WorkerConfigRequest,
|
||||
@@ -37,6 +38,82 @@ static HTTP_CLIENT: Lazy<Client> = Lazy::new(|| {
|
||||
.expect("Failed to create HTTP client")
|
||||
});
|
||||
|
||||
/// Server information returned from worker endpoints
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
struct ServerInfo {
|
||||
#[serde(alias = "model")]
|
||||
model_id: Option<String>,
|
||||
model_path: Option<String>,
|
||||
dp_size: Option<usize>,
|
||||
version: Option<String>,
|
||||
max_batch_size: Option<usize>,
|
||||
max_total_tokens: Option<usize>,
|
||||
max_prefill_tokens: Option<usize>,
|
||||
max_running_requests: Option<usize>,
|
||||
max_num_reqs: Option<usize>,
|
||||
}
|
||||
|
||||
/// DP information resolved for a single worker URL.
#[derive(Debug, Clone)]
pub struct DpInfo {
    /// DP size reported by the worker.
    pub dp_size: usize,
    /// Model identifier associated with the worker.
    pub model_id: String,
}
|
||||
|
||||
/// Parse server info from JSON response using serde
|
||||
fn parse_server_info(json: Value) -> Result<ServerInfo, String> {
|
||||
serde_json::from_value(json).map_err(|e| format!("Failed to parse server info: {}", e))
|
||||
}
|
||||
|
||||
/// Get server info from /get_server_info endpoint
|
||||
async fn get_server_info(url: &str, api_key: Option<&str>) -> Result<ServerInfo, String> {
|
||||
let base_url = url.trim_end_matches('/');
|
||||
let server_info_url = format!("{}/get_server_info", base_url);
|
||||
|
||||
let mut req = HTTP_CLIENT.get(&server_info_url);
|
||||
if let Some(key) = api_key {
|
||||
req = req.bearer_auth(key);
|
||||
}
|
||||
|
||||
let response = req
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("Failed to connect to {}: {}", server_info_url, e))?;
|
||||
|
||||
if !response.status().is_success() {
|
||||
return Err(format!(
|
||||
"Server returned status {} from {}",
|
||||
response.status(),
|
||||
server_info_url
|
||||
));
|
||||
}
|
||||
|
||||
let json = response
|
||||
.json::<Value>()
|
||||
.await
|
||||
.map_err(|e| format!("Failed to parse response from {}: {}", server_info_url, e))?;
|
||||
|
||||
parse_server_info(json)
|
||||
}
|
||||
|
||||
/// Get DP info for a worker URL
|
||||
async fn get_dp_info(url: &str, api_key: Option<&str>) -> Result<DpInfo, String> {
|
||||
let info = get_server_info(url, api_key).await?;
|
||||
|
||||
let dp_size = info
|
||||
.dp_size
|
||||
.ok_or_else(|| format!("No dp_size in response from {}", url))?;
|
||||
|
||||
let model_id = info
|
||||
.model_id
|
||||
.or_else(|| {
|
||||
info.model_path
|
||||
.and_then(|path| path.split('/').next_back().map(|s| s.to_string()))
|
||||
})
|
||||
.unwrap_or_else(|| "unknown".to_string());
|
||||
|
||||
Ok(DpInfo { dp_size, model_id })
|
||||
}
|
||||
|
||||
/// Helper: Strip protocol prefix from URL
|
||||
fn strip_protocol(url: &str) -> String {
|
||||
url.trim_start_matches("http://")
|
||||
@@ -83,49 +160,6 @@ async fn try_grpc_health_check(url: &str, timeout_secs: u64) -> Result<(), Strin
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Helper: Fetch HTTP metadata
|
||||
async fn fetch_http_metadata(
|
||||
url: &str,
|
||||
api_key: Option<&str>,
|
||||
) -> Result<HashMap<String, String>, String> {
|
||||
let clean_url = strip_protocol(url);
|
||||
let info_url = if clean_url.starts_with("http://") || clean_url.starts_with("https://") {
|
||||
format!("{}/get_server_info", clean_url)
|
||||
} else {
|
||||
format!("http://{}/get_server_info", clean_url)
|
||||
};
|
||||
|
||||
let mut request = HTTP_CLIENT.get(&info_url);
|
||||
if let Some(key) = api_key {
|
||||
request = request.header("Authorization", format!("Bearer {}", key));
|
||||
}
|
||||
|
||||
let response = request
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("Failed to fetch HTTP metadata: {}", e))?;
|
||||
|
||||
let server_info: Value = response
|
||||
.json()
|
||||
.await
|
||||
.map_err(|e| format!("Failed to parse HTTP metadata: {}", e))?;
|
||||
|
||||
let mut labels = HashMap::new();
|
||||
|
||||
if let Some(model_path) = server_info.get("model_path").and_then(|v| v.as_str()) {
|
||||
if !model_path.is_empty() {
|
||||
labels.insert("model_path".to_string(), model_path.to_string());
|
||||
}
|
||||
}
|
||||
if let Some(tokenizer_path) = server_info.get("tokenizer_path").and_then(|v| v.as_str()) {
|
||||
if !tokenizer_path.is_empty() {
|
||||
labels.insert("tokenizer_path".to_string(), tokenizer_path.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(labels)
|
||||
}
|
||||
|
||||
/// Helper: Fetch gRPC metadata
|
||||
async fn fetch_grpc_metadata(url: &str) -> Result<HashMap<String, String>, String> {
|
||||
let grpc_url = if url.starts_with("grpc://") {
|
||||
@@ -266,7 +300,18 @@ impl StepExecutor for DiscoverMetadataStep {
|
||||
|
||||
let discovered_labels = match connection_mode.as_ref() {
|
||||
ConnectionMode::Http => {
|
||||
fetch_http_metadata(&config.url, config.api_key.as_deref()).await
|
||||
match get_server_info(&config.url, config.api_key.as_deref()).await {
|
||||
Ok(server_info) => {
|
||||
let mut labels = HashMap::new();
|
||||
if let Some(model_path) = server_info.model_path {
|
||||
if !model_path.is_empty() {
|
||||
labels.insert("model_path".to_string(), model_path);
|
||||
}
|
||||
}
|
||||
Ok(labels)
|
||||
}
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
ConnectionMode::Grpc { .. } => fetch_grpc_metadata(&config.url).await,
|
||||
}
|
||||
@@ -314,7 +359,7 @@ impl StepExecutor for DiscoverDPInfoStep {
|
||||
debug!("Discovering DP info for {} (DP-aware)", config.url);
|
||||
|
||||
// Get DP info from worker
|
||||
let dp_info = WorkerManager::get_dp_info(&config.url, config.api_key.as_deref())
|
||||
let dp_info = get_dp_info(&config.url, config.api_key.as_deref())
|
||||
.await
|
||||
.map_err(|e| WorkflowError::StepFailed {
|
||||
step_id: StepId::new("discover_dp_info"),
|
||||
@@ -327,7 +372,7 @@ impl StepExecutor for DiscoverDPInfoStep {
|
||||
);
|
||||
|
||||
// Store DP info in context
|
||||
context.set("dp_info", Arc::new(dp_info));
|
||||
context.set("dp_info", dp_info);
|
||||
|
||||
Ok(StepResult::Success)
|
||||
}
|
||||
@@ -522,7 +567,7 @@ impl StepExecutor for CreateWorkerStep {
|
||||
}
|
||||
|
||||
// Store workers (plural) and labels in context
|
||||
context.set("workers", Arc::new(workers));
|
||||
context.set("workers", workers);
|
||||
context.set("labels", final_labels);
|
||||
|
||||
Ok(StepResult::Success)
|
||||
@@ -595,7 +640,7 @@ impl StepExecutor for RegisterWorkerStep {
|
||||
);
|
||||
}
|
||||
|
||||
context.set("worker_ids", Arc::new(worker_ids));
|
||||
context.set("worker_ids", worker_ids);
|
||||
Ok(StepResult::Success)
|
||||
} else {
|
||||
// Non-DP-aware path: Register single worker
|
||||
|
||||
310
sgl-router/src/core/workflow/steps/worker_removal.rs
Normal file
310
sgl-router/src/core/workflow/steps/worker_removal.rs
Normal file
@@ -0,0 +1,310 @@
|
||||
//! Worker Removal Workflow Steps
|
||||
//!
|
||||
//! This module implements the workflow steps for removing workers from the router.
|
||||
//! Handles both single worker removal and DP-aware worker removal with prefix matching.
|
||||
//!
|
||||
//! Steps:
|
||||
//! 1. FindWorkersToRemove - Identify workers to remove based on URL (handles DP-aware prefix matching)
|
||||
//! 2. RemoveFromPolicyRegistry - Remove workers from policy registry and cache-aware policies
|
||||
//! 3. RemoveFromWorkerRegistry - Remove workers from worker registry
|
||||
//! 4. UpdateRemainingPolicies - Update cache-aware policies for remaining workers
|
||||
|
||||
use std::{collections::HashSet, sync::Arc};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use tracing::{debug, info};
|
||||
|
||||
use crate::{
|
||||
core::{workflow::*, Worker},
|
||||
server::AppContext,
|
||||
};
|
||||
|
||||
/// Request structure for worker removal
#[derive(Debug, Clone)]
pub struct WorkerRemovalRequest {
    /// URL of the worker to remove; in DP-aware mode it is used as a prefix.
    pub url: String,
    /// Whether removal should match every worker registered under `<url>@`.
    pub dp_aware: bool,
}
|
||||
|
||||
/// Step 1: Find workers to remove based on URL
|
||||
pub struct FindWorkersToRemoveStep;
|
||||
|
||||
#[async_trait]
|
||||
impl StepExecutor for FindWorkersToRemoveStep {
|
||||
async fn execute(&self, context: &mut WorkflowContext) -> WorkflowResult<StepResult> {
|
||||
let request: Arc<WorkerRemovalRequest> = context
|
||||
.get("removal_request")
|
||||
.ok_or_else(|| WorkflowError::ContextValueNotFound("removal_request".to_string()))?;
|
||||
let app_context: Arc<AppContext> = context
|
||||
.get("app_context")
|
||||
.ok_or_else(|| WorkflowError::ContextValueNotFound("app_context".to_string()))?;
|
||||
|
||||
debug!(
|
||||
"Finding workers to remove for {} (dp_aware: {})",
|
||||
request.url, request.dp_aware
|
||||
);
|
||||
|
||||
let workers_to_remove: Vec<Arc<dyn Worker>> = if request.dp_aware {
|
||||
// DP-aware: Find all workers with matching prefix
|
||||
let worker_url_prefix = format!("{}@", request.url);
|
||||
let all_workers = app_context.worker_registry.get_all();
|
||||
|
||||
all_workers
|
||||
.iter()
|
||||
.filter(|worker| worker.url().starts_with(&worker_url_prefix))
|
||||
.cloned()
|
||||
.collect()
|
||||
} else {
|
||||
// Non-DP-aware: Find single worker by exact URL
|
||||
match app_context.worker_registry.get_by_url(&request.url) {
|
||||
Some(worker) => vec![worker],
|
||||
None => Vec::new(),
|
||||
}
|
||||
};
|
||||
|
||||
if workers_to_remove.is_empty() {
|
||||
let error_msg = if request.dp_aware {
|
||||
format!("No workers found with prefix {}@", request.url)
|
||||
} else {
|
||||
format!("Worker {} not found", request.url)
|
||||
};
|
||||
return Err(WorkflowError::StepFailed {
|
||||
step_id: StepId::new("find_workers_to_remove"),
|
||||
message: error_msg,
|
||||
});
|
||||
}
|
||||
|
||||
debug!(
|
||||
"Found {} worker(s) to remove for {}",
|
||||
workers_to_remove.len(),
|
||||
request.url
|
||||
);
|
||||
|
||||
// Store workers and their model IDs for subsequent steps
|
||||
let worker_urls: Vec<String> = workers_to_remove
|
||||
.iter()
|
||||
.map(|w| w.url().to_string())
|
||||
.collect();
|
||||
|
||||
let affected_models: HashSet<String> = workers_to_remove
|
||||
.iter()
|
||||
.map(|w| w.model_id().to_string())
|
||||
.collect();
|
||||
|
||||
context.set("workers_to_remove", workers_to_remove);
|
||||
context.set("worker_urls", worker_urls);
|
||||
context.set("affected_models", affected_models);
|
||||
|
||||
Ok(StepResult::Success)
|
||||
}
|
||||
|
||||
fn is_retryable(&self, _error: &WorkflowError) -> bool {
|
||||
false // Worker not found is not retryable
|
||||
}
|
||||
}
|
||||
|
||||
/// Step 2: Remove workers from policy registry
|
||||
pub struct RemoveFromPolicyRegistryStep;
|
||||
|
||||
#[async_trait]
|
||||
impl StepExecutor for RemoveFromPolicyRegistryStep {
|
||||
async fn execute(&self, context: &mut WorkflowContext) -> WorkflowResult<StepResult> {
|
||||
let app_context: Arc<AppContext> = context
|
||||
.get("app_context")
|
||||
.ok_or_else(|| WorkflowError::ContextValueNotFound("app_context".to_string()))?;
|
||||
let workers_to_remove: Arc<Vec<Arc<dyn Worker>>> = context
|
||||
.get("workers_to_remove")
|
||||
.ok_or_else(|| WorkflowError::ContextValueNotFound("workers_to_remove".to_string()))?;
|
||||
|
||||
debug!(
|
||||
"Removing {} worker(s) from policy registry",
|
||||
workers_to_remove.len()
|
||||
);
|
||||
|
||||
for worker in workers_to_remove.iter() {
|
||||
let model_id = worker.model_id().to_string();
|
||||
let worker_url = worker.url();
|
||||
|
||||
// Remove from cache-aware policy
|
||||
app_context
|
||||
.policy_registry
|
||||
.remove_worker_from_cache_aware(&model_id, worker_url);
|
||||
|
||||
// Notify policy registry
|
||||
app_context.policy_registry.on_worker_removed(&model_id);
|
||||
|
||||
debug!(
|
||||
"Removed worker {} from policy registry (model: {})",
|
||||
worker_url, model_id
|
||||
);
|
||||
}
|
||||
|
||||
Ok(StepResult::Success)
|
||||
}
|
||||
|
||||
fn is_retryable(&self, _error: &WorkflowError) -> bool {
|
||||
false // Policy removal is not retryable
|
||||
}
|
||||
}
|
||||
|
||||
/// Step 3: Remove workers from worker registry
|
||||
pub struct RemoveFromWorkerRegistryStep;
|
||||
|
||||
#[async_trait]
|
||||
impl StepExecutor for RemoveFromWorkerRegistryStep {
|
||||
async fn execute(&self, context: &mut WorkflowContext) -> WorkflowResult<StepResult> {
|
||||
let app_context: Arc<AppContext> = context
|
||||
.get("app_context")
|
||||
.ok_or_else(|| WorkflowError::ContextValueNotFound("app_context".to_string()))?;
|
||||
let worker_urls: Arc<Vec<String>> = context
|
||||
.get("worker_urls")
|
||||
.ok_or_else(|| WorkflowError::ContextValueNotFound("worker_urls".to_string()))?;
|
||||
|
||||
debug!(
|
||||
"Removing {} worker(s) from worker registry",
|
||||
worker_urls.len()
|
||||
);
|
||||
|
||||
let mut removed_count = 0;
|
||||
for worker_url in worker_urls.iter() {
|
||||
if app_context
|
||||
.worker_registry
|
||||
.remove_by_url(worker_url)
|
||||
.is_some()
|
||||
{
|
||||
removed_count += 1;
|
||||
debug!("Removed worker {} from registry", worker_url);
|
||||
}
|
||||
}
|
||||
|
||||
if removed_count != worker_urls.len() {
|
||||
return Err(WorkflowError::StepFailed {
|
||||
step_id: StepId::new("remove_from_worker_registry"),
|
||||
message: format!(
|
||||
"Expected to remove {} workers but only removed {}",
|
||||
worker_urls.len(),
|
||||
removed_count
|
||||
),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(StepResult::Success)
|
||||
}
|
||||
|
||||
fn is_retryable(&self, _error: &WorkflowError) -> bool {
|
||||
false // Worker removal is not retryable
|
||||
}
|
||||
}
|
||||
|
||||
/// Step 4: Update cache-aware policies for remaining workers
|
||||
pub struct UpdateRemainingPoliciesStep;
|
||||
|
||||
#[async_trait]
|
||||
impl StepExecutor for UpdateRemainingPoliciesStep {
|
||||
async fn execute(&self, context: &mut WorkflowContext) -> WorkflowResult<StepResult> {
|
||||
let app_context: Arc<AppContext> = context
|
||||
.get("app_context")
|
||||
.ok_or_else(|| WorkflowError::ContextValueNotFound("app_context".to_string()))?;
|
||||
let affected_models: Arc<HashSet<String>> = context
|
||||
.get("affected_models")
|
||||
.ok_or_else(|| WorkflowError::ContextValueNotFound("affected_models".to_string()))?;
|
||||
let worker_urls: Arc<Vec<String>> = context
|
||||
.get("worker_urls")
|
||||
.ok_or_else(|| WorkflowError::ContextValueNotFound("worker_urls".to_string()))?;
|
||||
|
||||
debug!(
|
||||
"Updating cache-aware policies for {} affected model(s)",
|
||||
affected_models.len()
|
||||
);
|
||||
|
||||
for model_id in affected_models.iter() {
|
||||
let remaining_workers = app_context.worker_registry.get_by_model_fast(model_id);
|
||||
|
||||
if let Some(policy) = app_context.policy_registry.get_policy(model_id) {
|
||||
if policy.name() == "cache_aware" && !remaining_workers.is_empty() {
|
||||
app_context
|
||||
.policy_registry
|
||||
.init_cache_aware_policy(model_id, &remaining_workers);
|
||||
|
||||
debug!(
|
||||
"Updated cache-aware policy for model {} ({} remaining workers)",
|
||||
model_id,
|
||||
remaining_workers.len()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Log final result at info level
|
||||
if worker_urls.len() == 1 {
|
||||
info!("Removed worker {}", worker_urls[0]);
|
||||
} else {
|
||||
info!(
|
||||
"Removed {} DP-aware workers: {:?}",
|
||||
worker_urls.len(),
|
||||
worker_urls
|
||||
);
|
||||
}
|
||||
|
||||
Ok(StepResult::Success)
|
||||
}
|
||||
|
||||
fn is_retryable(&self, _error: &WorkflowError) -> bool {
|
||||
false // Policy update is not retryable
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a worker removal workflow definition
|
||||
pub fn create_worker_removal_workflow() -> WorkflowDefinition {
|
||||
use std::time::Duration;
|
||||
|
||||
WorkflowDefinition::new("worker_removal", "Remove worker from router")
|
||||
.add_step(
|
||||
StepDefinition::new(
|
||||
"find_workers_to_remove",
|
||||
"Find workers to remove",
|
||||
Arc::new(FindWorkersToRemoveStep),
|
||||
)
|
||||
.with_timeout(Duration::from_secs(10))
|
||||
.with_retry(RetryPolicy {
|
||||
max_attempts: 1,
|
||||
backoff: BackoffStrategy::Fixed(Duration::from_secs(0)),
|
||||
}),
|
||||
)
|
||||
.add_step(
|
||||
StepDefinition::new(
|
||||
"remove_from_policy_registry",
|
||||
"Remove workers from policy registry",
|
||||
Arc::new(RemoveFromPolicyRegistryStep),
|
||||
)
|
||||
.with_timeout(Duration::from_secs(10))
|
||||
.with_retry(RetryPolicy {
|
||||
max_attempts: 1,
|
||||
backoff: BackoffStrategy::Fixed(Duration::from_secs(0)),
|
||||
}),
|
||||
)
|
||||
.add_step(
|
||||
StepDefinition::new(
|
||||
"remove_from_worker_registry",
|
||||
"Remove workers from worker registry",
|
||||
Arc::new(RemoveFromWorkerRegistryStep),
|
||||
)
|
||||
.with_timeout(Duration::from_secs(10))
|
||||
.with_retry(RetryPolicy {
|
||||
max_attempts: 1,
|
||||
backoff: BackoffStrategy::Fixed(Duration::from_secs(0)),
|
||||
}),
|
||||
)
|
||||
.add_step(
|
||||
StepDefinition::new(
|
||||
"update_remaining_policies",
|
||||
"Update cache-aware policies for remaining workers",
|
||||
Arc::new(UpdateRemainingPoliciesStep),
|
||||
)
|
||||
.with_timeout(Duration::from_secs(10))
|
||||
.with_retry(RetryPolicy {
|
||||
max_attempts: 1,
|
||||
backoff: BackoffStrategy::Fixed(Duration::from_secs(0)),
|
||||
}),
|
||||
)
|
||||
}
|
||||
Reference in New Issue
Block a user