Replace probability-based with threshold-based load balancing (#2170)
This commit is contained in:
@@ -18,7 +18,8 @@ struct Router {
|
||||
worker_urls: Vec<String>,
|
||||
policy: PolicyType,
|
||||
cache_threshold: f32,
|
||||
cache_routing_prob: f32,
|
||||
balance_abs_threshold: usize,
|
||||
balance_rel_threshold: f32,
|
||||
eviction_interval_secs: u64,
|
||||
max_tree_size: usize,
|
||||
}
|
||||
@@ -32,7 +33,8 @@ impl Router {
|
||||
host = String::from("127.0.0.1"),
|
||||
port = 3001,
|
||||
cache_threshold = 0.50,
|
||||
cache_routing_prob = 1.0,
|
||||
balance_abs_threshold = 32,
|
||||
balance_rel_threshold = 1.0001,
|
||||
eviction_interval_secs = 60,
|
||||
max_tree_size = 2usize.pow(24)
|
||||
))]
|
||||
@@ -42,7 +44,8 @@ impl Router {
|
||||
host: String,
|
||||
port: u16,
|
||||
cache_threshold: f32,
|
||||
cache_routing_prob: f32,
|
||||
balance_abs_threshold: usize,
|
||||
balance_rel_threshold: f32,
|
||||
eviction_interval_secs: u64,
|
||||
max_tree_size: usize,
|
||||
) -> PyResult<Self> {
|
||||
@@ -52,7 +55,8 @@ impl Router {
|
||||
worker_urls,
|
||||
policy,
|
||||
cache_threshold,
|
||||
cache_routing_prob,
|
||||
balance_abs_threshold,
|
||||
balance_rel_threshold,
|
||||
eviction_interval_secs,
|
||||
max_tree_size,
|
||||
})
|
||||
@@ -68,7 +72,8 @@ impl Router {
|
||||
PolicyType::RoundRobin => router::PolicyConfig::RoundRobinConfig,
|
||||
PolicyType::CacheAware => router::PolicyConfig::CacheAwareConfig {
|
||||
cache_threshold: self.cache_threshold,
|
||||
cache_routing_prob: self.cache_routing_prob,
|
||||
balance_abs_threshold: self.balance_abs_threshold,
|
||||
balance_rel_threshold: self.balance_rel_threshold,
|
||||
eviction_interval_secs: self.eviction_interval_secs,
|
||||
max_tree_size: self.max_tree_size,
|
||||
},
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
// src/main.rs
|
||||
use clap::Parser;
|
||||
use clap::ValueEnum;
|
||||
|
||||
@@ -42,7 +41,7 @@ struct Args {
|
||||
help = "Load balancing policy to use for request distribution:\n\
|
||||
- random: Randomly select workers\n\
|
||||
- round_robin: Distribute requests in round-robin fashion\n\
|
||||
- cache_aware: Distribute requests in cache-aware fashion\n"
|
||||
- cache_aware: Distribute requests based on cache state and load balance\n"
|
||||
)]
|
||||
policy: PolicyType,
|
||||
|
||||
@@ -57,12 +56,21 @@ struct Args {
|
||||
|
||||
#[arg(
|
||||
long,
|
||||
default_value_t = 1.0,
|
||||
default_value_t = 32,
|
||||
requires = "policy",
|
||||
required_if_eq("policy", "cache_aware"),
|
||||
help = "Probability of using cache-aware routing (0.0-1.0). Default 1.0 for full cache-aware routing, suitable for perfectly divided prefix workloads. For uneven workloads, use a lower value to better distribute requests"
|
||||
help = "Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold. Otherwise, use cache aware. Default: 32"
|
||||
)]
|
||||
cache_routing_prob: f32,
|
||||
balance_abs_threshold: usize,
|
||||
|
||||
#[arg(
|
||||
long,
|
||||
default_value_t = 1.0001,
|
||||
requires = "policy",
|
||||
required_if_eq("policy", "cache_aware"),
|
||||
help = "Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold. Otherwise, use cache aware. Default: 1.0001"
|
||||
)]
|
||||
balance_rel_threshold: f32,
|
||||
|
||||
#[arg(
|
||||
long,
|
||||
@@ -90,7 +98,8 @@ impl Args {
|
||||
PolicyType::RoundRobin => PolicyConfig::RoundRobinConfig,
|
||||
PolicyType::CacheAware => PolicyConfig::CacheAwareConfig {
|
||||
cache_threshold: self.cache_threshold,
|
||||
cache_routing_prob: self.cache_routing_prob,
|
||||
balance_abs_threshold: self.balance_abs_threshold,
|
||||
balance_rel_threshold: self.balance_rel_threshold,
|
||||
eviction_interval_secs: self.eviction_interval_secs,
|
||||
max_tree_size: self.max_tree_size,
|
||||
},
|
||||
|
||||
@@ -23,65 +23,73 @@ pub enum Router {
|
||||
},
|
||||
CacheAware {
|
||||
/*
|
||||
Cache-Aware Load Balancing Router
|
||||
Cache-Aware Load Balancing Router
|
||||
|
||||
This router combines two strategies to optimize both cache utilization and request distribution:
|
||||
This router combines two strategies to optimize both cache utilization and request distribution:
|
||||
|
||||
1. Cache-Aware Routing (Approximate Tree)
|
||||
2. Load Balancing (Shortest Queue)
|
||||
1. Cache-Aware Routing (Approximate Tree)
|
||||
2. Load Balancing (Shortest Queue with Balance Thresholds)
|
||||
|
||||
For each incoming request, the router chooses between these strategies:
|
||||
- With probability P: Uses cache-aware routing
|
||||
- With probability (1-P): Uses load balancing
|
||||
where P is configured via `cache_routing_prob`
|
||||
The router dynamically switches between these strategies based on load conditions:
|
||||
- Uses load balancing when the system is imbalanced
|
||||
- Uses cache-aware routing when the system is balanced
|
||||
|
||||
Strategy Details:
|
||||
A system is considered imbalanced if both conditions are met:
|
||||
1. (max - min) > abs_threshold
|
||||
2. max > rel_threshold * min
|
||||
|
||||
1. Cache-Aware Routing (Approximate Tree)
|
||||
-------------------------------------------
|
||||
This strategy maintains an approximate radix tree for each worker based on request history,
|
||||
eliminating the need for direct cache state queries. The tree stores raw text characters
|
||||
instead of token IDs to avoid tokenization overhead.
|
||||
Strategy Details:
|
||||
|
||||
Process:
|
||||
a. For each request, find the worker with the highest prefix match
|
||||
b. If match rate > cache_threshold:
|
||||
Route to the worker with highest match (likely has relevant data cached)
|
||||
c. If match rate ≤ cache_threshold:
|
||||
Route to the worker with smallest tree size (most available cache capacity)
|
||||
d. Background maintenance:
|
||||
Periodically evict least recently used leaf nodes to prevent memory overflow
|
||||
1. Cache-Aware Routing (Approximate Tree)
|
||||
-------------------------------------------
|
||||
This strategy maintains an approximate radix tree for each worker based on request history,
|
||||
eliminating the need for direct cache state queries. The tree stores raw text characters
|
||||
instead of token IDs to avoid tokenization overhead.
|
||||
|
||||
2. Load Balancing (Shortest Queue)
|
||||
-------------------------------------------
|
||||
This strategy tracks pending request counts per worker and routes new requests
|
||||
to the least busy worker for optimal load distribution.
|
||||
Process:
|
||||
a. For each request, find the worker with the highest prefix match
|
||||
b. If match rate > cache_threshold:
|
||||
Route to the worker with highest match (likely has relevant data cached)
|
||||
c. If match rate ≤ cache_threshold:
|
||||
Route to the worker with smallest tree size (most available cache capacity)
|
||||
d. Background maintenance:
|
||||
Periodically evict least recently used leaf nodes to prevent memory overflow
|
||||
|
||||
Configuration Parameters:
|
||||
------------------------
|
||||
1. cache_routing_prob: (float, 0.0 to 1.0)
|
||||
- 0.0: Exclusively use load balancing
|
||||
- 1.0: Exclusively use cache-aware routing
|
||||
- Between 0-1: Probability of using cache-aware routing vs load balancing
|
||||
2. Load Balancing (Shortest Queue)
|
||||
-------------------------------------------
|
||||
This strategy tracks pending request counts per worker and routes new requests
|
||||
to the least busy worker when the system is detected to be imbalanced.
|
||||
|
||||
2. cache_threshold: (float, 0.0 to 1.0)
|
||||
Minimum prefix match ratio to use highest-match routing.
|
||||
Below this threshold, routes to worker with most available cache space.
|
||||
Configuration Parameters:
|
||||
------------------------
|
||||
1. cache_threshold: (float, 0.0 to 1.0)
|
||||
Minimum prefix match ratio to use highest-match routing.
|
||||
At or below this threshold, routes to the worker with the most available cache space.
|
||||
|
||||
3. eviction_interval_secs: (integer)
|
||||
Interval between LRU eviction cycles for the approximate trees.
|
||||
2. balance_abs_threshold: (integer)
|
||||
Absolute difference threshold for load imbalance detection.
|
||||
System is potentially imbalanced if (max_load - min_load) > abs_threshold
|
||||
|
||||
4. max_tree_size: (integer)
|
||||
Maximum nodes per tree. When exceeded, LRU leaf nodes are evicted
|
||||
during the next eviction cycle.
|
||||
3. balance_rel_threshold: (float)
|
||||
Relative ratio threshold for load imbalance detection.
|
||||
System is potentially imbalanced if max_load > min_load * rel_threshold
|
||||
Used in conjunction with abs_threshold to determine final imbalance state.
|
||||
|
||||
4. eviction_interval_secs: (integer)
|
||||
Interval between LRU eviction cycles for the approximate trees.
|
||||
|
||||
5. max_tree_size: (integer)
|
||||
Maximum nodes per tree. When exceeded, LRU leaf nodes are evicted
|
||||
during the next eviction cycle.
|
||||
*/
|
||||
worker_urls: Vec<String>,
|
||||
tree: Arc<Mutex<Tree>>,
|
||||
running_queue: Arc<Mutex<HashMap<String, usize>>>,
|
||||
processed_queue: Arc<Mutex<HashMap<String, usize>>>,
|
||||
cache_threshold: f32,
|
||||
cache_routing_prob: f32,
|
||||
_eviction_thread: Option<thread::JoinHandle<()>>, // Store thread handle
|
||||
balance_abs_threshold: usize,
|
||||
balance_rel_threshold: f32,
|
||||
_eviction_thread: Option<thread::JoinHandle<()>>,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -91,7 +99,8 @@ pub enum PolicyConfig {
|
||||
RoundRobinConfig,
|
||||
CacheAwareConfig {
|
||||
cache_threshold: f32,
|
||||
cache_routing_prob: f32,
|
||||
balance_abs_threshold: usize,
|
||||
balance_rel_threshold: f32,
|
||||
eviction_interval_secs: u64,
|
||||
max_tree_size: usize,
|
||||
},
|
||||
@@ -128,7 +137,8 @@ impl Router {
|
||||
},
|
||||
PolicyConfig::CacheAwareConfig {
|
||||
cache_threshold,
|
||||
cache_routing_prob,
|
||||
balance_abs_threshold,
|
||||
balance_rel_threshold,
|
||||
eviction_interval_secs,
|
||||
max_tree_size,
|
||||
} => {
|
||||
@@ -149,6 +159,7 @@ impl Router {
|
||||
// Create background eviction thread
|
||||
let tree_clone = Arc::clone(&tree);
|
||||
let processed_queue_clone = Arc::clone(&processed_queue);
|
||||
let running_queue_clone = Arc::clone(&running_queue);
|
||||
let eviction_thread = thread::spawn(move || {
|
||||
loop {
|
||||
// Sleep for the specified interval
|
||||
@@ -161,6 +172,10 @@ impl Router {
|
||||
// Print the process queue
|
||||
let locked_processed_queue = processed_queue_clone.lock().unwrap();
|
||||
println!("Processed Queue: {:?}", locked_processed_queue);
|
||||
|
||||
// Print the running queue
|
||||
let locked_running_queue = running_queue_clone.lock().unwrap();
|
||||
println!("Running Queue: {:?}", locked_running_queue);
|
||||
}
|
||||
});
|
||||
|
||||
@@ -174,7 +189,8 @@ impl Router {
|
||||
running_queue,
|
||||
processed_queue,
|
||||
cache_threshold,
|
||||
cache_routing_prob,
|
||||
balance_abs_threshold,
|
||||
balance_rel_threshold,
|
||||
_eviction_thread: Some(eviction_thread),
|
||||
}
|
||||
}
|
||||
@@ -203,8 +219,6 @@ impl Router {
|
||||
route: &str,
|
||||
) -> HttpResponse {
|
||||
let text = get_text_from_request(&body, route);
|
||||
// For Debug
|
||||
// println!("text: {:?}, route: {:?}", text, route);
|
||||
|
||||
let worker_url = match self {
|
||||
Router::RoundRobin {
|
||||
@@ -218,7 +232,6 @@ impl Router {
|
||||
|x| Some((x + 1) % worker_urls.len()),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
worker_urls[idx].clone()
|
||||
}
|
||||
|
||||
@@ -232,19 +245,42 @@ impl Router {
|
||||
running_queue,
|
||||
processed_queue,
|
||||
cache_threshold,
|
||||
cache_routing_prob,
|
||||
balance_abs_threshold,
|
||||
balance_rel_threshold,
|
||||
..
|
||||
} => {
|
||||
// Even though the tree is thread-safe, we still take a lock so that the whole operation (tree read + queue read + tree write + queue write) is atomic; this handles edge cases such as multiple requests with a long prefix arriving at the same time.
|
||||
// TODO: delay scheduling if the cache hit rate is high, because it may cause imbalance; prioritize low-hit-rate ones.
|
||||
|
||||
let mut tree = tree.lock().unwrap();
|
||||
let mut running_queue = running_queue.lock().unwrap();
|
||||
|
||||
// Generate a random float between 0 and 1 for probability check
|
||||
let sampled_p: f32 = rand::random();
|
||||
// Get current load statistics
|
||||
let max_load = *running_queue.values().max().unwrap_or(&0);
|
||||
let min_load = *running_queue.values().min().unwrap_or(&0);
|
||||
|
||||
let selected_url = if sampled_p < *cache_routing_prob {
|
||||
// Cache-aware routing logic
|
||||
// Load is considered imbalanced if:
|
||||
// 1. (max - min) > abs_threshold AND
|
||||
// 2. max > rel_threshold * min
|
||||
let is_imbalanced = max_load.saturating_sub(min_load) > *balance_abs_threshold
|
||||
&& (max_load as f32) > (min_load as f32 * balance_rel_threshold);
|
||||
|
||||
let selected_url = if is_imbalanced {
|
||||
// Log load balancing trigger and current queue state
|
||||
println!(
|
||||
"Load balancing triggered due to workload imbalance:\n\
|
||||
Max load: {}, Min load: {}\n\
|
||||
Current running queue: {:?}",
|
||||
max_load, min_load, running_queue
|
||||
);
|
||||
|
||||
// Use shortest queue routing when load is imbalanced
|
||||
running_queue
|
||||
.iter()
|
||||
.min_by_key(|(_url, &count)| count)
|
||||
.map(|(url, _)| url.clone())
|
||||
.unwrap_or_else(|| worker_urls[0].clone())
|
||||
} else {
|
||||
// Use cache-aware routing when load is balanced
|
||||
let (matched_text, matched_worker) = tree.prefix_match(&text);
|
||||
let matched_rate =
|
||||
matched_text.chars().count() as f32 / text.chars().count() as f32;
|
||||
@@ -252,36 +288,18 @@ impl Router {
|
||||
if matched_rate > *cache_threshold {
|
||||
matched_worker.to_string()
|
||||
} else {
|
||||
// For Debug
|
||||
// let m_map: HashMap<String, usize> = tree
|
||||
// .tenant_char_count
|
||||
// .iter()
|
||||
// .map(|entry| (entry.key().clone(), *entry.value()))
|
||||
// .collect();
|
||||
|
||||
// println!("map: {:?}, mmap: {:?}", tree.get_tenant_char_count(), m_map);
|
||||
|
||||
tree.get_smallest_tenant()
|
||||
}
|
||||
} else {
|
||||
// Shortest queue routing logic
|
||||
running_queue
|
||||
.iter()
|
||||
.min_by_key(|(_url, &count)| count)
|
||||
.map(|(url, _)| url.clone())
|
||||
.unwrap_or_else(|| worker_urls[0].clone())
|
||||
};
|
||||
|
||||
// Update running queue
|
||||
let count = running_queue.get_mut(&selected_url).unwrap();
|
||||
*count += 1;
|
||||
// Update queues and tree
|
||||
*running_queue.get_mut(&selected_url).unwrap() += 1;
|
||||
|
||||
// Update processed queue
|
||||
let mut locked_processed_queue = processed_queue.lock().unwrap();
|
||||
let count = locked_processed_queue.get_mut(&selected_url).unwrap();
|
||||
*count += 1;
|
||||
|
||||
// Update tree with the new request
|
||||
*processed_queue
|
||||
.lock()
|
||||
.unwrap()
|
||||
.get_mut(&selected_url)
|
||||
.unwrap() += 1;
|
||||
tree.insert(&text, &selected_url);
|
||||
|
||||
selected_url
|
||||
|
||||
Reference in New Issue
Block a user