Replace prob-based with threshold-based load balancing (#2170)

This commit is contained in:
Byron Hsu
2024-11-24 23:17:11 -08:00
committed by GitHub
parent 8e1adb8441
commit 4b0a1c9365
7 changed files with 223 additions and 151 deletions

View File

@@ -18,7 +18,8 @@ struct Router {
worker_urls: Vec<String>,
policy: PolicyType,
cache_threshold: f32,
cache_routing_prob: f32,
balance_abs_threshold: usize,
balance_rel_threshold: f32,
eviction_interval_secs: u64,
max_tree_size: usize,
}
@@ -32,7 +33,8 @@ impl Router {
host = String::from("127.0.0.1"),
port = 3001,
cache_threshold = 0.50,
cache_routing_prob = 1.0,
balance_abs_threshold = 32,
balance_rel_threshold = 1.0001,
eviction_interval_secs = 60,
max_tree_size = 2usize.pow(24)
))]
@@ -42,7 +44,8 @@ impl Router {
host: String,
port: u16,
cache_threshold: f32,
cache_routing_prob: f32,
balance_abs_threshold: usize,
balance_rel_threshold: f32,
eviction_interval_secs: u64,
max_tree_size: usize,
) -> PyResult<Self> {
@@ -52,7 +55,8 @@ impl Router {
worker_urls,
policy,
cache_threshold,
cache_routing_prob,
balance_abs_threshold,
balance_rel_threshold,
eviction_interval_secs,
max_tree_size,
})
@@ -68,7 +72,8 @@ impl Router {
PolicyType::RoundRobin => router::PolicyConfig::RoundRobinConfig,
PolicyType::CacheAware => router::PolicyConfig::CacheAwareConfig {
cache_threshold: self.cache_threshold,
cache_routing_prob: self.cache_routing_prob,
balance_abs_threshold: self.balance_abs_threshold,
balance_rel_threshold: self.balance_rel_threshold,
eviction_interval_secs: self.eviction_interval_secs,
max_tree_size: self.max_tree_size,
},

View File

@@ -1,4 +1,3 @@
// src/main.rs
use clap::Parser;
use clap::ValueEnum;
@@ -42,7 +41,7 @@ struct Args {
help = "Load balancing policy to use for request distribution:\n\
- random: Randomly select workers\n\
- round_robin: Distribute requests in round-robin fashion\n\
- cache_aware: Distribute requests in cache-aware fashion\n"
- cache_aware: Distribute requests based on cache state and load balance\n"
)]
policy: PolicyType,
@@ -57,12 +56,21 @@ struct Args {
#[arg(
long,
default_value_t = 1.0,
default_value_t = 32,
requires = "policy",
required_if_eq("policy", "cache_aware"),
help = "Probability of using cache-aware routing (0.0-1.0). Default 1.0 for full cache-aware routing, suitable for perfectly divided prefix workloads. For uneven workloads, use a lower value to better distribute requests"
help = "Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold. Otherwise, use cache aware. Default: 32"
)]
cache_routing_prob: f32,
balance_abs_threshold: usize,
#[arg(
long,
default_value_t = 1.0001,
requires = "policy",
required_if_eq("policy", "cache_aware"),
help = "Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold. Otherwise, use cache aware. Default: 1.0001"
)]
balance_rel_threshold: f32,
#[arg(
long,
@@ -90,7 +98,8 @@ impl Args {
PolicyType::RoundRobin => PolicyConfig::RoundRobinConfig,
PolicyType::CacheAware => PolicyConfig::CacheAwareConfig {
cache_threshold: self.cache_threshold,
cache_routing_prob: self.cache_routing_prob,
balance_abs_threshold: self.balance_abs_threshold,
balance_rel_threshold: self.balance_rel_threshold,
eviction_interval_secs: self.eviction_interval_secs,
max_tree_size: self.max_tree_size,
},

View File

@@ -23,65 +23,73 @@ pub enum Router {
},
CacheAware {
/*
Cache-Aware Load Balancing Router
Cache-Aware Load Balancing Router
This router combines two strategies to optimize both cache utilization and request distribution:
This router combines two strategies to optimize both cache utilization and request distribution:
1. Cache-Aware Routing (Approximate Tree)
2. Load Balancing (Shortest Queue)
1. Cache-Aware Routing (Approximate Tree)
2. Load Balancing (Shortest Queue with Balance Thresholds)
For each incoming request, the router chooses between these strategies:
- With probability P: Uses cache-aware routing
- With probability (1-P): Uses load balancing
where P is configured via `cache_routing_prob`
The router dynamically switches between these strategies based on load conditions:
- Uses load balancing when the system is imbalanced
- Uses cache-aware routing when the system is balanced
Strategy Details:
A system is considered imbalanced if both conditions are met:
1. (max - min) > abs_threshold
2. max > rel_threshold * min
1. Cache-Aware Routing (Approximate Tree)
-------------------------------------------
This strategy maintains an approximate radix tree for each worker based on request history,
eliminating the need for direct cache state queries. The tree stores raw text characters
instead of token IDs to avoid tokenization overhead.
Strategy Details:
Process:
a. For each request, find the worker with the highest prefix match
b. If match rate > cache_threshold:
Route to the worker with highest match (likely has relevant data cached)
c. If match rate ≤ cache_threshold:
Route to the worker with smallest tree size (most available cache capacity)
d. Background maintenance:
Periodically evict least recently used leaf nodes to prevent memory overflow
1. Cache-Aware Routing (Approximate Tree)
-------------------------------------------
This strategy maintains an approximate radix tree for each worker based on request history,
eliminating the need for direct cache state queries. The tree stores raw text characters
instead of token IDs to avoid tokenization overhead.
2. Load Balancing (Shortest Queue)
-------------------------------------------
This strategy tracks pending request counts per worker and routes new requests
to the least busy worker for optimal load distribution.
Process:
a. For each request, find the worker with the highest prefix match
b. If match rate > cache_threshold:
Route to the worker with highest match (likely has relevant data cached)
c. If match rate ≤ cache_threshold:
Route to the worker with smallest tree size (most available cache capacity)
d. Background maintenance:
Periodically evict least recently used leaf nodes to prevent memory overflow
Configuration Parameters:
------------------------
1. cache_routing_prob: (float, 0.0 to 1.0)
- 0.0: Exclusively use load balancing
- 1.0: Exclusively use cache-aware routing
- Between 0-1: Probability of using cache-aware routing vs load balancing
2. Load Balancing (Shortest Queue)
-------------------------------------------
This strategy tracks pending request counts per worker and routes new requests
to the least busy worker when the system is detected to be imbalanced.
2. cache_threshold: (float, 0.0 to 1.0)
Minimum prefix match ratio to use highest-match routing.
Below this threshold, routes to worker with most available cache space.
Configuration Parameters:
------------------------
1. cache_threshold: (float, 0.0 to 1.0)
Minimum prefix match ratio to use highest-match routing.
Below this threshold, routes to worker with most available cache space.
3. eviction_interval_secs: (integer)
Interval between LRU eviction cycles for the approximate trees.
2. balance_abs_threshold: (integer)
Absolute difference threshold for load imbalance detection.
System is potentially imbalanced if (max_load - min_load) > abs_threshold
4. max_tree_size: (integer)
Maximum nodes per tree. When exceeded, LRU leaf nodes are evicted
during the next eviction cycle.
3. balance_rel_threshold: (float)
Relative ratio threshold for load imbalance detection.
System is potentially imbalanced if max_load > min_load * rel_threshold
Used in conjunction with abs_threshold to determine final imbalance state.
4. eviction_interval_secs: (integer)
Interval between LRU eviction cycles for the approximate trees.
5. max_tree_size: (integer)
Maximum nodes per tree. When exceeded, LRU leaf nodes are evicted
during the next eviction cycle.
*/
worker_urls: Vec<String>,
tree: Arc<Mutex<Tree>>,
running_queue: Arc<Mutex<HashMap<String, usize>>>,
processed_queue: Arc<Mutex<HashMap<String, usize>>>,
cache_threshold: f32,
cache_routing_prob: f32,
_eviction_thread: Option<thread::JoinHandle<()>>, // Store thread handle
balance_abs_threshold: usize,
balance_rel_threshold: f32,
_eviction_thread: Option<thread::JoinHandle<()>>,
},
}
@@ -91,7 +99,8 @@ pub enum PolicyConfig {
RoundRobinConfig,
CacheAwareConfig {
cache_threshold: f32,
cache_routing_prob: f32,
balance_abs_threshold: usize,
balance_rel_threshold: f32,
eviction_interval_secs: u64,
max_tree_size: usize,
},
@@ -128,7 +137,8 @@ impl Router {
},
PolicyConfig::CacheAwareConfig {
cache_threshold,
cache_routing_prob,
balance_abs_threshold,
balance_rel_threshold,
eviction_interval_secs,
max_tree_size,
} => {
@@ -149,6 +159,7 @@ impl Router {
// Create background eviction thread
let tree_clone = Arc::clone(&tree);
let processed_queue_clone = Arc::clone(&processed_queue);
let running_queue_clone = Arc::clone(&running_queue);
let eviction_thread = thread::spawn(move || {
loop {
// Sleep for the specified interval
@@ -161,6 +172,10 @@ impl Router {
// Print the process queue
let locked_processed_queue = processed_queue_clone.lock().unwrap();
println!("Processed Queue: {:?}", locked_processed_queue);
// Print the running queue
let locked_running_queue = running_queue_clone.lock().unwrap();
println!("Running Queue: {:?}", locked_running_queue);
}
});
@@ -174,7 +189,8 @@ impl Router {
running_queue,
processed_queue,
cache_threshold,
cache_routing_prob,
balance_abs_threshold,
balance_rel_threshold,
_eviction_thread: Some(eviction_thread),
}
}
@@ -203,8 +219,6 @@ impl Router {
route: &str,
) -> HttpResponse {
let text = get_text_from_request(&body, route);
// For Debug
// println!("text: {:?}, route: {:?}", text, route);
let worker_url = match self {
Router::RoundRobin {
@@ -218,7 +232,6 @@ impl Router {
|x| Some((x + 1) % worker_urls.len()),
)
.unwrap();
worker_urls[idx].clone()
}
@@ -232,19 +245,42 @@ impl Router {
running_queue,
processed_queue,
cache_threshold,
cache_routing_prob,
balance_abs_threshold,
balance_rel_threshold,
..
} => {
// even though the tree is thread-safe, we still put a lock to ensure the whole op (tree read + queue read + tree write + queue write) is atomic to handle some edge cases (e.g. multiple requests with long prefix entering at the same time)
// TODO: delay scheduling if cache hit rate is high because it may cause imbalance. prioritize low hit rate ones
let mut tree = tree.lock().unwrap();
let mut running_queue = running_queue.lock().unwrap();
// Generate a random float between 0 and 1 for probability check
let sampled_p: f32 = rand::random();
// Get current load statistics
let max_load = *running_queue.values().max().unwrap_or(&0);
let min_load = *running_queue.values().min().unwrap_or(&0);
let selected_url = if sampled_p < *cache_routing_prob {
// Cache-aware routing logic
// Load is considered imbalanced if:
// 1. (max - min) > abs_threshold AND
// 2. max > rel_threshold * min
let is_imbalanced = max_load.saturating_sub(min_load) > *balance_abs_threshold
&& (max_load as f32) > (min_load as f32 * balance_rel_threshold);
let selected_url = if is_imbalanced {
// Log load balancing trigger and current queue state
println!(
"Load balancing triggered due to workload imbalance:\n\
Max load: {}, Min load: {}\n\
Current running queue: {:?}",
max_load, min_load, running_queue
);
// Use shortest queue routing when load is imbalanced
running_queue
.iter()
.min_by_key(|(_url, &count)| count)
.map(|(url, _)| url.clone())
.unwrap_or_else(|| worker_urls[0].clone())
} else {
// Use cache-aware routing when load is balanced
let (matched_text, matched_worker) = tree.prefix_match(&text);
let matched_rate =
matched_text.chars().count() as f32 / text.chars().count() as f32;
@@ -252,36 +288,18 @@ impl Router {
if matched_rate > *cache_threshold {
matched_worker.to_string()
} else {
// For Debug
// let m_map: HashMap<String, usize> = tree
// .tenant_char_count
// .iter()
// .map(|entry| (entry.key().clone(), *entry.value()))
// .collect();
// println!("map: {:?}, mmap: {:?}", tree.get_tenant_char_count(), m_map);
tree.get_smallest_tenant()
}
} else {
// Shortest queue routing logic
running_queue
.iter()
.min_by_key(|(_url, &count)| count)
.map(|(url, _)| url.clone())
.unwrap_or_else(|| worker_urls[0].clone())
};
// Update running queue
let count = running_queue.get_mut(&selected_url).unwrap();
*count += 1;
// Update queues and tree
*running_queue.get_mut(&selected_url).unwrap() += 1;
// Update processed queue
let mut locked_processed_queue = processed_queue.lock().unwrap();
let count = locked_processed_queue.get_mut(&selected_url).unwrap();
*count += 1;
// Update tree with the new request
*processed_queue
.lock()
.unwrap()
.get_mut(&selected_url)
.unwrap() += 1;
tree.insert(&text, &selected_url);
selected_url