[1/2] Support deterministic inference with flashinfer attention backend (#10645)

Co-authored-by: hebiao064 <hebiaobuaa@gmail.com>
Co-authored-by: Qiaolin-Yu <liin1211@outlook.com>
commit 8ecef73f12
parent 1d1ce62495
Author: Baizhou Zhang
Date:   2025-09-19 23:34:29 -07:00
Committed by: GitHub

10 changed files with 427 additions and 6 deletions


@@ -406,6 +406,12 @@ class ModelRunner:
             )
             self.init_double_sparsity_channel_config(server_args.ds_heavy_channel_type)
 
+        # Enable batch invariant mode
+        if server_args.enable_deterministic_inference:
+            from batch_invariant_ops import enable_batch_invariant_mode
+
+            enable_batch_invariant_mode()
+
         # Init memory pool and attention backends
         self.init_memory_pool(
             min_per_gpu_memory,
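
For context, the batch-size-dependent non-determinism that batch invariant mode eliminates can be reproduced in a few lines of PyTorch. The sketch below is not part of this commit and assumes a CUDA device; it shows how the same input row can yield bitwise-different results depending on the batch it is computed in, because the reduction order inside standard kernels varies with batch shape:

    # Sketch: demonstrating batch-variance of default CUDA kernels.
    # Not part of the commit; requires a CUDA-capable GPU.
    import torch

    torch.manual_seed(0)
    a = torch.randn(2048, 2048, device="cuda", dtype=torch.bfloat16)
    b = torch.randn(2048, 2048, device="cuda", dtype=torch.bfloat16)

    out_full = a @ b        # row 0 computed as part of the full batch
    out_single = a[:1] @ b  # the same row 0 computed on its own

    # With default kernels this maximum difference is typically nonzero,
    # since the floating-point reduction order depends on batch shape.
    print((out_full[0] - out_single[0]).abs().max())

With this change, the mode is toggled through server_args.enable_deterministic_inference, and enable_batch_invariant_mode() is called before the memory pool and attention backends are initialized, so batch-invariant kernels are in place for all subsequent model execution.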