Add warnings and remove dependency for deterministic inference (#10724)

Co-authored-by: Yineng Zhang <me@zhyncs.com>
2025-09-22 10:56:02 -07:00
parent 592caab66a
commit aa1c5cf5bd
1 changed file with 21 additions and 14 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -981,29 +981,36 @@ class ServerArgs:
    def _handle_deterministic_inference(self):
        if self.enable_deterministic_inference:
-            import importlib
+            # Check sampling backend
            if not importlib.util.find_spec("batch_invariant_ops"):
                raise ValueError(
                    "batch_invariant_ops is not installed. Please install it from https://github.com/thinking-machines-lab/batch_invariant_ops/."
                )
            # Check some settings
            self.sampling_backend = "pytorch"
            logger.warning(
                "Sampling backend is set to pytorch for deterministic inference."
            )
-            # Currently, only FA3 supports radix cache. Support for other backends is in progress
+
-            if self.attention_backend != "fa3":
+            # Check attention backend
                self.disable_radix_cache = True
                logger.warning(
                    "Currently radix cache is disabled for deterministic inference. It will be supported in the future."
                )
            if self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
                raise ValueError(
                    f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference."
                )
            # Currently, only FA3 supports radix cache. Support for other backends is in progress
            if self.attention_backend != "fa3":
                self.disable_radix_cache = True
                logger.warning(
                    f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
                )
            # Check TP size
            if self.tp_size > 1:
                raise ValueError(
                    "Currently only TP size 1 is supported for deterministic inference."
                )
            # Warnings on MoE models
            logger.warning(
                "Currently deterministic inference is only tested on dense models. Please be cautious when using it on MoE models."
            )
    def _handle_other_validations(self):
        pass