Add warnings and remove dependency for deterministic inference (#10724)
Co-authored-by: Yineng Zhang <me@zhyncs.com>
This commit is contained in:
@@ -981,29 +981,36 @@ class ServerArgs:
 
     def _handle_deterministic_inference(self):
         if self.enable_deterministic_inference:
-            import importlib
-
-            if not importlib.util.find_spec("batch_invariant_ops"):
-                raise ValueError(
-                    "batch_invariant_ops is not installed. Please install it from https://github.com/thinking-machines-lab/batch_invariant_ops/."
-                )
-
-            # Check some settings
+            # Check sampling backend
             self.sampling_backend = "pytorch"
             logger.warning(
                 "Sampling backend is set to pytorch for deterministic inference."
             )
-            # Currently, only FA3 supports radix cache. Support for other backends is in progress
-            if self.attention_backend != "fa3":
-                self.disable_radix_cache = True
-                logger.warning(
-                    "Currently radix cache is disabled for deterministic inference. It will be supported in the future."
-                )
+
+            # Check attention backend
             if self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
                 raise ValueError(
                     f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference."
                 )
 
+            # Currently, only FA3 supports radix cache. Support for other backends is in progress
+            if self.attention_backend != "fa3":
+                self.disable_radix_cache = True
+                logger.warning(
+                    f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
+                )
+
+            # Check TP size
+            if self.tp_size > 1:
+                raise ValueError(
+                    "Currently only TP size 1 is supported for deterministic inference."
+                )
+
+            # Warnings on MoE models
+            logger.warning(
+                "Currently deterministic inference is only tested on dense models. Please be cautious when using it on MoE models."
+            )
+
     def _handle_other_validations(self):
         pass
 
Reference in New Issue
Block a user