[1/2] Support deterministic inference with flashinfer attention backend (#10645)
Co-authored-by: hebiao064 <hebiaobuaa@gmail.com>
Co-authored-by: Qiaolin-Yu <liin1211@outlook.com>
@@ -406,6 +406,12 @@ class ModelRunner:
             )
             self.init_double_sparsity_channel_config(server_args.ds_heavy_channel_type)
 
+        # Enable batch invariant mode
+        if server_args.enable_deterministic_inference:
+            from batch_invariant_ops import enable_batch_invariant_mode
+
+            enable_batch_invariant_mode()
+
         # Init memory pool and attention backends
         self.init_memory_pool(
             min_per_gpu_memory,
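For context, a minimal sketch (not part of this commit) of what batch invariant mode provides: once enable_batch_invariant_mode() is active, an output row computed inside a large batch should match the same row computed in a batch of one, bit for bit. Only enable_batch_invariant_mode comes from the diff above; the tensor shapes, dtype, and CUDA device below are illustrative assumptions.

# Illustrative sketch only: assumes torch and the batch_invariant_ops
# package referenced in the diff are installed, and a CUDA device exists.
import torch
from batch_invariant_ops import enable_batch_invariant_mode

enable_batch_invariant_mode()

torch.manual_seed(0)
a = torch.randn(128, 512, device="cuda", dtype=torch.bfloat16)
w = torch.randn(512, 512, device="cuda", dtype=torch.bfloat16)

full = a @ w        # row 0 computed as part of a batch of 128
single = a[:1] @ w  # row 0 computed alone, as a batch of 1

# With batch-invariant kernels active, the two results should agree
# exactly, not just within floating-point tolerance.
print(torch.equal(full[:1], single))  # expected: True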