[1/2] Support deterministic inference with flashinfer attention backend (#10645)

Co-authored-by: hebiao064 <hebiaobuaa@gmail.com>
Co-authored-by: Qiaolin-Yu <liin1211@outlook.com>
This commit is contained in:
Baizhou Zhang
2025-09-19 23:34:29 -07:00
committed by GitHub
parent 1d1ce62495
commit 8ecef73f12
10 changed files with 427 additions and 6 deletions

View File

@@ -14,6 +14,7 @@
"""Fused operators for normalization layers."""
import logging
import os
from typing import Optional, Tuple, Union
import torch
@@ -80,6 +81,8 @@ class RMSNorm(CustomOp):
)
if _use_aiter:
self._forward_method = self.forward_aiter
# Opt-in switch for deterministic inference: when the environment variable is
# exactly "1", dispatch to forward_native. This check runs after the aiter
# selection above and therefore overrides it.
# Use .get() so an unset variable means "disabled" rather than raising
# KeyError while the layer is being constructed.
# NOTE(review): presumably the native path is preferred here because the fused
# kernels are not guaranteed to be deterministic — confirm.
if os.environ.get("SGLANG_ENABLE_DETERMINISTIC_INFERENCE") == "1":
    self._forward_method = self.forward_native
def forward_cuda(
self,