diff --git a/python/sglang/srt/layers/quantization/fp8_utils.py b/python/sglang/srt/layers/quantization/fp8_utils.py index a613f8a38..2a87f70d1 100644 --- a/python/sglang/srt/layers/quantization/fp8_utils.py +++ b/python/sglang/srt/layers/quantization/fp8_utils.py @@ -1,3 +1,4 @@ +import os from typing import List, Optional, Tuple import torch @@ -40,6 +41,8 @@ def normalize_e4m3fn_to_e4m3fnuz( def cutlass_block_fp8_supported() -> bool: + if os.environ.get("SUPPORT_CUTLASS_BLOCK_FP8") is None: + return False if _is_cuda: major, minor = torch.cuda.get_device_capability() sm_version = major * 10 + minor