This commit is contained in:
Lianmin Zheng
2025-08-09 13:33:42 -07:00
committed by GitHub
parent 41d71ca488
commit 9a44b643c6
9 changed files with 24 additions and 20 deletions

View File

@@ -1,9 +1,8 @@
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py
from __future__ import annotations
import importlib.util
import logging
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
import torch
from torch.nn.parameter import Parameter
@@ -42,11 +41,7 @@ if is_cuda():
try:
from flashinfer import mm_fp4 as fp4_gemm
-from flashinfer import (
-reorder_rows_for_gated_act_gemm,
-shuffle_matrix_a,
-shuffle_matrix_sf_a,
-)
+from flashinfer import reorder_rows_for_gated_act_gemm, shuffle_matrix_sf_a
enable_flashinfer_fp4_gemm = True
except ImportError: