Improve linear.py to load sharded weights & remove the dependency of Parameters from vllm (#2784)
Co-authored-by: SangBin Cho <rkooo567@gmail.com>
This commit is contained in:
5
3rdparty/amd/tuning/benchmark_moe_rocm.py
vendored
5
3rdparty/amd/tuning/benchmark_moe_rocm.py
vendored
@@ -10,7 +10,10 @@ import triton.language as tl
 from tqdm import tqdm
 from transformers import AutoConfig
 
-from sglang.srt.layers.fused_moe_triton.fused_moe import fused_moe, get_config_file_name
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
+    fused_moe,
+    get_config_file_name,
+)
 
 padding_size = 128 if bool(int(os.getenv("MOE_PADDING", "0"))) else 0
 
Reference in New Issue
Block a user