Improve linear.py to load sharded weights and remove the dependency on vllm's Parameters (#2784)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
2025-01-07 23:29:10 -08:00
parent 694e41925e
commit 8a6906127a
15 changed files with 655 additions and 88 deletions
--- a/3rdparty/amd/tuning/benchmark_moe_rocm.py
+++ b/3rdparty/amd/tuning/benchmark_moe_rocm.py
@@ -10,7 +10,10 @@ import triton.language as tl
 from tqdm import tqdm
 from transformers import AutoConfig

-from sglang.srt.layers.fused_moe_triton.fused_moe import fused_moe, get_config_file_name
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
+    fused_moe,
+    get_config_file_name,
+)

 padding_size = 128 if bool(int(os.getenv("MOE_PADDING", "0"))) else 0