Improve linear.py to load sharded weights & remove the dependency of Parameters from vllm (#2784)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
This commit is contained in:
Lianmin Zheng
2025-01-07 23:29:10 -08:00
committed by GitHub
parent 694e41925e
commit 8a6906127a
15 changed files with 655 additions and 88 deletions

View File

@@ -10,7 +10,10 @@ import triton.language as tl
from tqdm import tqdm
from transformers import AutoConfig
-from sglang.srt.layers.fused_moe_triton.fused_moe import fused_moe, get_config_file_name
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
+    fused_moe,
+    get_config_file_name,
+)
padding_size = 128 if bool(int(os.getenv("MOE_PADDING", "0"))) else 0