Initial commit for vLLM-Kunlun Plugin
This commit is contained in:
0
vllm_kunlun/models/model_loader/__init__.py
Normal file
0
vllm_kunlun/models/model_loader/__init__.py
Normal file
24
vllm_kunlun/models/model_loader/bitsandbytes_loader.py
Normal file
24
vllm_kunlun/models/model_loader/bitsandbytes_loader.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from typing import Callable, ClassVar


class BitsAndBytesModelLoader:
    """Model loader to load model weights with BitsAndBytes quantization."""

    # Candidate config filenames probed when looking for quantization
    # metadata (e.g. a PEFT/LoRA adapter config).
    possible_config_file_names: ClassVar[list[str]] = ["adapter_config.json"]

    def __init__(self) -> None:
        # Module names whose weights are kept whole (not sharded).
        self.unsharded_weights_modules: list[str] = []
        # Module names whose weights are sharded by column.
        self.column_sharded_weights_modules: list[str] = []
        # Modules whose weights might be fused on disk; we need their
        # output_sizes to shard them in flight correctly under tensor
        # parallelism (TP).
        self.maybe_fused_weights_modules: dict[str, list[int]] = {}
        # All module names (from transformers) that support BNB quantization.
        self.target_modules: list[str] = []
        # Mapping of expert parameters for MoE models.
        self.expert_params_mapping: list[tuple[str, str, int, str]] = []
        # Maps weight names from transformers to vLLM; identity by default.
        self.weight_mapper: Callable = lambda name: name
        # NOTE(review): presumably True when the checkpoint is already
        # BNB-quantized — confirm against the loader that sets it.
        self.pre_quant: bool = False
        # NOTE(review): presumably selects 8-bit (vs 4-bit) BNB loading —
        # confirm against the loader that sets it.
        self.load_8bit: bool = False
        self.is_pool_model: bool = False
|
||||
Reference in New Issue
Block a user