Migrate XTorch operations to Kunlun operations (accelerating iteration) (#177)

Signed-off-by: dongxinyu03 <dongxinyu03@baidu.com>
2026-02-12 18:13:00 +08:00
parent 744719587e
commit bf9369f733
15 changed files with 125 additions and 119 deletions
--- a/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -113,7 +113,7 @@ class KunlunCompressedTensorsMoEMethod(FusedMoEMethodBase):
 class KunlunCompressedTensorsW8A8Int8MoEMethod(CompressedTensorsW8A8Int8MoEMethod):

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        # NOTE: xtorch_ops use max as scale
+        # NOTE: kunlun_ops use max as scale
        with torch.no_grad():
            layer.w13_weight_scale.mul_(127.0)
            layer.w2_weight_scale.mul_(127.0)