From 00d25a7f5e2fcc664f5116cffeddc4b4d56a2057 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 10 Mar 2025 03:06:21 -0700 Subject: [PATCH] Fix quantization and nightly tests (#4258) --- .../layers/moe/fused_moe_triton/fused_moe.py | 1 - .../srt/layers/quantization/__init__.py | 156 ++++++++++-------- .../sglang/srt/model_executor/model_runner.py | 4 + python/sglang/test/test_utils.py | 5 +- test/srt/run_suite.py | 1 + test/srt/test_awq.py | 44 +++++ test/srt/test_nightly_gsm8k_eval.py | 1 + 7 files changed, 142 insertions(+), 70 deletions(-) create mode 100644 test/srt/test_awq.py diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py index 0d316c6f7..71c3d8ebe 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py @@ -23,7 +23,6 @@ from sglang.srt.utils import ( direct_register_custom_op, get_bool_env_var, get_device_name, - is_cuda_available, is_hip, ) diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py index c09fb5a1a..bc1bedee1 100644 --- a/python/sglang/srt/layers/quantization/__init__.py +++ b/python/sglang/srt/layers/quantization/__init__.py @@ -1,4 +1,6 @@ # Adapted from https://raw.githubusercontent.com/vllm-project/vllm/v0.5.5/vllm/model_executor/layers/quantization/__init__.py +import builtins +import inspect import re from copy import deepcopy from typing import Callable, Dict, Optional, Type, Union @@ -6,10 +8,7 @@ from typing import Callable, Dict, Optional, Type, Union import torch from vllm.model_executor.layers.quantization.aqlm import AQLMConfig from vllm.model_executor.layers.quantization.awq import AWQConfig -from vllm.model_executor.layers.quantization.awq_marlin import ( - AWQMarlinConfig, - AWQMoEMethod, -) +from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig from vllm.model_executor.layers.quantization.bitsandbytes import BitsAndBytesConfig from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( CompressedTensorsConfig, @@ -180,96 +179,117 @@ def gptq_get_quant_method(self, layer, prefix): return None -def awq_get_quant_method(self, layer, prefix): - from vllm.model_executor.layers.quantization.awq import is_layer_skipped_awq - from vllm.model_executor.layers.quantization.awq_marlin import ( - AWQMarlinLinearMethod, - AWQMoEMethod, - ) - - from sglang.srt.layers.linear import LinearBase, UnquantizedLinearMethod - from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE - from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead - - if isinstance(layer, LinearBase) or ( - isinstance(layer, ParallelLMHead) and self.lm_head_quantized - ): - if is_layer_skipped_awq(prefix, self.modules_to_not_convert): - return UnquantizedLinearMethod() - return AWQMarlinLinearMethod(self) - elif isinstance(layer, FusedMoE): - return AWQMoEMethod(self) - return None +original_isinstance = builtins.isinstance -original_awq_moe_method_apply = AWQMoEMethod.apply +def monkey_patch_isinstance_for_vllm_base_layer(reverse: bool = False): + """ + Patch isinstance so that the `get_quant_method` in vllm's QuantizationConfig + can recognize sglang layers + """ + if reverse: + builtins.isinstance = original_isinstance + return -def awq_moe_method_apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: 
bool = False, - topk_group: Optional[int] = None, - num_expert_group: Optional[int] = None, - custom_routing_function: Optional[Callable] = None, - scoring_func: str = "softmax", - e_score_correction_bias: Optional[torch.Tensor] = None, - **kwargs, -): - return original_awq_moe_method_apply( - self, - layer, - x, - router_logits, - top_k, - renormalize, - use_grouped_topk, - topk_group, - num_expert_group, - custom_routing_function, - scoring_func, - e_score_correction_bias, - ) - - -def patch_vllm_linear_base_isinstance(): - import builtins - + from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import LinearBase + from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, + ) from sglang.srt.layers.linear import LinearBase as PatchedLinearBase - - original_isinstance = builtins.isinstance + from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE as PatchedFusedMoE + from sglang.srt.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding as PatchedVocabParallelEmbedding, + ) def patched_isinstance(obj, classinfo): if classinfo is LinearBase: return original_isinstance(obj, PatchedLinearBase) + if classinfo is FusedMoE: + return original_isinstance(obj, PatchedFusedMoE) + if classinfo is VocabParallelEmbedding: + return original_isinstance(obj, PatchedVocabParallelEmbedding) return original_isinstance(obj, classinfo) builtins.isinstance = patched_isinstance -def apply_monkey_patches(): +def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"): + """ + Monkey patch the apply function of vllm's FusedMoEMethodBase. + Convert sglang arguments to vllm arguments. + """ + original_apply = class_obj.apply + sig = inspect.signature(original_apply) + param_names = list(sig.parameters.keys()) + has_correction_bias = "e_score_correction_bias" in param_names + + def new_apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + correction_bias: Optional[torch.Tensor] = None, + activation: str = "silu", + inplace: bool = True, + no_combine: bool = False, + ): + assert activation == "silu" + assert inplace and not no_combine + + kwargs = { + "self": self, + "layer": layer, + "x": x, + "router_logits": router_logits, + "top_k": top_k, + "renormalize": renormalize, + "use_grouped_topk": use_grouped_topk, + "topk_group": topk_group, + "num_expert_group": num_expert_group, + "custom_routing_function": custom_routing_function, + } + if correction_bias is not None: + if not has_correction_bias: + raise ValueError( + "Please increase the version of your vllm. 
Try `pip install vllm==0.7.2`" + ) + kwargs["e_score_correction_bias"] = correction_bias + return original_apply(**kwargs) + + setattr(class_obj, "apply", new_apply) + + +def monkey_patch_quant_configs(): """Apply all monkey patches in one place.""" from vllm.model_executor.layers.quantization.awq_marlin import AWQMoEMethod + from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import ( + CompressedTensorsW8A8Fp8MoEMethod, + CompressedTensorsWNA16MoEMethod, + ) + from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinMoEMethod setattr(GPTQMarlinConfig, "get_quant_method", gptq_get_quant_method) setattr(GPTQConfig, "get_quant_method", gptq_get_quant_method) - setattr(AWQMarlinConfig, "get_quant_method", awq_get_quant_method) - setattr(AWQMoEMethod, "apply", awq_moe_method_apply) + + monkey_patch_moe_apply(AWQMoEMethod) + monkey_patch_moe_apply(GPTQMarlinMoEMethod) + monkey_patch_moe_apply(CompressedTensorsW8A8Fp8MoEMethod) + monkey_patch_moe_apply(CompressedTensorsWNA16MoEMethod) -patch_vllm_linear_base_isinstance() -# Apply patches when module is imported -apply_monkey_patches() +monkey_patch_quant_configs() __all__ = [ - "QuantizationConfig", "get_quantization_config", "QUANTIZATION_METHODS", ] diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 8040709a7..58ae425b1 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -41,6 +41,7 @@ from sglang.srt.layers.dp_attention import ( initialize_dp_attention, ) from sglang.srt.layers.logits_processor import LogitsProcessorOutput +from sglang.srt.layers.quantization import monkey_patch_isinstance_for_vllm_base_layer from sglang.srt.layers.sampler import Sampler from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model from sglang.srt.lora.lora_manager import LoRAManager @@ -341,6 +342,8 @@ class ModelRunner: # Load the model # Remove monkey_patch when linear.py quant remove dependencies with vllm monkey_patch_vllm_parallel_state() + monkey_patch_isinstance_for_vllm_base_layer() + with self.memory_saver_adapter.region(): self.model = get_model( model_config=self.model_config, @@ -348,6 +351,7 @@ class ModelRunner: device_config=DeviceConfig(self.device), ) monkey_patch_vllm_parallel_state(reverse=True) + monkey_patch_isinstance_for_vllm_base_layer(reverse=True) if self.server_args.kv_cache_dtype == "fp8_e4m3": if self.server_args.quantization_param_path is not None: diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index e2d92bb51..d9964cd50 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -36,12 +36,15 @@ DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instru DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8" DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" +DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = ( + "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4" +) DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = 
"meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8" -DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4" +DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4,hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4" DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct" DEFAULT_SMALL_VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B" diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 590c46136..413cef32c 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -22,6 +22,7 @@ suites = { TestFile("models/test_reward_models.py", 83), TestFile("models/test_gme_qwen_models.py", 45), TestFile("test_abort.py", 51), + TestFile("test_awq.py"), TestFile("test_block_int8.py", 22), TestFile("test_chunked_prefill.py", 336), TestFile("test_eagle_infer.py", 447), diff --git a/test/srt/test_awq.py b/test/srt/test_awq.py new file mode 100644 index 000000000..30493634d --- /dev/null +++ b/test/srt/test_awq.py @@ -0,0 +1,44 @@ +import unittest +from types import SimpleNamespace + +from sglang.srt.utils import kill_process_tree +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) + + +class TestAWQ(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=["--trust-remote-code"], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_mmlu(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mmlu", + num_examples=64, + num_threads=32, + ) + + metrics = run_eval(args) + self.assertGreater(metrics["score"], 0.65) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/test_nightly_gsm8k_eval.py b/test/srt/test_nightly_gsm8k_eval.py index 6fe361715..81eee4692 100644 --- a/test/srt/test_nightly_gsm8k_eval.py +++ b/test/srt/test_nightly_gsm8k_eval.py @@ -38,6 +38,7 @@ MODEL_SCORE_THRESHOLDS = { "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82, "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.84, "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.83, + "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4": 0.60, }