Cleanup readme, llava examples, usage examples and nccl init (#1194)

Author: Lianmin Zheng
Date: 2024-08-24 08:02:23 -07:00
Committed by: GitHub
Parent: c9064e6fd9
Commit: f6af3a6561
65 changed files with 174 additions and 317 deletions


@@ -81,13 +81,12 @@ class ServerArgs:
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
     disable_disk_cache: bool = False
+    disable_custom_all_reduce: bool = False
     enable_mixed_chunk: bool = False
     enable_torch_compile: bool = False
     enable_p2p_check: bool = False
     enable_mla: bool = False
-    attention_reduce_in_fp32: bool = False
-    efficient_weight_load: bool = False
-    disable_custom_all_reduce: bool = False
+    triton_attention_reduce_in_fp32: bool = False
 
     # Distributed args
     nccl_init_addr: Optional[str] = None
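
For orientation, here is a minimal sketch, not this commit's code, of how the two fields touched above typically come into play: nccl_init_addr ("host:port") becomes the TCP rendezvous for torch.distributed, and disable_custom_all_reduce makes tensor-parallel collectives fall back to plain NCCL. The function name and wiring are assumptions for illustration.

import torch.distributed as dist

def init_torch_distributed(server_args, rank: int, world_size: int):
    # Assumption for this sketch: nccl_init_addr holds "host:port".
    dist.init_process_group(
        backend="nccl",
        init_method=f"tcp://{server_args.nccl_init_addr}",
        rank=rank,
        world_size=world_size,
    )
    # When disable_custom_all_reduce is set, an engine skips its fused
    # all-reduce kernel and routes collectives through dist.all_reduce (NCCL).
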
@@ -404,6 +403,12 @@ class ServerArgs:
             action="store_true",
             help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
         )
+        parser.add_argument(
+            "--disable-custom-all-reduce",
+            action="store_true",
+            default=False,
+            help="Disable the custom all-reduce kernel and fall back to NCCL.",
+        )
         parser.add_argument(
             "--enable-mixed-chunk",
             action="store_true",
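
An illustrative launch with the relocated flag (the model path and --tp value are placeholders; only --disable-custom-all-reduce comes from this diff):

python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct \
    --tp 2 --disable-custom-all-reduce

Custom all-reduce only matters under tensor parallelism, so the flag is effectively a no-op at --tp 1.
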
@@ -425,7 +430,7 @@ class ServerArgs:
             help="Enable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
         )
         parser.add_argument(
-            "--attention-reduce-in-fp32",
+            "--triton-attention-reduce-in-fp32",
             action="store_true",
             help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16. "
             "This only affects Triton attention kernels.",
@@ -435,12 +440,6 @@ class ServerArgs:
             action="store_true",
             help="Turn on memory efficient weight loading with quantization (quantize per layer during loading).",
         )
-        parser.add_argument(
-            "--disable-custom-all-reduce",
-            action="store_true",
-            default=False,
-            help="Disable the custom all-reduce kernel and fall back to NCCL.",
-        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
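
For reference, one common shape for from_cli_args, sketched here with field names from this diff (SGLang's actual body is outside the hunk and may differ): build the dataclass from the Namespace attributes that match dataclass fields, so an argparse option whose field no longer exists is ignored rather than crashing.

import argparse
import dataclasses
from typing import Optional

@dataclasses.dataclass
class ServerArgs:
    disable_custom_all_reduce: bool = False
    triton_attention_reduce_in_fp32: bool = False
    nccl_init_addr: Optional[str] = None

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        # Keep only the attributes that correspond to dataclass fields.
        names = [f.name for f in dataclasses.fields(cls)]
        return cls(**{name: getattr(args, name) for name in names})

parser = argparse.ArgumentParser()
parser.add_argument("--disable-custom-all-reduce", action="store_true", default=False)
parser.add_argument("--triton-attention-reduce-in-fp32", action="store_true")
parser.add_argument("--nccl-init-addr", type=str, default=None)
print(ServerArgs.from_cli_args(parser.parse_args([])))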