Revert "fix some typos" (#6244)
This commit is contained in:
@@ -83,11 +83,11 @@ Third-party libraries:
|
||||
|
||||
### FlashAttention FYI
|
||||
|
||||
FA3 can fail without enough shared memory for some shapes, such as higher hidden_dim or some special cases. Right now, FA3 is supported for sm80/sm87 and sm86/sm89.
|
||||
FA3 can fail without enough shared memory for some shapes, such as higher hidden_dim or some special cases. Right now, fa3 is supported for sm80/sm87 and sm86/sm89.
|
||||
|
||||
The main difference between sm80/sm87 and sm86/sm89 is the shared memory size. You can follow the link below for more information: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x.
|
||||
|
||||
And for sgl-kernel right now, we can build FA3 on sm80/sm86/sm89/sm90a. That means if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use FA3.
|
||||
And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a. That means if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use fa3.
|
||||
|
||||
### Kernel Development
|
||||
|
||||
@@ -164,7 +164,7 @@ template <>
|
||||
struct pytorch_library_compatible_type<int> {
|
||||
using type = int64_t;
|
||||
static int convert_from_type(int64_t arg) {
|
||||
TORCH_CHECK(arg <= std::numeric_limits<int>::max(), "int64_t value is too large to be converted to int");
|
||||
TORCH_CHECK(arg <= std::numeric_limits<int>::max(), "int64_t value is too large to be converted to int");
|
||||
TORCH_CHECK(arg >= std::numeric_limits<int>::min(), "int64_t value is too small to be converted to int");
|
||||
return arg;
|
||||
}
|
||||
|
||||
@@ -177,7 +177,7 @@ def calculate_diff(num_tokens, num_experts=256, block_size=128, topk=8):
|
||||
expert_ids_vllm = torch.zeros_like(expert_ids_cuda)
|
||||
num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_cuda)
|
||||
|
||||
# compare the performance of CUDA, triton and vllm implementation
|
||||
# compare the performance of cuda, triton and vllm implementation
|
||||
sgl_moe_align_block_size(
|
||||
topk_ids,
|
||||
num_experts,
|
||||
@@ -349,7 +349,7 @@ def benchmark(num_tokens, num_experts, topk, provider):
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
else: # vLLM
|
||||
else: # vllm
|
||||
try:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: ops.moe_align_block_size(
|
||||
|
||||
@@ -280,8 +280,8 @@ class CustomAllreduce {
|
||||
std::unordered_map<void*, RankData*> buffers_;
|
||||
Signal* self_sg_;
|
||||
|
||||
// Stores rank data from all ranks. This is mainly for CUDA graph purposes.
|
||||
// For CUDA graph to work, all kernel arguments must be fixed during graph
|
||||
// Stores rank data from all ranks. This is mainly for cuda graph purposes.
|
||||
// For cuda graph to work, all kernel arguments must be fixed during graph
|
||||
// capture time. However, the peer pointers are not known during graph capture
|
||||
// time. Therefore, during capture, we increment the rank data pointer and use
|
||||
// that as the argument to the kernel. The kernel arguments are stored in
|
||||
@@ -291,7 +291,7 @@ class CustomAllreduce {
|
||||
//
|
||||
// The overall process looks like this:
|
||||
// 1. Graph capture.
|
||||
// 2. Each rank obtains the IPC handles for each addresses used during CUDA
|
||||
// 2. Each rank obtains the IPC handles for each addresses used during cuda
|
||||
// graph capture using get_graph_buffer_ipc_meta.
|
||||
// 3. (In Python) all gather the IPC handles.
|
||||
// 4. Obtain the peer pointers by opening the IPC handles, and store them in
|
||||
|
||||
@@ -65,5 +65,5 @@ from sgl_kernel.speculative import (
|
||||
from sgl_kernel.version import __version__
|
||||
|
||||
build_tree_kernel = (
|
||||
None # TODO(ying): remove this after updating the SGLang python code.
|
||||
None # TODO(ying): remove this after updating the sglang python code.
|
||||
)
|
||||
|
||||
@@ -10,14 +10,14 @@ except:
|
||||
|
||||
|
||||
def is_fa3_supported(device=None) -> bool:
|
||||
# Here is some FA3 FYI
|
||||
# Here is some fa3 FYI
|
||||
# FA3 can fail without enough shared memory for some shapes, such as higher
|
||||
# hidden_dim or some special cases.
|
||||
# Right now, FA3 is supported for sm80/sm87 and sm86/sm89. The main difference
|
||||
# Right now, fa3 is supported for sm80/sm87 and sm86/sm89. The main difference
|
||||
# between sm80/sm87 and sm86/sm89 is the shared memory size. You can follow the link below for more information
|
||||
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x
|
||||
# And for sgl-kernel right now, we can build FA3 on sm80/sm86/sm89/sm90a.
|
||||
# That means if you use A100/A*0/L20/L40/L40s/4090 you can use FA3.
|
||||
# And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a.
|
||||
# That means if you use A100/A*0/L20/L40/L40s/4090 you can use fa3.
|
||||
return (
|
||||
torch.cuda.get_device_capability(device)[0] == 9
|
||||
or torch.cuda.get_device_capability(device)[0] == 8
|
||||
|
||||
@@ -197,7 +197,7 @@ def test_merge_attn_states(
|
||||
if not torch.cuda.is_available():
|
||||
pytest.skip(
|
||||
"Currently only support compare triton merge_attn_states "
|
||||
"with custom CUDA merge_attn_states kernel"
|
||||
"with custom cuda merge_attn_states kernel"
|
||||
)
|
||||
|
||||
NUM_TOKENS = num_tokens
|
||||
|
||||
Reference in New Issue
Block a user