Revert "fix some typos" (#6244)

This commit is contained in:
Lianmin Zheng
2025-05-12 12:53:26 -07:00
committed by GitHub
parent bad7c26fdc
commit e8e18dcdcc
95 changed files with 276 additions and 276 deletions

View File

@@ -83,11 +83,11 @@ Third-party libraries:
### FlashAttention FYI
FA3 can fail without enough shared memory for some shapes, such as higher hidden_dim or some special cases. Right now, FA3 is supported for sm80/sm87 and sm86/sm89.
FA3 can fail without a enough shared memory for a some shapes, such as higher hidden_dim or some special cases. Right now, fa3 is supported for sm80/sm87 and sm86/sm89.
The main difference between sm80/sm87 and sm86/sm89 is the shared memory size. You can follow the link below for more information https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x.
And for sgl-kernel right now, we can build FA3 on sm80/sm86/sm89/sm90a. That means if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use FA3.
And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a. That means if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use fa3.
### Kernel Development
@@ -164,7 +164,7 @@ template <>
struct pytorch_library_compatible_type<int> {
using type = int64_t;
static int convert_from_type(int64_t arg) {
TORCH_CHECK(arg <= std::numeric_limits<int>::max(), "int64_t value is too large to be converted to int");
TORCH_CHECK(arg <= std::numeric_limits<int>::max(), "int64_t value is too large to be converted to int");
TORCH_CHECK(arg >= std::numeric_limits<int>::min(), "int64_t value is too small to be converted to int");
return arg;
}

View File

@@ -177,7 +177,7 @@ def calculate_diff(num_tokens, num_experts=256, block_size=128, topk=8):
expert_ids_vllm = torch.zeros_like(expert_ids_cuda)
num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_cuda)
# compare the performance of CUDA, triton and vllm implementation
# compare the performance of cuda, triton and vllm implementation
sgl_moe_align_block_size(
topk_ids,
num_experts,
@@ -349,7 +349,7 @@ def benchmark(num_tokens, num_experts, topk, provider):
),
quantiles=quantiles,
)
else: # vLLM
else: # vllm
try:
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: ops.moe_align_block_size(

View File

@@ -280,8 +280,8 @@ class CustomAllreduce {
std::unordered_map<void*, RankData*> buffers_;
Signal* self_sg_;
// Stores rank data from all ranks. This is mainly for CUDA graph purposes.
// For CUDA graph to work, all kernel arguments must be fixed during graph
// Stores rank data from all ranks. This is mainly for cuda graph purposes.
// For cuda graph to work, all kernel arguments must be fixed during graph
// capture time. However, the peer pointers are not known during graph capture
// time. Therefore, during capture, we increment the rank data pointer and use
// that as the argument to the kernel. The kernel arguments are stored in
@@ -291,7 +291,7 @@ class CustomAllreduce {
//
// The overall process looks like this:
// 1. Graph capture.
// 2. Each rank obtains the IPC handles for each addresses used during CUDA
// 2. Each rank obtains the IPC handles for each addresses used during cuda
// graph capture using get_graph_buffer_ipc_meta.
// 3. (In Python) all gather the IPC handles.
// 4. Obtain the peer pointers by opening the IPC handles, and store them in

View File

@@ -65,5 +65,5 @@ from sgl_kernel.speculative import (
from sgl_kernel.version import __version__
build_tree_kernel = (
None # TODO(ying): remove this after updating the SGLang python code.
None # TODO(ying): remove this after updating the sglang python code.
)

View File

@@ -10,14 +10,14 @@ except:
def is_fa3_supported(device=None) -> bool:
# There some FA3 FYI
# There some fa3 FYI
# FA3 can fail without a enough shared memory for a some shapes, such as higher
# hidden_dim or some special cases.
# Right now, FA3 is supported for sm80/sm87 and sm86/sm89. The main different
# Right now, fa3 is supported for sm80/sm87 and sm86/sm89. The main different
# Between sm80/sm87 and sm86/sm89 is the shared memory size. you can follow the link below for more information
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x
# And for sgl-kernel right now, we can build FA3 on sm80/sm86/sm89/sm90a.
# That means if you use A100/A*0/L20/L40/L40s/4090 you can use FA3.
# And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a.
# That means if you use A100/A*0/L20/L40/L40s/4090 you can use fa3.
return (
torch.cuda.get_device_capability(device)[0] == 9
or torch.cuda.get_device_capability(device)[0] == 8

View File

@@ -197,7 +197,7 @@ def test_merge_attn_states(
if not torch.cuda.is_available():
pytest.skip(
"Currently only support compare triton merge_attn_states "
"with custom CUDA merge_attn_states kernel"
"with custom cuda merge_attn_states kernel"
)
NUM_TOKENS = num_tokens