Revert "fix some typos" (#6244)
This commit is contained in:
@@ -83,11 +83,11 @@ Third-party libraries:
|
||||
|
||||
### FlashAttention FYI
|
||||
|
||||
FA3 can fail without enough shared memory for some shapes, such as higher hidden_dim or some special cases. Right now, FA3 is supported for sm80/sm87 and sm86/sm89.
|
||||
FA3 can fail without enough shared memory for some shapes, such as higher hidden_dim or some special cases. Right now, fa3 is supported for sm80/sm87 and sm86/sm89.
|
||||
|
||||
The main difference between sm80/sm87 and sm86/sm89 is the shared memory size. You can follow the link below for more information: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x.
|
||||
|
||||
And for sgl-kernel right now, we can build FA3 on sm80/sm86/sm89/sm90a. That means if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use FA3.
|
||||
And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a. That means if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use fa3.
|
||||
|
||||
### Kernel Development
|
||||
|
||||
@@ -164,7 +164,7 @@ template <>
|
||||
struct pytorch_library_compatible_type<int> {
|
||||
using type = int64_t;
|
||||
static int convert_from_type(int64_t arg) {
|
||||
TORCH_CHECK(arg <= std::numeric_limits<int>::max(), "int64_t value is too large to be converted to int");
|
||||
TORCH_CHECK(arg <= std::numeric_limits<int>::max(), "int64_t value is too large to be converted to int");
|
||||
TORCH_CHECK(arg >= std::numeric_limits<int>::min(), "int64_t value is too small to be converted to int");
|
||||
return arg;
|
||||
}
|
||||
|
||||
@@ -177,7 +177,7 @@ def calculate_diff(num_tokens, num_experts=256, block_size=128, topk=8):
|
||||
expert_ids_vllm = torch.zeros_like(expert_ids_cuda)
|
||||
num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_cuda)
|
||||
|
||||
# compare the performance of CUDA, triton and vllm implementation
|
||||
# compare the performance of cuda, triton and vllm implementation
|
||||
sgl_moe_align_block_size(
|
||||
topk_ids,
|
||||
num_experts,
|
||||
@@ -349,7 +349,7 @@ def benchmark(num_tokens, num_experts, topk, provider):
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
else: # vLLM
|
||||
else: # vllm
|
||||
try:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: ops.moe_align_block_size(
|
||||
|
||||
@@ -280,8 +280,8 @@ class CustomAllreduce {
|
||||
std::unordered_map<void*, RankData*> buffers_;
|
||||
Signal* self_sg_;
|
||||
|
||||
// Stores rank data from all ranks. This is mainly for CUDA graph purposes.
|
||||
// For CUDA graph to work, all kernel arguments must be fixed during graph
|
||||
// Stores rank data from all ranks. This is mainly for cuda graph purposes.
|
||||
// For cuda graph to work, all kernel arguments must be fixed during graph
|
||||
// capture time. However, the peer pointers are not known during graph capture
|
||||
// time. Therefore, during capture, we increment the rank data pointer and use
|
||||
// that as the argument to the kernel. The kernel arguments are stored in
|
||||
@@ -291,7 +291,7 @@ class CustomAllreduce {
|
||||
//
|
||||
// The overall process looks like this:
|
||||
// 1. Graph capture.
|
||||
// 2. Each rank obtains the IPC handles for each addresses used during CUDA
|
||||
// 2. Each rank obtains the IPC handles for each addresses used during cuda
|
||||
// graph capture using get_graph_buffer_ipc_meta.
|
||||
// 3. (In Python) all gather the IPC handles.
|
||||
// 4. Obtain the peer pointers by opening the IPC handles, and store them in
|
||||
|
||||
@@ -65,5 +65,5 @@ from sgl_kernel.speculative import (
|
||||
from sgl_kernel.version import __version__
|
||||
|
||||
build_tree_kernel = (
|
||||
None # TODO(ying): remove this after updating the SGLang python code.
|
||||
None # TODO(ying): remove this after updating the sglang python code.
|
||||
)
|
||||
|
||||
@@ -10,14 +10,14 @@ except:
|
||||
|
||||
|
||||
def is_fa3_supported(device=None) -> bool:
|
||||
# Here is some FA3 FYI
|
||||
# Here is some fa3 FYI
|
||||
# FA3 can fail without enough shared memory for some shapes, such as higher
|
||||
# hidden_dim or some special cases.
|
||||
# Right now, FA3 is supported for sm80/sm87 and sm86/sm89. The main difference
|
||||
# Right now, fa3 is supported for sm80/sm87 and sm86/sm89. The main difference
|
||||
# between sm80/sm87 and sm86/sm89 is the shared memory size. You can follow the link below for more information
|
||||
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x
|
||||
# And for sgl-kernel right now, we can build FA3 on sm80/sm86/sm89/sm90a.
|
||||
# That means if you use A100/A*0/L20/L40/L40s/4090 you can use FA3.
|
||||
# And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a.
|
||||
# That means if you use A100/A*0/L20/L40/L40s/4090 you can use fa3.
|
||||
return (
|
||||
torch.cuda.get_device_capability(device)[0] == 9
|
||||
or torch.cuda.get_device_capability(device)[0] == 8
|
||||
|
||||
@@ -197,7 +197,7 @@ def test_merge_attn_states(
|
||||
if not torch.cuda.is_available():
|
||||
pytest.skip(
|
||||
"Currently only support compare triton merge_attn_states "
|
||||
"with custom CUDA merge_attn_states kernel"
|
||||
"with custom cuda merge_attn_states kernel"
|
||||
)
|
||||
|
||||
NUM_TOKENS = num_tokens
|
||||
|
||||
Reference in New Issue
Block a user