Optional extension for green context (#9231)
This commit is contained in:
@@ -274,7 +274,6 @@ set(SOURCES
|
|||||||
"csrc/kvcacheio/transfer.cu"
|
"csrc/kvcacheio/transfer.cu"
|
||||||
"csrc/speculative/eagle_utils.cu"
|
"csrc/speculative/eagle_utils.cu"
|
||||||
"csrc/speculative/packbit.cu"
|
"csrc/speculative/packbit.cu"
|
||||||
"csrc/spatial/greenctx_stream.cu"
|
|
||||||
"csrc/speculative/speculative_sampling.cu"
|
"csrc/speculative/speculative_sampling.cu"
|
||||||
"csrc/memory/store.cu"
|
"csrc/memory/store.cu"
|
||||||
"${repo-flashinfer_SOURCE_DIR}/csrc/norm.cu"
|
"${repo-flashinfer_SOURCE_DIR}/csrc/norm.cu"
|
||||||
@@ -417,6 +416,18 @@ if (SGL_KERNEL_ENABLE_FA3)
|
|||||||
target_compile_definitions(flash_ops PRIVATE ${FLASH_OPS_COMPILE_DEFS})
|
target_compile_definitions(flash_ops PRIVATE ${FLASH_OPS_COMPILE_DEFS})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# Build spatial_ops as a separate, optional extension for green contexts
|
||||||
|
set(SPATIAL_SOURCES
|
||||||
|
"csrc/spatial/greenctx_stream.cu"
|
||||||
|
"csrc/spatial_extension.cc"
|
||||||
|
)
|
||||||
|
|
||||||
|
Python_add_library(spatial_ops MODULE USE_SABI ${SKBUILD_SABI_VERSION} WITH_SOABI ${SPATIAL_SOURCES})
|
||||||
|
target_compile_options(spatial_ops PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${SGL_KERNEL_CUDA_FLAGS}>)
|
||||||
|
target_link_libraries(spatial_ops PRIVATE ${TORCH_LIBRARIES} c10 cuda)
|
||||||
|
install(TARGETS spatial_ops LIBRARY DESTINATION sgl_kernel)
|
||||||
|
|
||||||
|
|
||||||
# ============================ DeepGEMM (JIT) ============================= #
|
# ============================ DeepGEMM (JIT) ============================= #
|
||||||
# Create a separate library for DeepGEMM's Python API.
|
# Create a separate library for DeepGEMM's Python API.
|
||||||
# This keeps its compilation isolated from the main common_ops.
|
# This keeps its compilation isolated from the main common_ops.
|
||||||
|
|||||||
@@ -433,12 +433,6 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
|
|||||||
"qserve_w4a8_per_group_gemm(Tensor _in_feats, Tensor _kernel, Tensor _zeros, Tensor _scales_i8, Tensor _wscales, "
|
"qserve_w4a8_per_group_gemm(Tensor _in_feats, Tensor _kernel, Tensor _zeros, Tensor _scales_i8, Tensor _wscales, "
|
||||||
"Tensor _ascales, Tensor! _out_feats) -> ()");
|
"Tensor _ascales, Tensor! _out_feats) -> ()");
|
||||||
m.impl("qserve_w4a8_per_group_gemm", torch::kCUDA, &qserve_w4a8_per_group_gemm);
|
m.impl("qserve_w4a8_per_group_gemm", torch::kCUDA, &qserve_w4a8_per_group_gemm);
|
||||||
|
|
||||||
/*
|
|
||||||
* From csrc/spatial
|
|
||||||
*/
|
|
||||||
m.def("create_greenctx_stream_by_value(int smA, int smB, int device) -> int[]");
|
|
||||||
m.impl("create_greenctx_stream_by_value", &create_greenctx_stream_by_value);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
REGISTER_EXTENSION(common_ops)
|
REGISTER_EXTENSION(common_ops)
|
||||||
|
|||||||
@@ -42,6 +42,7 @@ static std::vector<int64_t> create_greenctx_stream_direct_dynamic(CUgreenCtx gct
|
|||||||
// This symbol is introduced in CUDA 12.5
|
// This symbol is introduced in CUDA 12.5
|
||||||
const static auto pfn = probe_cuGreenCtxStreamCreate();
|
const static auto pfn = probe_cuGreenCtxStreamCreate();
|
||||||
if (!pfn) {
|
if (!pfn) {
|
||||||
|
TORCH_WARN("cuGreenCtxStreamCreate(cuda>=12.5) is not available, using fallback");
|
||||||
return create_greenctx_stream_fallback(gctx);
|
return create_greenctx_stream_fallback(gctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -55,17 +56,12 @@ static std::vector<int64_t> create_greenctx_stream_direct_dynamic(CUgreenCtx gct
|
|||||||
std::vector<int64_t> create_greenctx_stream_by_value(int64_t smA, int64_t smB, int64_t device) {
|
std::vector<int64_t> create_greenctx_stream_by_value(int64_t smA, int64_t smB, int64_t device) {
|
||||||
CUDA_DRV(cuDriverGetVersion(&CUDA_DRIVER_VERSION));
|
CUDA_DRV(cuDriverGetVersion(&CUDA_DRIVER_VERSION));
|
||||||
|
|
||||||
if (CUDA_DRIVER_VERSION < 12040) {
|
|
||||||
TORCH_CHECK(false, "Green Contexts feature requires CUDA Toolkit 12.4 or newer.");
|
|
||||||
}
|
|
||||||
|
|
||||||
CUgreenCtx gctx[3];
|
CUgreenCtx gctx[3];
|
||||||
CUdevResourceDesc desc[3];
|
CUdevResourceDesc desc[3];
|
||||||
CUdevResource input;
|
CUdevResource input;
|
||||||
CUdevResource resources[4];
|
CUdevResource resources[4];
|
||||||
if (smA <= 0 || smB <= 0) {
|
|
||||||
TORCH_CHECK(false, "SM counts must be positive");
|
TORCH_CHECK(smA > 0 && smB > 0, "SM counts must be positive");
|
||||||
}
|
|
||||||
|
|
||||||
CUDA_DRV(cuDeviceGetDevResource((CUdevice)device, &input, CU_DEV_RESOURCE_TYPE_SM));
|
CUDA_DRV(cuDeviceGetDevResource((CUdevice)device, &input, CU_DEV_RESOURCE_TYPE_SM));
|
||||||
|
|
||||||
|
|||||||
29
sgl-kernel/csrc/spatial_extension.cc
Normal file
29
sgl-kernel/csrc/spatial_extension.cc
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
/* Copyright 2025 SGLang Team. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#include <torch/all.h>
|
||||||
|
#include <torch/library.h>
|
||||||
|
|
||||||
|
#include "sgl_kernel_ops.h"
|
||||||
|
|
||||||
|
TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
|
||||||
|
/*
|
||||||
|
* From csrc/spatial
|
||||||
|
*/
|
||||||
|
m.def("create_greenctx_stream_by_value(int smA, int smB, int device) -> int[]");
|
||||||
|
m.impl("create_greenctx_stream_by_value", &create_greenctx_stream_by_value);
|
||||||
|
}
|
||||||
|
|
||||||
|
REGISTER_EXTENSION(spatial_ops)
|
||||||
@@ -92,7 +92,20 @@ from sgl_kernel.sampling import (
|
|||||||
top_p_renorm_prob,
|
top_p_renorm_prob,
|
||||||
top_p_sampling_from_probs,
|
top_p_sampling_from_probs,
|
||||||
)
|
)
|
||||||
from sgl_kernel.spatial import create_greenctx_stream_by_value, get_sm_available
|
|
||||||
|
|
||||||
|
def create_greenctx_stream_by_value(*args, **kwargs):
|
||||||
|
from sgl_kernel.spatial import create_greenctx_stream_by_value as _impl
|
||||||
|
|
||||||
|
return _impl(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def get_sm_available(*args, **kwargs):
|
||||||
|
from sgl_kernel.spatial import get_sm_available as _impl
|
||||||
|
|
||||||
|
return _impl(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
from sgl_kernel.speculative import (
|
from sgl_kernel.speculative import (
|
||||||
build_tree_kernel_efficient,
|
build_tree_kernel_efficient,
|
||||||
segment_packbits,
|
segment_packbits,
|
||||||
|
|||||||
@@ -1,6 +1,17 @@
|
|||||||
import torch
|
import torch
|
||||||
from torch.cuda.streams import ExternalStream
|
from torch.cuda.streams import ExternalStream
|
||||||
|
|
||||||
|
try:
|
||||||
|
from . import spatial_ops # triggers TORCH extension registration
|
||||||
|
except Exception as _e:
|
||||||
|
_spatial_import_error = _e
|
||||||
|
else:
|
||||||
|
_spatial_import_error = None
|
||||||
|
|
||||||
|
_IMPORT_ERROR = ImportError(
|
||||||
|
"Failed to load sgl_kernel.spatial_ops extension. Ensure CUDA Driver >= 12.4"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def create_greenctx_stream_by_value(
|
def create_greenctx_stream_by_value(
|
||||||
SM_a: int, SM_b: int, device_id: int = None
|
SM_a: int, SM_b: int, device_id: int = None
|
||||||
@@ -14,11 +25,8 @@ def create_greenctx_stream_by_value(
|
|||||||
Returns:
|
Returns:
|
||||||
tuple[ExternalStream, ExternalStream]: The two streams.
|
tuple[ExternalStream, ExternalStream]: The two streams.
|
||||||
"""
|
"""
|
||||||
if torch.version.cuda < "12.4":
|
if _spatial_import_error is not None:
|
||||||
raise RuntimeError(
|
raise _IMPORT_ERROR from _spatial_import_error
|
||||||
"Green Contexts feature requires CUDA Toolkit 12.4 or newer."
|
|
||||||
)
|
|
||||||
|
|
||||||
if device_id is None:
|
if device_id is None:
|
||||||
device_id = torch.cuda.current_device()
|
device_id = torch.cuda.current_device()
|
||||||
|
|
||||||
@@ -42,6 +50,8 @@ def get_sm_available(device_id: int = None) -> int:
|
|||||||
Returns:
|
Returns:
|
||||||
int: The SMs available.
|
int: The SMs available.
|
||||||
"""
|
"""
|
||||||
|
if _spatial_import_error is not None:
|
||||||
|
raise _IMPORT_ERROR from _spatial_import_error
|
||||||
if device_id is None:
|
if device_id is None:
|
||||||
device_id = torch.cuda.current_device()
|
device_id = torch.cuda.current_device()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user