fix: link sgl-kernel against libcuda (#2906)
This commit is contained in:
@@ -11,6 +11,8 @@ docker run --rm \
|
||||
${PYTHON_ROOT_PATH}/bin/pip install --no-cache-dir torch==2.4.0 --index-url https://download.pytorch.org/whl/cu${CUDA_VERSION//.} && \
|
||||
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX' && \
|
||||
export CUDA_VERSION=${CUDA_VERSION} && \
|
||||
mkdir -p /usr/lib/x86_64-linux-gnu/ && \
|
||||
ln -s /usr/local/cuda-${CUDA_VERSION}/targets/x86_64-linux/lib/stubs/libcuda.so /usr/lib/x86_64-linux-gnu/libcuda.so && \
|
||||
cd /sgl-kernel && \
|
||||
${PYTHON_ROOT_PATH}/bin/python setup.py bdist_wheel
|
||||
"
|
||||
|
||||
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "sgl-kernel"
|
||||
version = "0.0.2.post13"
|
||||
version = "0.0.2.post14"
|
||||
description = "Kernel Library for SGLang"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.8"
|
||||
|
||||
@@ -41,7 +41,7 @@ nvcc_flags = [
|
||||
]
|
||||
cxx_flags = ["-O3"]
|
||||
libraries = ["c10", "torch", "torch_python", "cuda"]
|
||||
extra_link_args = ["-Wl,-rpath,$ORIGIN/../../torch/lib"]
|
||||
extra_link_args = ["-Wl,-rpath,$ORIGIN/../../torch/lib", "-L/usr/lib/x86_64-linux-gnu"]
|
||||
ext_modules = [
|
||||
CUDAExtension(
|
||||
name="sgl_kernel.ops._kernels",
|
||||
|
||||
@@ -1,17 +1,15 @@
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
|
||||
#include <THC/THCAtomics.cuh>
|
||||
|
||||
#include "utils.hpp"
|
||||
#include "vectorization.cuh"
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void sampling_scaling_penalties_kernel(
|
||||
const scalar_t* logits,
|
||||
const scalar_t* scaling_penalties,
|
||||
scalar_t* output,
|
||||
const int32_t numel) {
|
||||
|
||||
__global__ void sampling_scaling_penalties_kernel(const scalar_t* logits, const scalar_t* scaling_penalties,
|
||||
scalar_t* output, const int32_t numel) {
|
||||
const int32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int32_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
@@ -50,14 +48,11 @@ torch::Tensor sampling_scaling_penalties(const torch::Tensor& logits, const torc
|
||||
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16,
|
||||
logits.scalar_type(), "sampling_scaling_penalties_kernel", ([&] {
|
||||
AT_DISPATCH_FLOATING_TYPES_AND2(
|
||||
at::ScalarType::Half, at::ScalarType::BFloat16, logits.scalar_type(), "sampling_scaling_penalties_kernel", ([&] {
|
||||
const int blocks = (numel + threads * 4 - 1) / (threads * 4);
|
||||
sampling_scaling_penalties_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
|
||||
logits.data_ptr<scalar_t>(),
|
||||
scaling_penalties.data_ptr<scalar_t>(),
|
||||
output.data_ptr<scalar_t>(),
|
||||
numel);
|
||||
logits.data_ptr<scalar_t>(), scaling_penalties.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(), numel);
|
||||
}));
|
||||
|
||||
return output;
|
||||
|
||||
@@ -6,8 +6,8 @@
|
||||
|
||||
// Include both AMD and NVIDIA fp8 types to avoid circular import
|
||||
// TODO(luka/varun) use FP8_TYPE instead after refactoring
|
||||
#include <c10/util/Float8_e4m3fnuz.h>
|
||||
#include <c10/util/Float8_e4m3fn.h>
|
||||
#include <c10/util/Float8_e4m3fnuz.h>
|
||||
|
||||
// Vectorization containers
|
||||
template <typename scalar_t>
|
||||
@@ -20,8 +20,7 @@ struct __align__(8) vec4_t {
|
||||
|
||||
template <typename quant_type_t>
|
||||
struct __align__(4) q8x4_t {
|
||||
static_assert(std::is_same_v<quant_type_t, int8_t> ||
|
||||
std::is_same_v<quant_type_t, c10::Float8_e4m3fn> ||
|
||||
static_assert(std::is_same_v<quant_type_t, int8_t> || std::is_same_v<quant_type_t, c10::Float8_e4m3fn> ||
|
||||
std::is_same_v<quant_type_t, c10::Float8_e4m3fnuz>);
|
||||
quant_type_t x;
|
||||
quant_type_t y;
|
||||
|
||||
Reference in New Issue
Block a user