From 9602c2aac76d2655d4d9aa657e60accde1cfb51f Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Fri, 31 Jan 2025 00:39:47 +0800 Subject: [PATCH] keep the parts needed for moe_kernels (#3218) --- .../tensorrt_llm/common/CMakeLists.txt | 22 - .../3rdparty/tensorrt_llm/common/assert.cpp | 0 .../3rdparty/tensorrt_llm/common/assert.h | 92 ++ .../tensorrt_llm/common/cudaDriverWrapper.cpp | 187 ++++ .../tensorrt_llm/common/cudaDriverWrapper.h | 138 +++ .../tensorrt_llm/common/cudaFp8Utils.h | 239 +++++ .../tensorrt_llm/common/cudaProfilerUtils.cpp | 84 -- .../3rdparty/tensorrt_llm/common/cudaUtils.h | 641 +++++++++++++ .../common/customAllReduceUtils.h | 36 - .../3rdparty/tensorrt_llm/common/envUtils.cpp | 214 ----- .../3rdparty/tensorrt_llm/common/envUtils.h | 60 -- .../3rdparty/tensorrt_llm/common/logger.h | 190 ++++ .../3rdparty/tensorrt_llm/common/mathUtils.h | 37 - .../tensorrt_llm/common/memoryUtils.cu | 906 ------------------ .../tensorrt_llm/common/memoryUtils.h | 292 ------ .../3rdparty/tensorrt_llm/common/mpiUtils.cpp | 588 ------------ .../3rdparty/tensorrt_llm/common/nvtxUtils.h | 46 - .../3rdparty/tensorrt_llm/common/opUtils.cpp | 323 ------- .../3rdparty/tensorrt_llm/common/opUtils.h | 215 ----- .../tensorrt_llm/common/quantization.h | 358 +++++++ .../3rdparty/tensorrt_llm/common/stlUtils.h | 123 --- .../tensorrt_llm/common/stringUtils.h | 113 +++ .../tensorrt_llm/common/timestampUtils.cpp | 42 - .../{timestampUtils.h => tllmException.h} | 27 +- 24 files changed, 1983 insertions(+), 2990 deletions(-) delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/CMakeLists.txt mode change 100755 => 100644 sgl-kernel/3rdparty/tensorrt_llm/common/assert.cpp create mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/assert.h create mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.cpp create mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.h create mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/cudaFp8Utils.h delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/cudaProfilerUtils.cpp create mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/cudaUtils.h delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/customAllReduceUtils.h delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/envUtils.cpp delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/envUtils.h create mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/logger.h delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/mathUtils.h delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/memoryUtils.cu delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/memoryUtils.h delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/mpiUtils.cpp delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/nvtxUtils.h delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/opUtils.cpp delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/opUtils.h create mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/quantization.h delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/stlUtils.h create mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/stringUtils.h delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/timestampUtils.cpp rename sgl-kernel/3rdparty/tensorrt_llm/common/{timestampUtils.h => tllmException.h} (50%) diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/CMakeLists.txt b/sgl-kernel/3rdparty/tensorrt_llm/common/CMakeLists.txt deleted file mode 100644 index e479b298d..000000000 --- a/sgl-kernel/3rdparty/tensorrt_llm/common/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & -# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not -# use this file except in compliance with the License. You may obtain a copy of -# the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations under -# the License. -# -file(GLOB SRCS *.cpp) -file(GLOB CU_SRCS *.cu) - -add_library(common_src OBJECT ${SRCS} ${CU_SRCS}) -set_property(TARGET common_src PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET common_src PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/assert.cpp b/sgl-kernel/3rdparty/tensorrt_llm/common/assert.cpp old mode 100755 new mode 100644 diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/assert.h b/sgl-kernel/3rdparty/tensorrt_llm/common/assert.h new file mode 100644 index 000000000..7f51dbf1b --- /dev/null +++ b/sgl-kernel/3rdparty/tensorrt_llm/common/assert.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "tensorrt_llm/common/stringUtils.h" +#include "tensorrt_llm/common/tllmException.h" + +#include + +namespace tensorrt_llm::common +{ +[[noreturn]] inline void throwRuntimeError(char const* const file, int const line, std::string const& info = "") +{ + throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info.c_str())); +} + +} // namespace tensorrt_llm::common + +class DebugConfig +{ +public: + static bool isCheckDebugEnabled(); +}; + +#if defined(_WIN32) +#define TLLM_LIKELY(x) (__assume((x) == 1), (x)) +#define TLLM_UNLIKELY(x) (__assume((x) == 0), (x)) +#else +#define TLLM_LIKELY(x) __builtin_expect((x), 1) +#define TLLM_UNLIKELY(x) __builtin_expect((x), 0) +#endif + +#define TLLM_CHECK(val) \ + do \ + { \ + TLLM_LIKELY(static_cast(val)) ? ((void) 0) \ + : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \ + } while (0) + +#define TLLM_CHECK_WITH_INFO(val, info, ...) \ + do \ + { \ + TLLM_LIKELY(static_cast(val)) \ + ? ((void) 0) \ + : tensorrt_llm::common::throwRuntimeError( \ + __FILE__, __LINE__, tensorrt_llm::common::fmtstr(info, ##__VA_ARGS__)); \ + } while (0) + +#define TLLM_CHECK_DEBUG(val) \ + do \ + { \ + if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \ + { \ + TLLM_LIKELY(static_cast(val)) ? ((void) 0) \ + : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \ + } \ + } while (0) + +#define TLLM_CHECK_DEBUG_WITH_INFO(val, info, ...) \ + do \ + { \ + if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \ + { \ + TLLM_LIKELY(static_cast(val)) \ + ? ((void) 0) \ + : tensorrt_llm::common::throwRuntimeError( \ + __FILE__, __LINE__, tensorrt_llm::common::fmtstr(info, ##__VA_ARGS__)); \ + } \ + } while (0) + +#define TLLM_THROW(...) \ + do \ + { \ + throw NEW_TLLM_EXCEPTION(__VA_ARGS__); \ + } while (0) + +#define TLLM_WRAP(ex) \ + NEW_TLLM_EXCEPTION("%s: %s", tensorrt_llm::common::TllmException::demangle(typeid(ex).name()).c_str(), ex.what()) diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.cpp b/sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.cpp new file mode 100644 index 000000000..7eca46a1c --- /dev/null +++ b/sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.cpp @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define CUDA_LIB_NAME "cuda" + +#if defined(_WIN32) +#include +#define dllOpen(name) LoadLibrary("nv" name ".dll") +#define dllClose(handle) FreeLibrary(static_cast(handle)) +#define dllGetSym(handle, name) static_cast(GetProcAddress(static_cast(handle), name)) +#else // For non-Windows platforms +#include +#define dllOpen(name) dlopen("lib" name ".so.1", RTLD_LAZY) +#define dllClose(handle) dlclose(handle) +#define dllGetSym(handle, name) dlsym(handle, name) +#endif // defined(_WIN32) + +#include "cudaDriverWrapper.h" +#include "tensorrt_llm/common/assert.h" +#include +#include + +namespace tensorrt_llm::common +{ + +std::shared_ptr CUDADriverWrapper::getInstance() +{ + static std::mutex mutex; + static std::weak_ptr instance; + std::shared_ptr result = instance.lock(); + if (result) + { + return result; + } + + std::lock_guard lock(mutex); + result = instance.lock(); + if (!result) + { + result = std::shared_ptr(new CUDADriverWrapper()); + instance = result; + } + return result; +} + +CUDADriverWrapper::CUDADriverWrapper() + : handle(dllOpen(CUDA_LIB_NAME)) +{ + + TLLM_CHECK_WITH_INFO(handle != nullptr, "CUDA driver library is not open correctly."); + + auto load_sym = [](void* handle, char const* name) + { + void* ret = dllGetSym(handle, name); + return ret; + }; + + *reinterpret_cast(&_cuGetErrorName) = load_sym(handle, "cuGetErrorName"); + *reinterpret_cast(&_cuGetErrorMessage) = load_sym(handle, "cuGetErrorMessage"); + *reinterpret_cast(&_cuFuncSetAttribute) = load_sym(handle, "cuFuncSetAttribute"); + *reinterpret_cast(&_cuLinkComplete) = load_sym(handle, "cuLinkComplete"); + *reinterpret_cast(&_cuModuleUnload) = load_sym(handle, "cuModuleUnload"); + *reinterpret_cast(&_cuLinkDestroy) = load_sym(handle, "cuLinkDestroy"); + *reinterpret_cast(&_cuModuleLoadData) = load_sym(handle, "cuModuleLoadData"); + *reinterpret_cast(&_cuLinkCreate) = load_sym(handle, "cuLinkCreate_v2"); + *reinterpret_cast(&_cuModuleGetFunction) = load_sym(handle, "cuModuleGetFunction"); + *reinterpret_cast(&_cuModuleGetGlobal) = load_sym(handle, "cuModuleGetGlobal_v2"); + *reinterpret_cast(&_cuLinkAddFile) = load_sym(handle, "cuLinkAddFile_v2"); + *reinterpret_cast(&_cuLinkAddData) = load_sym(handle, "cuLinkAddData_v2"); + *reinterpret_cast(&_cuLaunchCooperativeKernel) = load_sym(handle, "cuLaunchCooperativeKernel"); + *reinterpret_cast(&_cuLaunchKernel) = load_sym(handle, "cuLaunchKernel"); + *reinterpret_cast(&_cuTensorMapEncodeTiled) = load_sym(handle, "cuTensorMapEncodeTiled"); + *reinterpret_cast(&_cuMemcpyDtoH) = load_sym(handle, "cuMemcpyDtoH_v2"); +} + +CUDADriverWrapper::~CUDADriverWrapper() +{ + dllClose(handle); +} + +CUresult CUDADriverWrapper::cuGetErrorName(CUresult error, char const** pStr) const +{ + return (*_cuGetErrorName)(error, pStr); +} + +CUresult CUDADriverWrapper::cuGetErrorMessage(CUresult error, char const** pStr) const +{ + return (*_cuGetErrorMessage)(error, pStr); +} + +CUresult CUDADriverWrapper::cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) const +{ + return (*_cuFuncSetAttribute)(hfunc, attrib, value); +} + +CUresult CUDADriverWrapper::cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) const +{ + return (*_cuLinkComplete)(state, cubinOut, sizeOut); +} + +CUresult CUDADriverWrapper::cuModuleUnload(CUmodule hmod) const +{ + return (*_cuModuleUnload)(hmod); +} + +CUresult CUDADriverWrapper::cuLinkDestroy(CUlinkState state) const +{ + return (*_cuLinkDestroy)(state); +} + +CUresult CUDADriverWrapper::cuModuleLoadData(CUmodule* module, void const* image) const +{ + return (*_cuModuleLoadData)(module, image); +} + +CUresult CUDADriverWrapper::cuLinkCreate( + unsigned int numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) const +{ + return (*_cuLinkCreate)(numOptions, options, optionValues, stateOut); +} + +CUresult CUDADriverWrapper::cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, char const* name) const +{ + return (*_cuModuleGetFunction)(hfunc, hmod, name); +} + +CUresult CUDADriverWrapper::cuModuleGetGlobal(CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, char const* name) const +{ + return (*_cuModuleGetGlobal)(dptr, bytes, hmod, name); +} + +CUresult CUDADriverWrapper::cuLinkAddFile(CUlinkState state, CUjitInputType type, char const* path, + unsigned int numOptions, CUjit_option* options, void** optionValues) const +{ + return (*_cuLinkAddFile)(state, type, path, numOptions, options, optionValues); +} + +CUresult CUDADriverWrapper::cuLinkAddData(CUlinkState state, CUjitInputType type, void* data, size_t size, + char const* name, unsigned int numOptions, CUjit_option* options, void** optionValues) const +{ + return (*_cuLinkAddData)(state, type, data, size, name, numOptions, options, optionValues); +} + +CUresult CUDADriverWrapper::cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, + unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, + unsigned int sharedMemBytes, CUstream hStream, void** kernelParams) const +{ + return (*_cuLaunchCooperativeKernel)( + f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams); +} + +CUresult CUDADriverWrapper::cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, + unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, + unsigned int sharedMemBytes, CUstream hStream, void** kernelParams, void** extra) const +{ + return (*_cuLaunchKernel)( + f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra); +} + +CUresult CUDADriverWrapper::cuTensorMapEncodeTiled(CUtensorMap* tensorMap, CUtensorMapDataType tensorDataType, + cuuint32_t tensorRank, void* globalAddress, cuuint64_t const* globalDim, cuuint64_t const* globalStrides, + cuuint32_t const* boxDim, cuuint32_t const* elementStrides, CUtensorMapInterleave interleave, + CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill) const +{ + return (*_cuTensorMapEncodeTiled)(tensorMap, tensorDataType, tensorRank, globalAddress, globalDim, globalStrides, + boxDim, elementStrides, interleave, swizzle, l2Promotion, oobFill); +} + +CUresult CUDADriverWrapper::cuMemcpyDtoH(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount) const +{ + return (*_cuMemcpyDtoH)(dstHost, srcDevice, ByteCount); +} + +} // namespace tensorrt_llm::common diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.h b/sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.h new file mode 100644 index 000000000..c4d470a85 --- /dev/null +++ b/sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CUDA_DRIVER_WRAPPER_H +#define CUDA_DRIVER_WRAPPER_H + +#include "tensorrt_llm/common/assert.h" +#include +#include +#include +#include + +namespace tensorrt_llm::common +{ + +class CUDADriverWrapper +{ +public: + static std::shared_ptr getInstance(); + + ~CUDADriverWrapper(); + CUDADriverWrapper(CUDADriverWrapper const&) = delete; + CUDADriverWrapper operator=(CUDADriverWrapper const&) = delete; + CUDADriverWrapper(CUDADriverWrapper&&) = delete; + CUDADriverWrapper operator=(CUDADriverWrapper&&) = delete; + + CUresult cuGetErrorName(CUresult error, char const** pStr) const; + + CUresult cuGetErrorMessage(CUresult error, char const** pStr) const; + + CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) const; + + CUresult cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) const; + + CUresult cuModuleUnload(CUmodule hmod) const; + + CUresult cuLinkDestroy(CUlinkState state) const; + + CUresult cuModuleLoadData(CUmodule* module, void const* image) const; + + CUresult cuLinkCreate( + unsigned int numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) const; + + CUresult cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, char const* name) const; + + CUresult cuModuleGetGlobal(CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, char const* name) const; + + CUresult cuLinkAddFile(CUlinkState state, CUjitInputType type, char const* path, unsigned int numOptions, + CUjit_option* options, void** optionValues) const; + + CUresult cuLinkAddData(CUlinkState state, CUjitInputType type, void* data, size_t size, char const* name, + unsigned int numOptions, CUjit_option* options, void** optionValues) const; + + CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, + unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, + unsigned int sharedMemBytes, CUstream hStream, void** kernelParams) const; + + CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, + unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, + CUstream hStream, void** kernelParams, void** extra) const; + + CUresult cuTensorMapEncodeTiled(CUtensorMap* tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, + void* globalAddress, cuuint64_t const* globalDim, cuuint64_t const* globalStrides, cuuint32_t const* boxDim, + cuuint32_t const* elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, + CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill) const; + + CUresult cuMemcpyDtoH(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount) const; + +private: + void* handle; + CUDADriverWrapper(); + + CUresult (*_cuGetErrorName)(CUresult, char const**); + CUresult (*_cuGetErrorMessage)(CUresult, char const**); + CUresult (*_cuFuncSetAttribute)(CUfunction, CUfunction_attribute, int); + CUresult (*_cuLinkComplete)(CUlinkState, void**, size_t*); + CUresult (*_cuModuleUnload)(CUmodule); + CUresult (*_cuLinkDestroy)(CUlinkState); + CUresult (*_cuLinkCreate)(unsigned int, CUjit_option*, void**, CUlinkState*); + CUresult (*_cuModuleLoadData)(CUmodule*, void const*); + CUresult (*_cuModuleGetFunction)(CUfunction*, CUmodule, char const*); + CUresult (*_cuModuleGetGlobal)(CUdeviceptr*, size_t*, CUmodule, char const*); + CUresult (*_cuLinkAddFile)(CUlinkState, CUjitInputType, char const*, unsigned int, CUjit_option*, void**); + CUresult (*_cuLinkAddData)( + CUlinkState, CUjitInputType, void*, size_t, char const*, unsigned int, CUjit_option*, void**); + CUresult (*_cuLaunchCooperativeKernel)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, + unsigned int, unsigned int, unsigned int, CUstream, void**); + CUresult (*_cuLaunchKernel)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, + unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, + CUstream hStream, void** kernelParams, void** extra); + CUresult (*_cuTensorMapEncodeTiled)(CUtensorMap* tensorMap, CUtensorMapDataType tensorDataType, + cuuint32_t tensorRank, void* globalAddress, cuuint64_t const* globalDim, cuuint64_t const* globalStrides, + cuuint32_t const* boxDim, cuuint32_t const* elementStrides, CUtensorMapInterleave interleave, + CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill); + CUresult (*_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); +}; + +template +void checkDriver( + T result, CUDADriverWrapper const& wrap, char const* const func, char const* const file, int const line) +{ + if (result) + { + char const* errorName = nullptr; + char const* errorMsg = nullptr; + wrap.cuGetErrorName(result, &errorName); + wrap.cuGetErrorMessage(result, &errorMsg); + throw TllmException( + file, line, fmtstr("[TensorRT-LLM][ERROR] CUDA driver error in %s: %s: %s", func, errorName, errorMsg)); + } +} + +} // namespace tensorrt_llm::common + +/* + * Macros compliant with TensorRT coding conventions + */ +#define TLLM_CU_CHECK(stat) \ + do \ + { \ + tensorrt_llm::common::checkDriver( \ + (stat), *tensorrt_llm::common::CUDADriverWrapper::getInstance(), #stat, __FILE__, __LINE__); \ + } while (0) + +#endif // CUDA_DRIVER_WRAPPER_H diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/cudaFp8Utils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/cudaFp8Utils.h new file mode 100644 index 000000000..aa93b55a5 --- /dev/null +++ b/sgl-kernel/3rdparty/tensorrt_llm/common/cudaFp8Utils.h @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef ENABLE_FP8 +#include +#include +#include + +#define FP8_MHA +#define FUSE_GEMM_ACT +#define FP8_GEMM_OUTPUT_QUANT_DISABLE + +#ifdef FUSE_GEMM_ACT +#define USE_QGMMA +#endif + +namespace tensorrt_llm +{ +namespace common +{ + +constexpr float FP8_E4M3_MAX = 448.0f; + +enum QuantizeMode +{ + PER_CHANNEL, + PER_TENSOR, + PER_CHANNEL_WEIGHT_PER_TENSOR_ACT, + PER_TOKEN, +}; + +// Packed Data Type +typedef struct __CUDA_ALIGN__(32) +{ + float array[8]; +} float8; + +typedef struct __CUDA_ALIGN__(16) +{ + half array[8]; +} half8; + +typedef struct __CUDA_ALIGN__(8) +{ + half2 array[2]; +} half2_2; + +typedef struct __CUDA_ALIGN__(8) +{ + half array[4]; +} half_4; + +#ifdef ENABLE_BF16 +typedef struct __CUDA_ALIGN__(4) +{ + __nv_bfloat16 array[2]; +} __nv_bfloat16_2; + +typedef struct __CUDA_ALIGN__(8) +{ + __nv_bfloat162 x, y; +} __nv_bfloat162_2_xy; + +typedef struct __CUDA_ALIGN__(8) +{ + __nv_bfloat16 array[4]; +} __nv_bfloat164; + +typedef struct __CUDA_ALIGN__(8) +{ + __nv_bfloat162 array[2]; +} __nv_bfloat162_2; + +typedef struct __CUDA_ALIGN__(16) +{ + __nv_bfloat16 array[8]; +} __nv_bfloat168; + +typedef struct __CUDA_ALIGN__(16) +{ + __nv_bfloat162 array[4]; +} __nv_bfloat162_4; + +typedef struct __CUDA_ALIGN__(32) +{ + __nv_bfloat16 array[16]; +} __nv_bfloat1616; +#endif + +#ifdef ENABLE_FP8 +typedef struct __CUDA_ALIGN__(2) +{ + __nv_fp8_e4m3 array[2]; +} __nv_fp8_2_e4m3; + +typedef struct __CUDA_ALIGN__(4) +{ + __nv_fp8_e4m3 array[4]; +} __nv_fp8_4_e4m3; + +typedef struct __CUDA_ALIGN__(4) +{ + __nv_fp8x2_e4m3 array[2]; +} __nv_fp8x2_x2_e4m3; + +typedef struct __CUDA_ALIGN__(8) +{ + __nv_fp8_e4m3 array[8]; +} __nv_fp8_8_e4m3; + +typedef struct __CUDA_ALIGN__(8) +{ + __nv_fp8x2_e4m3 array[4]; +} __nv_fp8x2_x4_e4m3; + +typedef struct __CUDA_ALIGN__(16) +{ + __nv_fp8_e4m3 array[16]; +} __nv_fp8x16_e4m3; +#endif + +// only BF16 and FP8 +template +struct PackType +{ + using type = float; +}; + +#ifdef ENABLE_BF16 +template <> +struct PackType<__nv_bfloat16, 2> +{ + using type = __nv_bfloat16_2; +}; + +template <> +struct PackType<__nv_bfloat16, 4> +{ + using type = __nv_bfloat164; +}; + +template <> +struct PackType<__nv_bfloat16, 8> +{ + using type = __nv_bfloat168; +}; +#endif + +#ifdef ENABLE_FP8 +template <> +struct PackType<__nv_fp8_e4m3, 2> +{ + using type = __nv_fp8_2_e4m3; +}; + +template <> +struct PackType<__nv_fp8_e4m3, 4> +{ + using type = __nv_fp8_4_e4m3; +}; + +template <> +struct PackType<__nv_fp8_e4m3, 8> +{ + using type = __nv_fp8_8_e4m3; +}; +#endif + +__inline__ __device__ void fp8x4_e4m3_to_bfloat2(__nv_bfloat162* out1, __nv_bfloat162* out2, __nv_fp8x4_e4m3 const* in) +{ + const char4 tmp_val = reinterpret_cast(in)[0]; + *out1 = __nv_bfloat162((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.x)[0], + (float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.y)[0]); + *out2 = __nv_bfloat162((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.z)[0], + (float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.w)[0]); +} + +__inline__ __device__ __nv_bfloat162 fp8x2_e4m3_to_bfloat2(__nv_fp8x2_e4m3 const* in) +{ + const char2 tmp_val = reinterpret_cast(in)[0]; + __nv_bfloat162 out = __nv_bfloat162((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.x)[0], + (float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.y)[0]); + return out; +} + +__inline__ __device__ void fp8x4_e4m3_to_half2(half2* out1, half2* out2, __nv_fp8x4_e4m3 const* in) +{ + const char4 tmp_val = reinterpret_cast(in)[0]; + *out1 = half2((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.x)[0], + (float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.y)[0]); + *out2 = half2((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.z)[0], + (float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.w)[0]); +} + +__inline__ __device__ half2 fp8x2_e4m3_to_half2(__nv_fp8x2_e4m3 const* in) +{ + const char2 tmp_val = reinterpret_cast(in)[0]; + half2 out = half2((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.x)[0], + (float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.y)[0]); + return out; +} + +template +void invokeQuantizeMatrix(T_OUT* output, T_S const* input_qua_amax_ptr, T_IN const* input, int64_t numel, int64_t lda, + QuantizeMode quantize_mode, cudaStream_t stream); + +template +void invokeDequantizeMatrix(T_OUT* output, T_S const* input_qua_amax_ptr, T_IN const* input, int64_t numel, int64_t lda, + QuantizeMode quantize_mode, cudaStream_t stream); + +template +void invokeFakeQuantize(T_OUT* dst, const T_IN* src, const int64_t numel, cudaStream_t stream); + +template +void invokeComputeFP8QuantizeScale(T_S* quant_ptr, const T_W* weights, const int64_t k, const int64_t lda, + QuantizeMode quantize_mode, cudaStream_t stream); + +template +void invokeComputeScalesAndQuantizeMatrix(T_OUT* output, T_S* quant_ptr, const T_IN* weights, const int64_t numel, + const int64_t lda, QuantizeMode quantize_mode, cudaStream_t stream); + +} // namespace common +} // namespace tensorrt_llm +#endif // ENABLE_FP8 diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/cudaProfilerUtils.cpp b/sgl-kernel/3rdparty/tensorrt_llm/common/cudaProfilerUtils.cpp deleted file mode 100644 index 5576fe782..000000000 --- a/sgl-kernel/3rdparty/tensorrt_llm/common/cudaProfilerUtils.cpp +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "tensorrt_llm/common/cudaProfilerUtils.h" -#include "tensorrt_llm/common/logger.h" -#include "tensorrt_llm/common/stringUtils.h" -#include -#include - -namespace -{ - -std::tuple, std::unordered_set> populateIterationIndexesImpl( - std::string const& envVarName) -{ - auto envVarVal = std::getenv(envVarName.c_str()); - auto envVarValStr = std::string{envVarVal != nullptr ? envVarVal : ""}; - auto values = tensorrt_llm::common::str2set(envVarValStr, ','); - std::unordered_set startSet; - std::unordered_set endSet; - for (std::string const& value : values) - { - size_t dashIdx = value.find("-"); - if (dashIdx != std::string::npos) - { - int32_t start = std::stoi(value.substr(0, dashIdx)); - startSet.insert(start); - int32_t end = std::stoi(value.substr(dashIdx + 1)); - endSet.insert(end); - } - else - { - int32_t start_end = std::stoi(value); - startSet.insert(start_end); - endSet.insert(start_end); - } - } - - return std::make_pair(startSet, endSet); -} - -} // namespace - -namespace tensorrt_llm::common -{ - -std::pair, std::unordered_set> populateIterationIndexes( - std::string const& envVarName, std::optional const& legacyEnvVarName) -{ - auto [profileIterIdxs, stopIterIdxs] = populateIterationIndexesImpl(envVarName); - - // If empty, try to use legacy env var name - if (legacyEnvVarName && profileIterIdxs.empty() && stopIterIdxs.empty()) - { - std::tie(profileIterIdxs, stopIterIdxs) = populateIterationIndexesImpl(legacyEnvVarName.value()); - - if (!profileIterIdxs.empty() || !stopIterIdxs.empty()) - { - TLLM_LOG_WARNING( - "Using deprecated environment variable %s to specify cudaProfiler start and stop iterations. " - "Please " - "use %s " - "instead.", - legacyEnvVarName.value().c_str(), envVarName.c_str()); - } - } - - return std::make_pair(profileIterIdxs, stopIterIdxs); -} - -} // namespace tensorrt_llm::common diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/cudaUtils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/cudaUtils.h new file mode 100644 index 000000000..13ee3367e --- /dev/null +++ b/sgl-kernel/3rdparty/tensorrt_llm/common/cudaUtils.h @@ -0,0 +1,641 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "tensorrt_llm/common/cudaBf16Wrapper.h" +#include "tensorrt_llm/common/cudaDriverWrapper.h" +#include "tensorrt_llm/common/cudaFp8Utils.h" +#include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/common/tllmException.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef _WIN32 // Linux +#include +#endif // not WIN32 +#include +#ifdef _WIN32 // Windows +#include +#undef ERROR // A Windows header file defines ERROR as 0, but it's used in our logger.h enum. Logging breaks without + // this undef. +#endif // WIN32 + +namespace tensorrt_llm::common +{ + +// workspace for cublas gemm : 32MB +#define CUBLAS_WORKSPACE_SIZE 33554432 + +typedef struct __align__(4) +{ + half x, y, z, w; +} + +half4; + +/* **************************** type definition ***************************** */ + +enum CublasDataType +{ + FLOAT_DATATYPE = 0, + HALF_DATATYPE = 1, + BFLOAT16_DATATYPE = 2, + INT8_DATATYPE = 3, + FP8_DATATYPE = 4 +}; + +enum TRTLLMCudaDataType +{ + FP32 = 0, + FP16 = 1, + BF16 = 2, + INT8 = 3, + FP8 = 4 +}; + +enum class OperationType +{ + FP32, + FP16, + BF16, + INT8, + FP8 +}; + +/* **************************** debug tools ********************************* */ +static char const* _cudaGetErrorEnum(cudaError_t error) +{ + return cudaGetErrorString(error); +} + +static char const* _cudaGetErrorEnum(cublasStatus_t error) +{ + switch (error) + { + case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; + + case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR"; + } + return ""; +} + +template +void check(T result, char const* const func, char const* const file, int const line) +{ + if (result) + { + throw TllmException( + file, line, fmtstr("[TensorRT-LLM][ERROR] CUDA runtime error in %s: %s", func, _cudaGetErrorEnum(result))); + } +} + +template +void checkEx(T result, std::initializer_list const& validReturns, char const* const func, char const* const file, + int const line) +{ + if (std::all_of(std::begin(validReturns), std::end(validReturns), [&result](T const& t) { return t != result; })) + { + throw TllmException( + file, line, fmtstr("[TensorRT-LLM][ERROR] CUDA runtime error in %s: %s", func, _cudaGetErrorEnum(result))); + } +} + +#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__) +#define check_cuda_error_2(val, file, line) check((val), #val, file, line) + +inline std::optional isCudaLaunchBlocking() +{ + static bool firstCall = true; + static std::optional result = std::nullopt; + + if (firstCall) + { + char const* env = std::getenv("CUDA_LAUNCH_BLOCKING"); + if (env != nullptr && std::string(env) == "1") + { + result = true; + } + else if (env != nullptr && std::string(env) == "0") + { + result = false; + } + firstCall = false; + } + + return result; +} + +inline bool doCheckError() +{ + auto const cudaLaunchBlocking = isCudaLaunchBlocking(); +#ifndef NDEBUG + bool const checkError = cudaLaunchBlocking.value_or(true); +#else + bool const checkError = cudaLaunchBlocking.value_or(false); +#endif + + return checkError; +} + +inline void syncAndCheck(char const* const file, int const line) +{ + if (doCheckError()) + { + cudaDeviceSynchronize(); + check(cudaGetLastError(), "cudaGetLastError", file, line); + } +} + +#define sync_check_cuda_error() tensorrt_llm::common::syncAndCheck(__FILE__, __LINE__) + +#define PRINT_FUNC_NAME_() \ + do \ + { \ + std::cout << "[TensorRT-LLM][CALL] " << __FUNCTION__ << " " << std::endl; \ + } while (0) + +// clang-format off +template struct packed_type; +template <> struct packed_type { using type = float; }; // we don't need to pack float by default +template <> struct packed_type { using type = half2; }; + +#ifdef ENABLE_BF16 +template<> +struct packed_type<__nv_bfloat16> { + using type = __nv_bfloat162; +}; +#endif + +#ifdef ENABLE_FP8 +template<> +struct packed_type<__nv_fp8_e4m3> { + using type = __nv_fp8x2_e4m3; +}; +#endif + +template struct num_elems; +template <> struct num_elems { static constexpr int value = 1; }; +template <> struct num_elems { static constexpr int value = 2; }; +template <> struct num_elems { static constexpr int value = 4; }; +template <> struct num_elems { static constexpr int value = 1; }; +template <> struct num_elems { static constexpr int value = 2; }; +#ifdef ENABLE_BF16 +template <> struct num_elems<__nv_bfloat16> { static constexpr int value = 1; }; +template <> struct num_elems<__nv_bfloat162> { static constexpr int value = 2; }; +#endif +#ifdef ENABLE_FP8 +template <> struct num_elems<__nv_fp8_e4m3> { static constexpr int value = 1; }; +template <> struct num_elems<__nv_fp8x2_e4m3> { static constexpr int value = 2; }; +#endif + +template struct packed_as; +template struct packed_as { using type = T; }; +template<> struct packed_as { using type = half2; }; +template<> struct packed_as { using type = float2; }; +template<> struct packed_as { using type = int16_t; }; +template<> struct packed_as { using type = int2; }; +template<> struct packed_as { using type = half; }; +template<> struct packed_as { using type = float; }; +#ifdef ENABLE_BF16 +template<> struct packed_as<__nv_bfloat16, 2> { using type = __nv_bfloat162; }; +template<> struct packed_as<__nv_bfloat162, 1> { using type = __nv_bfloat16; }; +#endif +#ifdef ENABLE_FP8 +template<> struct packed_as<__nv_fp8_e4m3, 2> { using type = __nv_fp8x2_e4m3; }; +template<> struct packed_as<__nv_fp8x2_e4m3, 1> { using type = __nv_fp8_e4m3; }; +template<> struct packed_as<__nv_fp8_e5m2, 2> { using type = __nv_fp8x2_e5m2; }; +template<> struct packed_as<__nv_fp8x2_e5m2, 1> { using type = __nv_fp8_e5m2; }; +#endif + +inline __device__ float2 operator*(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); } +inline __device__ float2 operator+(float2 a, float2 b) { return make_float2(a.x + b.x, a.y + b.y); } +inline __device__ float2 operator-(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); } + +inline __device__ float2 operator*(float2 a, float b) { return make_float2(a.x * b, a.y * b); } +inline __device__ float2 operator+(float2 a, float b) { return make_float2(a.x + b, a.y + b); } +inline __device__ float2 operator-(float2 a, float b) { return make_float2(a.x - b, a.y - b); } + +// clang-format on + +template +struct CudaDataType +{ +}; + +template <> +struct CudaDataType +{ + static constexpr cudaDataType_t value = cudaDataType::CUDA_R_32F; +}; + +template <> +struct CudaDataType +{ + static constexpr cudaDataType_t value = cudaDataType::CUDA_R_16F; +}; + +#ifdef ENABLE_BF16 +template <> +struct CudaDataType<__nv_bfloat16> +{ + static constexpr cudaDataType_t value = cudaDataType::CUDA_R_16BF; +}; +#endif + +inline int getSMVersion() +{ + int device{-1}; + check_cuda_error(cudaGetDevice(&device)); + int sm_major = 0; + int sm_minor = 0; + check_cuda_error(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device)); + check_cuda_error(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device)); + return sm_major * 10 + sm_minor; +} + +inline int getDevice() +{ + int current_dev_id = 0; + check_cuda_error(cudaGetDevice(¤t_dev_id)); + return current_dev_id; +} + +inline int getDeviceCount() +{ + int count = 0; + check_cuda_error(cudaGetDeviceCount(&count)); + return count; +} + +/// @brief Identifies the memory type of the given pointer. +template +cudaMemoryType getPtrCudaMemoryType(T* ptr) +{ + cudaPointerAttributes attributes{}; + check_cuda_error(cudaPointerGetAttributes(&attributes, ptr)); + return attributes.type; +} + +/// Get the memory info +/// \return The free and total amount of memory in bytes +inline std::tuple getDeviceMemoryInfo(bool const useUvm) +{ + if (useUvm) + { + size_t freeSysMem = 0; + size_t totalSysMem = 0; +#ifndef _WIN32 // Linux + struct sysinfo info + { + }; + + sysinfo(&info); + totalSysMem = info.totalram * info.mem_unit; + freeSysMem = info.freeram * info.mem_unit; +#else // Windows + MEMORYSTATUSEX memInfo; + memInfo.dwLength = sizeof(memInfo); + GlobalMemoryStatusEx(&memInfo); + totalSysMem = memInfo.ullTotalPhys; + freeSysMem = memInfo.ullAvailPhys; +#endif // WIN32 + + TLLM_LOG_INFO("Using UVM based system memory for KV cache, total memory %0.2f GB, available memory %0.2f GB", + ((double) totalSysMem / 1e9), ((double) freeSysMem / 1e9)); + return {freeSysMem, totalSysMem}; + } + + size_t free = 0; + size_t total = 0; + check_cuda_error(cudaMemGetInfo(&free, &total)); + TLLM_LOG_DEBUG("Using GPU memory for KV cache, total memory %0.2f GB, available memory %0.2f GB", + ((double) total / 1e9), ((double) free / 1e9)); + return {free, total}; +} + +/// @brief Gets the memory allocation granularity for the current device. +/// +/// @return size_t The size of the smallest difference in memory size supported by the current device. +inline size_t getAllocationGranularity() +{ + auto const currentDevice = getDevice(); + ::CUmemAllocationProp prop = {}; + + prop.type = ::CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = ::CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = currentDevice; + prop.requestedHandleTypes = ::CU_MEM_HANDLE_TYPE_NONE; + + // Get the minimum granularity supported for allocation with cuMemCreate() + size_t granularity = 0; + TLLM_CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + return granularity; +} + +inline int getMultiProcessorCount() +{ + int device_id = 0; + int multi_processor_count = 0; + check_cuda_error(cudaGetDevice(&device_id)); + check_cuda_error(cudaDeviceGetAttribute(&multi_processor_count, cudaDevAttrMultiProcessorCount, device_id)); + return multi_processor_count; +} + +inline int getMaxSharedMemoryPerBlockOptin() +{ + int device_id = 0; + int max_shared_memory_per_block = 0; + check_cuda_error(cudaGetDevice(&device_id)); + check_cuda_error( + cudaDeviceGetAttribute(&max_shared_memory_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id)); + return max_shared_memory_per_block; +} + +template +inline size_t divUp(const T1& a, const T2& n) +{ + auto const tmp_a = static_cast(a); + auto const tmp_n = static_cast(n); + return (tmp_a + tmp_n - 1) / tmp_n; +} + +inline int roundUp(int a, int n) +{ + return divUp(a, n) * n; +} + +template ::value>, + typename = std::enable_if_t::value>> +auto constexpr ceilDiv(T numerator, U denominator) +{ + return (numerator + denominator - 1) / denominator; +} + +template +void printAbsMean(T const* buf, uint64_t size, cudaStream_t stream, std::string name = "") +{ + if (buf == nullptr) + { + TLLM_LOG_WARNING("%s is an nullptr, skip!", name.c_str()); + return; + } + cudaDeviceSynchronize(); + check_cuda_error(cudaGetLastError()); + T* h_tmp = new T[size]; + cudaMemcpyAsync(h_tmp, buf, sizeof(T) * size, cudaMemcpyDeviceToHost, stream); + cudaDeviceSynchronize(); + check_cuda_error(cudaGetLastError()); + double sum = 0.0f; + uint64_t zero_count = 0; + float max_val = -1e10; + bool find_inf = false; + for (uint64_t i = 0; i < size; i++) + { + if (std::isinf((float) (h_tmp[i]))) + { + find_inf = true; + continue; + } + sum += abs((double) h_tmp[i]); + if ((float) h_tmp[i] == 0.0f) + { + zero_count++; + } + max_val = max_val > abs(float(h_tmp[i])) ? max_val : abs(float(h_tmp[i])); + } + TLLM_LOG_INFO("%20s size: %u, abs mean: %f, abs sum: %f, abs max: %f, find inf: %s", name.c_str(), size, sum / size, + sum, max_val, find_inf ? "true" : "false"); + delete[] h_tmp; + cudaDeviceSynchronize(); + check_cuda_error(cudaGetLastError()); +} + +template +void printToStream(T const* result, int const size, FILE* strm) +{ + bool const split_rows = (strm == stdout); + if (result == nullptr) + { + TLLM_LOG_WARNING("It is an nullptr, skip! \n"); + return; + } + T* tmp = reinterpret_cast(malloc(sizeof(T) * size)); + check_cuda_error(cudaMemcpy(tmp, result, sizeof(T) * size, cudaMemcpyDeviceToHost)); + for (int i = 0; i < size; ++i) + { + fprintf(strm, "%f, ", static_cast(tmp[i])); + if (split_rows && ((i + 1) % 10) == 0) + fprintf(strm, "\n"); + } + if (!split_rows || (size % 10) != 0) + { + fprintf(strm, "\n"); + } + free(tmp); +} + +template +void printToScreen(T const* result, int const size) +{ + printToStream(result, size, stdout); +} + +template +void print2dToStream(T const* result, int const r, int const c, int const stride, FILE* strm) +{ + if (result == nullptr) + { + TLLM_LOG_WARNING("It is an nullptr, skip! \n"); + return; + } + for (int ri = 0; ri < r; ++ri) + { + T const* ptr = result + ri * stride; + printToStream(ptr, c, strm); + } + fprintf(strm, "\n"); +} + +template +void print2dToScreen(T const* result, int const r, int const c, int const stride) +{ + print2dToStream(result, r, c, stride, stdout); +} + +template +void print2dToFile(std::string fname, T const* result, int const r, int const c, int const stride) +{ + FILE* fp = fopen(fname.c_str(), "wt"); + if (fp != nullptr) + { + print2dToStream(result, r, c, stride, fp); + fclose(fp); + } +} + +inline void print_float_(float x) +{ + printf("%7.3f ", x); +} + +inline void print_element_(float x) +{ + print_float_(x); +} + +inline void print_element_(half x) +{ + print_float_((float) x); +} + +#ifdef ENABLE_BF16 +inline void print_element_(__nv_bfloat16 x) +{ + print_float_((float) x); +} +#endif + +#ifdef ENABLE_FP8 +inline void print_element_(__nv_fp8_e4m3 x) +{ + print_float_((float) x); +} +#endif + +inline void print_element_(uint32_t ul) +{ + printf("%7" PRIu32, ul); +} + +inline void print_element_(uint64_t ull) +{ + printf("%7" PRIu64, ull); +} + +inline void print_element_(int32_t il) +{ + printf("%7" PRId32, il); +} + +inline void print_element_(int64_t ill) +{ + printf("%7" PRId64, ill); +} + +template +inline void printMatrix(T const* ptr, int m, int k, int stride, bool is_device_ptr) +{ + T* tmp; + if (is_device_ptr) + { + // k < stride ; stride = col-dimension. + tmp = reinterpret_cast(malloc(m * stride * sizeof(T))); + check_cuda_error(cudaMemcpy(tmp, ptr, sizeof(T) * m * stride, cudaMemcpyDeviceToHost)); + cudaDeviceSynchronize(); + } + else + { + tmp = const_cast(ptr); + } + + for (int ii = -1; ii < m; ++ii) + { + if (ii >= 0) + { + printf("%07d ", ii); + } + else + { + printf(" "); + } + + for (int jj = 0; jj < k; jj += 1) + { + if (ii >= 0) + { + print_element_(tmp[ii * stride + jj]); + } + else + { + printf("%7d ", jj); + } + } + printf("\n"); + } + if (is_device_ptr) + { + free(tmp); + } +} + +template void printMatrix(float const* ptr, int m, int k, int stride, bool is_device_ptr); +template void printMatrix(half const* ptr, int m, int k, int stride, bool is_device_ptr); +#ifdef ENABLE_BF16 +template void printMatrix(__nv_bfloat16 const* ptr, int m, int k, int stride, bool is_device_ptr); +#endif +#ifdef ENABLE_FP8 +template void printMatrix(__nv_fp8_e4m3 const* ptr, int m, int k, int stride, bool is_device_ptr); +#endif +template void printMatrix(uint32_t const* ptr, int m, int k, int stride, bool is_device_ptr); +template void printMatrix(uint64_t const* ptr, int m, int k, int stride, bool is_device_ptr); +template void printMatrix(int const* ptr, int m, int k, int stride, bool is_device_ptr); + +} // namespace tensorrt_llm::common + +/* + * Macros compliant with TensorRT coding conventions + */ +#define TLLM_CUDA_CHECK(stat) \ + do \ + { \ + tensorrt_llm::common::check((stat), #stat, __FILE__, __LINE__); \ + } while (0) + +// We use singleton memory pool and the order of destructors depends on the compiler implementation. We find that the +// cudaFree/cudaFreeHost is called after cudaruntime destruction on Windows. There will be an cudaErrorCudartUnloading +// error. However, it is safe to ignore this error because the cuda runtime is already exited, we are no more worried +// about the memory leaks. +#define TLLM_CUDA_CHECK_FREE_RESOURCE(stat) \ + do \ + { \ + tensorrt_llm::common::checkEx((stat), {cudaSuccess, cudaErrorCudartUnloading}, #stat, __FILE__, __LINE__); \ + } while (0) diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/customAllReduceUtils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/customAllReduceUtils.h deleted file mode 100644 index d7bf43b40..000000000 --- a/sgl-kernel/3rdparty/tensorrt_llm/common/customAllReduceUtils.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace tensorrt_llm::utils::customAllReduceUtils -{ - -constexpr size_t NUM_POINTERS_PER_RANK = 7; - -// WARNING: MUST BE KEPT IN SYNC with tensorrt_llm/plugin/plugin.py -inline size_t getMaxRequiredWorkspaceSize(int worldSize) noexcept -{ - if (worldSize <= 2) - { - return 16 * 1000 * 1000; - } - return 8 * 1000 * 1000; -} - -} // namespace tensorrt_llm::utils::customAllReduceUtils diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/envUtils.cpp b/sgl-kernel/3rdparty/tensorrt_llm/common/envUtils.cpp deleted file mode 100644 index 64d3d44ac..000000000 --- a/sgl-kernel/3rdparty/tensorrt_llm/common/envUtils.cpp +++ /dev/null @@ -1,214 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "envUtils.h" -#include "tensorrt_llm/common/cudaUtils.h" -#include "tensorrt_llm/common/logger.h" -#include - -namespace tensorrt_llm::common -{ - -std::optional getIntEnv(char const* name) -{ - char const* const env = std::getenv(name); - if (env == nullptr) - { - return std::nullopt; - } - int32_t const val = std::stoi(env); - if (val <= 0) - { - return std::nullopt; - } - return {val}; -}; - -// Returns true if the env variable exists and is set to "1" -static bool getBoolEnv(char const* name) -{ - char const* env = std::getenv(name); - return env && env[0] == '1' && env[1] == '\0'; -} - -// XQA kernels (optimized kernels for generation phase). -bool forceXQAKernels() -{ - static bool const forceXQA = (getIntEnv("TRTLLM_FORCE_XQA").value_or(0) != 0); - return forceXQA; -} - -std::optional getEnvEnableXQAJIT() -{ - static bool init = false; - static bool exists = false; - static bool enableXQAJIT = false; - if (!init) - { - init = true; - char const* enable_xqa_jit_var = std::getenv("TRTLLM_ENABLE_XQA_JIT"); - if (enable_xqa_jit_var) - { - exists = true; - if (enable_xqa_jit_var[0] == '1' && enable_xqa_jit_var[1] == '\0') - { - enableXQAJIT = true; - } - } - } - if (exists) - { - return enableXQAJIT; - } - else - { - return std::nullopt; - } -} - -// Tune the number of blocks per sequence for accuracy/performance purpose. -bool getEnvMmhaMultiblockDebug() -{ - static bool init = false; - static bool forceMmhaMaxSeqLenTile = false; - if (!init) - { - init = true; - char const* enable_mmha_debug_var = std::getenv("TRTLLM_ENABLE_MMHA_MULTI_BLOCK_DEBUG"); - if (enable_mmha_debug_var) - { - if (enable_mmha_debug_var[0] == '1' && enable_mmha_debug_var[1] == '\0') - { - forceMmhaMaxSeqLenTile = true; - } - } - } - return forceMmhaMaxSeqLenTile; -} - -int getEnvMmhaBlocksPerSequence() -{ - static bool init = false; - static int mmhaBlocksPerSequence = 0; - if (!init) - { - init = true; - char const* mmhaBlocksPerSequenceEnv = std::getenv("TRTLLM_MMHA_BLOCKS_PER_SEQUENCE"); - if (mmhaBlocksPerSequenceEnv) - { - mmhaBlocksPerSequence = std::atoi(mmhaBlocksPerSequenceEnv); - if (mmhaBlocksPerSequence <= 0) - { - TLLM_LOG_WARNING("Invalid value for TRTLLM_MMHA_BLOCKS_PER_SEQUENCE. Will use default values instead!"); - } - } - } - return mmhaBlocksPerSequence; -} - -int getEnvMmhaKernelBlockSize() -{ - static bool init = false; - static int mmhaKernelBlockSize = 0; - if (!init) - { - init = true; - char const* mmhaKernelBlockSizeEnv = std::getenv("TRTLLM_MMHA_KERNEL_BLOCK_SIZE"); - if (mmhaKernelBlockSizeEnv) - { - mmhaKernelBlockSize = std::atoi(mmhaKernelBlockSizeEnv); - if (mmhaKernelBlockSize <= 0) - { - TLLM_LOG_WARNING("Invalid value for TRTLLM_MMHA_KERNEL_BLOCK_SIZE. Will use default values instead!"); - } - } - } - return mmhaKernelBlockSize; -} - -bool getEnvEnablePDL() -{ - static bool init = false; - static bool enablePDL = false; - if (!init) - { - init = true; - // PDL only available when arch >= 90 - if (getSMVersion() >= 90) - { - // PDL will be enabled by setting the env variables `TRTLLM_ENABLE_PDL` to `1` - enablePDL = getBoolEnv("TRTLLM_ENABLE_PDL"); - } - } - return enablePDL; -} - -bool getEnvUseUCXKvCache() -{ - static bool const useUCXKVCache = getBoolEnv("TRTLLM_USE_UCX_KVCACHE"); - return useUCXKVCache; -} - -std::string getEnvUCXInterface() -{ - static bool init = false; - static std::string ucxInterface; - if (!init) - { - init = true; - { - char const* ucx_interface = std::getenv("TRTLLM_UCX_INTERFACE"); - if (ucx_interface) - { - ucxInterface = ucx_interface; - } - } - } - return ucxInterface; -} - -bool getEnvDisaggLayerwise() -{ - static bool const disaggLayerwise = getBoolEnv("TRTLLM_DISAGG_LAYERWISE"); - return disaggLayerwise; -} - -bool getEnvParallelCacheSend() -{ - static bool const parallelCacheSend = getBoolEnv("TRTLLM_PARALLEL_CACHE_SEND"); - return parallelCacheSend; -} - -bool getEnvRequestKVCacheSerial() -{ - static bool const requestKVCacheSerial = getBoolEnv("TRTLLM_REQUEST_KV_CACHE_SERIAL"); - return requestKVCacheSerial; -} - -bool getEnvDisableKVCacheTransferOverlap() -{ - static bool const disableKVCacheTransferOverlap = getBoolEnv("TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP"); - return disableKVCacheTransferOverlap; -} - -bool getEnvDisableReceiveKVCacheParallel() -{ - static bool const disableReceiveParallel = getBoolEnv("TRTLLM_DISABLE_KVCACHE_RECEIVE_PARALLEL"); - return disableReceiveParallel; -} - -} // namespace tensorrt_llm::common diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/envUtils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/envUtils.h deleted file mode 100644 index 027c7cfbb..000000000 --- a/sgl-kernel/3rdparty/tensorrt_llm/common/envUtils.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include -#include -#include - -namespace tensorrt_llm::common -{ -// Useful when you want to inject some debug code controllable with env var. -std::optional getIntEnv(char const* name); - -// XQA kernels (optimized kernels for generation phase). -bool forceXQAKernels(); - -// Whether XQA JIT is enabled. -// -// Returns the value of TRTLLM_ENABLE_XQA_JIT env var. If such env var doesn't exist, std::nullopt is returned. -std::optional getEnvEnableXQAJIT(); - -// Tune the number of blocks per sequence for accuracy/performance purpose. -bool getEnvMmhaMultiblockDebug(); - -int getEnvMmhaBlocksPerSequence(); - -int getEnvMmhaKernelBlockSize(); - -// Whether PDL is enabled. -bool getEnvEnablePDL(); - -bool getEnvUseUCXKvCache(); - -std::string getEnvUCXInterface(); - -bool getEnvDisaggLayerwise(); - -bool getEnvParallelCacheSend(); - -bool getEnvRequestKVCacheSerial(); - -bool getEnvDisableKVCacheTransferOverlap(); - -bool getEnvDisableReceiveKVCacheParallel(); - -} // namespace tensorrt_llm::common diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/logger.h b/sgl-kernel/3rdparty/tensorrt_llm/common/logger.h new file mode 100644 index 000000000..df84e2263 --- /dev/null +++ b/sgl-kernel/3rdparty/tensorrt_llm/common/logger.h @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/stringUtils.h" + +namespace tensorrt_llm::common +{ + +class Logger +{ + +// On Windows, the file wingdi.h is included which has +// #define ERROR 0 +// This breaks everywhere ERROR is used in the Level enum +#ifdef _WIN32 +#undef ERROR +#endif // _WIN32 + +public: + enum Level + { + TRACE = 0, + DEBUG = 10, + INFO = 20, + WARNING = 30, + ERROR = 40 + }; + + static Logger* getLogger(); + + Logger(Logger const&) = delete; + void operator=(Logger const&) = delete; + +#if defined(_MSC_VER) + template + void log(Level level, char const* format, Args const&... args); + + template + void log(Level level, int rank, char const* format, Args const&... args); +#else + template + void log(Level level, char const* format, Args const&... args) __attribute__((format(printf, 3, 0))); + + template + void log(Level level, int rank, char const* format, Args const&... args) __attribute__((format(printf, 4, 0))); +#endif + + template + void log(Level level, std::string const& format, Args const&... args) + { + return log(level, format.c_str(), args...); + } + + template + void log(Level const level, int const rank, std::string const& format, Args const&... args) + { + return log(level, rank, format.c_str(), args...); + } + + void log(std::exception const& ex, Level level = Level::ERROR); + + Level getLevel() const + { + return level_; + } + + void setLevel(Level const level) + { + level_ = level; + log(INFO, "Set logger level to %s", getLevelName(level)); + } + + bool isEnabled(Level const level) const + { + return level_ <= level; + } + +private: + static auto constexpr kPREFIX = "[TensorRT-LLM]"; + +#ifndef NDEBUG + Level const DEFAULT_LOG_LEVEL = DEBUG; +#else + Level const DEFAULT_LOG_LEVEL = INFO; +#endif + Level level_ = DEFAULT_LOG_LEVEL; + + Logger(); // NOLINT(modernize-use-equals-delete) + + static inline char const* getLevelName(Level const level) + { + switch (level) + { + case TRACE: return "TRACE"; + case DEBUG: return "DEBUG"; + case INFO: return "INFO"; + case WARNING: return "WARNING"; + case ERROR: return "ERROR"; + } + + TLLM_THROW("Unknown log level: %d", level); + } + + static inline std::string getPrefix(Level const level) + { + return fmtstr("%s[%s] ", kPREFIX, getLevelName(level)); + } + + static inline std::string getPrefix(Level const level, int const rank) + { + return fmtstr("%s[%s][%d] ", kPREFIX, getLevelName(level), rank); + } +}; + +template +void Logger::log(Logger::Level level, char const* format, Args const&... args) +{ + if (isEnabled(level)) + { + auto const fmt = getPrefix(level) + format; + auto& out = level_ < WARNING ? std::cout : std::cerr; + if constexpr (sizeof...(args) > 0) + { + out << fmtstr(fmt.c_str(), args...); + } + else + { + out << fmt; + } + out << std::endl; + } +} + +template +void Logger::log(Logger::Level const level, int const rank, char const* format, Args const&... args) +{ + if (isEnabled(level)) + { + auto const fmt = getPrefix(level, rank) + format; + auto& out = level_ < WARNING ? std::cout : std::cerr; + if constexpr (sizeof...(args) > 0) + { + out << fmtstr(fmt.c_str(), args...); + } + else + { + out << fmt; + } + out << std::endl; + } +} + +#define TLLM_LOG(level, ...) \ + do \ + { \ + auto* const logger = tensorrt_llm::common::Logger::getLogger(); \ + if (logger->isEnabled(level)) \ + { \ + logger->log(level, __VA_ARGS__); \ + } \ + } while (0) + +#define TLLM_LOG_TRACE(...) TLLM_LOG(tensorrt_llm::common::Logger::TRACE, __VA_ARGS__) +#define TLLM_LOG_DEBUG(...) TLLM_LOG(tensorrt_llm::common::Logger::DEBUG, __VA_ARGS__) +#define TLLM_LOG_INFO(...) TLLM_LOG(tensorrt_llm::common::Logger::INFO, __VA_ARGS__) +#define TLLM_LOG_WARNING(...) TLLM_LOG(tensorrt_llm::common::Logger::WARNING, __VA_ARGS__) +#define TLLM_LOG_ERROR(...) TLLM_LOG(tensorrt_llm::common::Logger::ERROR, __VA_ARGS__) +#define TLLM_LOG_EXCEPTION(ex, ...) tensorrt_llm::common::Logger::getLogger()->log(ex, ##__VA_ARGS__) +} // namespace tensorrt_llm::common diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/mathUtils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/mathUtils.h deleted file mode 100644 index 1bad3a2c1..000000000 --- a/sgl-kernel/3rdparty/tensorrt_llm/common/mathUtils.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace tensorrt_llm -{ -namespace common -{ - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template -inline __device__ __host__ T divUp(T m, T n) -{ - return (m + n - 1) / n; -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace common -} // namespace tensorrt_llm diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/memoryUtils.cu b/sgl-kernel/3rdparty/tensorrt_llm/common/memoryUtils.cu deleted file mode 100644 index d13217b20..000000000 --- a/sgl-kernel/3rdparty/tensorrt_llm/common/memoryUtils.cu +++ /dev/null @@ -1,906 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "tensorrt_llm/common/assert.h" -#include "tensorrt_llm/common/cudaTypeUtils.cuh" -#include "tensorrt_llm/common/logger.h" -#include "tensorrt_llm/common/memoryUtils.h" - -#include -#include -#include - -namespace tensorrt_llm -{ -namespace common -{ - -template -void deviceMalloc(T** ptr, size_t size, bool is_random_initialize) -{ - check_cuda_error(cudaMalloc((void**) (ptr), sizeof(T) * size)); - if (is_random_initialize) - { - cudaRandomUniform(*ptr, size); - } -} - -template void deviceMalloc(float** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(half** ptr, size_t size, bool is_random_initialize); -#ifdef ENABLE_BF16 -template void deviceMalloc(__nv_bfloat16** ptr, size_t size, bool is_random_initialize); -#endif -template void deviceMalloc(uint16_t** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(int** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(bool** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(char** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(int8_t** ptr, size_t size, bool is_random_initialize); -#ifdef ENABLE_FP8 -template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, bool is_random_initialize); -#endif - -template -void deviceMemSetZero(T* ptr, size_t size) -{ - check_cuda_error(cudaMemset(static_cast(ptr), 0, sizeof(T) * size)); -} - -template void deviceMemSetZero(float* ptr, size_t size); -template void deviceMemSetZero(half* ptr, size_t size); -template void deviceMemSetZero(int* ptr, size_t size); -template void deviceMemSetZero(uint32_t* ptr, size_t size); -template void deviceMemSetZero(bool* ptr, size_t size); -#ifdef ENABLE_FP8 -template void deviceMemSetZero(__nv_fp8_e4m3* ptr, size_t size); -#endif -#ifdef ENABLE_BF16 -template void deviceMemSetZero(__nv_bfloat16* ptr, size_t size); -#endif - -template -void deviceFree(T*& ptr) -{ - if (ptr != NULL) - { - check_cuda_error(cudaFree(ptr)); - ptr = NULL; - } -} - -template void deviceFree(float*& ptr); -template void deviceFree(half*& ptr); -#ifdef ENABLE_BF16 -template void deviceFree(__nv_bfloat16*& ptr); -#endif -template void deviceFree(unsigned short*& ptr); -template void deviceFree(int*& ptr); -template void deviceFree(bool*& ptr); -template void deviceFree(char*& ptr); -template void deviceFree(int8_t*& ptr); -#ifdef ENABLE_FP8 -template void deviceFree(__nv_fp8_e4m3*& ptr); -#endif - -template -void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream) -{ - T* arr = new T[size]; - std::fill(arr, arr + size, value); - check_cuda_error(cudaMemcpyAsync(devptr, arr, sizeof(T) * size, cudaMemcpyHostToDevice, stream)); - delete[] arr; -} - -template void deviceFill(float* devptr, size_t size, float value, cudaStream_t stream); -template void deviceFill(half* devptr, size_t size, half value, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void deviceFill(__nv_bfloat16* devptr, size_t size, __nv_bfloat16 value, cudaStream_t stream); -#endif -template void deviceFill(int* devptr, size_t size, int value, cudaStream_t stream); -template void deviceFill(bool* devptr, size_t size, bool value, cudaStream_t stream); - -template -void cudaD2Hcpy(T* tgt, T const* src, const size_t size) -{ - check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDeviceToHost)); -} - -template void cudaD2Hcpy(float* tgt, float const* src, size_t size); -template void cudaD2Hcpy(half* tgt, half const* src, size_t size); -#ifdef ENABLE_BF16 -template void cudaD2Hcpy(__nv_bfloat16* tgt, __nv_bfloat16 const* src, size_t size); -#endif -template void cudaD2Hcpy(int* tgt, int const* src, size_t size); -template void cudaD2Hcpy(bool* tgt, bool const* src, size_t size); -#ifdef ENABLE_FP8 -template void cudaD2Hcpy(__nv_fp8_e4m3* tgt, __nv_fp8_e4m3 const* src, size_t size); -#endif -template void cudaD2Hcpy(unsigned long long* tgt, unsigned long long const* src, size_t size); -template void cudaD2Hcpy(unsigned int* tgt, unsigned int const* src, size_t size); -template void cudaD2Hcpy(int8_t* tgt, int8_t const* src, size_t size); - -template -void cudaH2Dcpy(T* tgt, T const* src, const size_t size) -{ - check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyHostToDevice)); -} - -template void cudaH2Dcpy(float* tgt, float const* src, size_t size); -template void cudaH2Dcpy(half* tgt, half const* src, size_t size); -#ifdef ENABLE_BF16 -template void cudaH2Dcpy(__nv_bfloat16* tgt, __nv_bfloat16 const* src, size_t size); -#endif -template void cudaH2Dcpy(int* tgt, int const* src, size_t size); -template void cudaH2Dcpy(bool* tgt, bool const* src, size_t size); -#ifdef ENABLE_FP8 -template void cudaH2Dcpy(__nv_fp8_e4m3* tgt, __nv_fp8_e4m3 const* src, size_t size); -#endif -template void cudaH2Dcpy(unsigned long long* tgt, unsigned long long const* src, size_t size); -template void cudaH2Dcpy(unsigned int* tgt, unsigned int const* src, size_t size); -template void cudaH2Dcpy(int8_t* tgt, int8_t const* src, size_t size); - -template -void cudaD2Dcpy(T* tgt, T const* src, const size_t size, cudaStream_t stream) -{ - check_cuda_error(cudaMemcpyAsync(tgt, src, sizeof(T) * size, cudaMemcpyDeviceToDevice, stream)); -} - -template void cudaD2Dcpy(float* tgt, float const* src, size_t size, cudaStream_t stream); -template void cudaD2Dcpy(half* tgt, half const* src, size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void cudaD2Dcpy(__nv_bfloat16* tgt, __nv_bfloat16 const* src, size_t size, cudaStream_t stream); -#endif -template void cudaD2Dcpy(int* tgt, int const* src, size_t size, cudaStream_t stream); -template void cudaD2Dcpy(bool* tgt, bool const* src, size_t size, cudaStream_t stream); -template void cudaD2Dcpy(int8_t* tgt, int8_t const* src, size_t size, cudaStream_t stream); -#ifdef ENABLE_FP8 -template void cudaD2Dcpy(__nv_fp8_e4m3* tgt, __nv_fp8_e4m3 const* src, size_t size, cudaStream_t stream); -#endif -template void cudaD2Dcpy(unsigned long long* tgt, unsigned long long const* src, size_t size, cudaStream_t stream); - -template -__global__ void cudaCast(T_OUT* dst, T_IN* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) - { - dst[tid] = (T_OUT) ((float) (src[tid])); - } -} - -template -void invokeCudaCast(T_OUT* dst, T_IN const* const src, const size_t size, cudaStream_t stream) -{ - cudaCast<<<256, 256, 0, stream>>>(dst, src, size); -} - -template void invokeCudaCast(float* dst, half const* const src, const size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void invokeCudaCast(float* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream); -template void invokeCudaCast(__nv_bfloat16* dst, float const* const src, const size_t size, cudaStream_t stream); -template void invokeCudaCast(__nv_bfloat16* dst, half const* const src, const size_t size, cudaStream_t stream); -template void invokeCudaCast(half* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream); -#endif -#ifdef ENABLE_FP8 -template void invokeCudaCast(float* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream); -template void invokeCudaCast( - __nv_bfloat16* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream); -template void invokeCudaCast(half* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream); -template void invokeCudaCast(__nv_fp8_e4m3* dst, float const* const src, const size_t size, cudaStream_t stream); -template void invokeCudaCast( - __nv_fp8_e4m3* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream); -template void invokeCudaCast(__nv_fp8_e4m3* dst, half const* const src, const size_t size, cudaStream_t stream); -#endif - -template -void cudaAutoCpy(T* tgt, T const* src, const size_t size, cudaStream_t stream) -{ - if (stream != NULL) - { - check_cuda_error(cudaMemcpyAsync(tgt, src, sizeof(T) * size, cudaMemcpyDefault, stream)); - } - else - { - check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDefault)); - } -} - -template void cudaAutoCpy(float* tgt, float const* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(half* tgt, half const* src, size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void cudaAutoCpy(__nv_bfloat16* tgt, __nv_bfloat16 const* src, size_t size, cudaStream_t stream); -#endif -template void cudaAutoCpy(int* tgt, int const* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(bool* tgt, bool const* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(int8_t* tgt, int8_t const* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(uint8_t* tgt, uint8_t const* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(uint32_t* tgt, uint32_t const* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(unsigned long long* tgt, unsigned long long const* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(unsigned long* tgt, unsigned long const* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(char* tgt, char const* src, size_t size, cudaStream_t stream); - -template void cudaAutoCpy(float const** tgt, float const* const* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(half const** tgt, half const* const* src, size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void cudaAutoCpy(__nv_bfloat16 const** tgt, __nv_bfloat16 const* const* src, size_t size, cudaStream_t stream); -#endif -template void cudaAutoCpy(int const** tgt, int const* const* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(bool const** tgt, bool const* const* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(int8_t const** tgt, int8_t const* const* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy( - unsigned long long const** tgt, unsigned long long const* const* src, size_t size, cudaStream_t stream); - -template -__global__ void cuda_random_uniform_kernel(T* buffer, const size_t size, int const seq_offset) -{ - const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - curandState_t local_state; - curand_init((unsigned long long int) 1337, idx + seq_offset, 0, &local_state); - for (size_t index = idx; index < size; index += blockDim.x * gridDim.x) - { - buffer[index] = (T) (curand_uniform(&local_state) * 0.2f - 0.1f); - } -} - -template <> -__global__ void cuda_random_uniform_kernel(int* buffer, const size_t size, int const seq_offset) -{ - const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - curandState_t local_state; - curand_init((float) 1337.f, idx + seq_offset, 0, &local_state); - for (size_t index = idx; index < size; index += blockDim.x * gridDim.x) - { - buffer[index] = curand(&local_state); - } -} - -template <> -__global__ void cuda_random_uniform_kernel(bool* buffer, const size_t size, int const seq_offset) -{ - const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - curandState_t local_state; - curand_init((float) 1337.f, idx + seq_offset, 0, &local_state); - for (size_t index = idx; index < size; index += blockDim.x * gridDim.x) - { - buffer[index] = (curand(&local_state) % 2 == 0); - } -} - -template <> -__global__ void cuda_random_uniform_kernel(char* buffer, const size_t size, int const seq_offset) -{ - const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - curandState_t local_state; - curand_init((float) 1337.f, idx + seq_offset, 0, &local_state); - for (size_t index = idx; index < size; index += blockDim.x * gridDim.x) - { - buffer[index] = curand(&local_state) % 0xFF; - } -} - -template -void cudaRandomUniform(T* buffer, const size_t size) -{ - static int seq_offset = 0; - cuda_random_uniform_kernel<<<256, 256>>>(buffer, size, seq_offset); - seq_offset += 256 * 256; -} - -template void cudaRandomUniform(float* buffer, const size_t size); -template void cudaRandomUniform(half* buffer, const size_t size); -#ifdef ENABLE_BF16 -template void cudaRandomUniform(__nv_bfloat16* buffer, const size_t size); -#endif -template void cudaRandomUniform(int* buffer, const size_t size); -template void cudaRandomUniform(bool* buffer, const size_t size); -template void cudaRandomUniform(char* buffer, const size_t size); -#ifdef ENABLE_FP8 -template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size); -#endif - -// loads data from binary file. If it succeeds, returns a non-empty vector. If loading fails or -// the product of the elements in shape is 0, this function will return an empty vector. -template -std::vector loadWeightFromBinHelper(std::vector shape, std::string filename) -{ - if (shape.size() > 2) - { - printf("[ERROR] shape should have less than two dims \n"); - return std::vector(); - } - size_t dim0 = shape[0], dim1 = 1; - if (shape.size() == 2) - { - dim1 = shape[1]; - } - size_t size = dim0 * dim1; - if (size == 0) - { - TLLM_LOG_WARNING("shape is zero, skip loading weight from file %s \n", filename.c_str()); - return std::vector(); - } - - std::vector host_array(size); - std::ifstream in(filename, std::ios::in | std::ios::binary); - if (!in.is_open()) - { - TLLM_LOG_WARNING("file %s cannot be opened, loading model fails! \n", filename.c_str()); - return std::vector(); - } - - size_t loaded_data_size = sizeof(T) * size; - in.seekg(0, in.end); - in.seekg(0, in.beg); - - TLLM_LOG_DEBUG("Read " + std::to_string(loaded_data_size) + " bytes from " + filename); - in.read((char*) host_array.data(), loaded_data_size); - - size_t in_get_size = in.gcount(); - if (in_get_size != loaded_data_size) - { - TLLM_LOG_WARNING("file %s only has %ld, but request %ld, loading model fails! \n", filename.c_str(), - in_get_size, loaded_data_size); - return std::vector(); - } - in.close(); - // If we succeed, return an array with values. - return host_array; -} - -template -int loadWeightFromBinFunc(T* ptr, std::vector shape, std::string filename) -{ - std::vector host_array = loadWeightFromBinHelper(shape, filename); - - if (host_array.empty()) - { - return 0; - } - - if (std::is_same::value == true) - { - cudaH2Dcpy(ptr, (T*) host_array.data(), host_array.size()); - } - else - { - T_IN* ptr_2 = nullptr; - deviceMalloc(&ptr_2, host_array.size(), false); - cudaH2Dcpy(ptr_2, host_array.data(), host_array.size()); - invokeCudaD2DcpyConvert(ptr, ptr_2, host_array.size()); - deviceFree(ptr_2); - } - return 0; -} - -template int loadWeightFromBinFunc(float* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc(half* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc(float* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc(half* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc(int8_t* ptr, std::vector shape, std::string filename); -#ifdef ENABLE_BF16 -template int loadWeightFromBinFunc<__nv_bfloat16, float>( - __nv_bfloat16* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc<__nv_bfloat16, half>( - __nv_bfloat16* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc(float* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc(half* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc<__nv_bfloat16, __nv_bfloat16>( - __nv_bfloat16* ptr, std::vector shape, std::string filename); -#endif // ENABLE_BF16 -template int loadWeightFromBinFunc(int* ptr, std::vector shape, std::string filename); -#ifdef ENABLE_FP8 -template int loadWeightFromBinFunc<__nv_fp8_e4m3, float>( - __nv_fp8_e4m3* ptr, std::vector shape, std::string filename); -#endif // ENABLE_FP8 - -template -int loadWeightFromBin(T* ptr, std::vector shape, std::string filename, TRTLLMCudaDataType model_file_type) -{ - switch (model_file_type) - { - case TRTLLMCudaDataType::FP32: loadWeightFromBinFunc(ptr, shape, filename); break; - case TRTLLMCudaDataType::FP16: loadWeightFromBinFunc(ptr, shape, filename); break; - case TRTLLMCudaDataType::INT8: loadWeightFromBinFunc(ptr, shape, filename); break; -#ifdef ENABLE_BF16 - case TRTLLMCudaDataType::BF16: loadWeightFromBinFunc(ptr, shape, filename); break; -#endif -#ifdef ENABLE_FP8 - case TRTLLMCudaDataType::FP8: loadWeightFromBinFunc(ptr, shape, filename); break; -#endif - default: TLLM_LOG_ERROR("Does not support TRTLLMCudaDataType=%d", model_file_type); TLLM_CHECK(false); - } - return 0; -} - -template <> -int loadWeightFromBin(int* ptr, std::vector shape, std::string filename, TRTLLMCudaDataType model_file_type) -{ - loadWeightFromBinFunc(ptr, shape, filename); - return 0; -} - -template int loadWeightFromBin( - float* ptr, std::vector shape, std::string filename, TRTLLMCudaDataType model_file_type); -template int loadWeightFromBin( - half* ptr, std::vector shape, std::string filename, TRTLLMCudaDataType model_file_type); -template int loadWeightFromBin( - int8_t* ptr, std::vector shape, std::string filename, TRTLLMCudaDataType model_file_type); -#ifdef ENABLE_BF16 -template int loadWeightFromBin( - __nv_bfloat16* ptr, std::vector shape, std::string filename, TRTLLMCudaDataType model_file_type); -#endif -#ifdef ENABLE_FP8 -template int loadWeightFromBin( - __nv_fp8_e4m3* ptr, std::vector shape, std::string filename, TRTLLMCudaDataType model_file_type); -#endif -template int loadWeightFromBin( - int* ptr, std::vector shape, std::string filename, TRTLLMCudaDataType model_file_type); - -template -__global__ void cudaD2DcpyConvert(T_OUT* dst, const T_IN* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) - { - dst[tid] = cuda_cast(src[tid]); - } -} - -template -void invokeCudaD2DcpyConvert(T_OUT* tgt, const T_IN* src, const size_t size, cudaStream_t stream) -{ - cudaD2DcpyConvert<<<256, 256, 0, stream>>>(tgt, src, size); -} - -template void invokeCudaD2DcpyConvert(int8_t* tgt, float const* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(float* tgt, int8_t const* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(float* tgt, int const* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(half* tgt, int const* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(float* tgt, float const* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(half* tgt, float const* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(float* tgt, half const* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(uint32_t* tgt, int const* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(int* tgt, uint32_t const* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(int* tgt, float const* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(int* tgt, half const* src, const size_t size, cudaStream_t stream); - -#ifdef ENABLE_BF16 -template void invokeCudaD2DcpyConvert(__nv_bfloat16* tgt, float const* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(__nv_bfloat16* tgt, int const* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(float* tgt, __nv_bfloat16 const* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(int* tgt, __nv_bfloat16 const* src, const size_t size, cudaStream_t stream); -#endif // ENABLE_BF16 - -template -__global__ void cudaD2DScaleCpyConvert( - T_OUT* dst, const T_IN* src, float const* scale, bool invert_scale, const size_t size) -{ - float const scale_value = invert_scale ? 1.0f / scale[0] : scale[0]; - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) - { - dst[tid] = cuda_cast(cuda_cast(src[tid]) * scale_value); - } -} - -template -void invokeCudaD2DScaleCpyConvert( - T_OUT* tgt, const T_IN* src, float const* scale, bool invert_scale, const size_t size, cudaStream_t stream) -{ - cudaD2DScaleCpyConvert<<<256, 256, 0, stream>>>(tgt, src, scale, invert_scale, size); -} - -// clang-format off -template void invokeCudaD2DScaleCpyConvert(float* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const float* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -template void invokeCudaD2DScaleCpyConvert(half* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const half* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void invokeCudaD2DScaleCpyConvert(__nv_bfloat16* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const __nv_bfloat16* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -#endif // ENABLE_BF16 -#ifdef ENABLE_FP8 -template void invokeCudaD2DScaleCpyConvert(float* tgt, const __nv_fp8_e4m3* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -#endif // ENABLE_FP8 -// clang-format on - -void invokeCudaD2DcpyHalf2Float(float* dst, half* src, const size_t size, cudaStream_t stream) -{ - invokeCudaD2DcpyConvert(dst, src, size, stream); -} - -void invokeCudaD2DcpyFloat2Half(half* dst, float* src, const size_t size, cudaStream_t stream) -{ - invokeCudaD2DcpyConvert(dst, src, size, stream); -} - -template -void saveToBinary(T const* ptr, const size_t size, std::string filename) -{ - - std::vector h_ptr(size); - cudaD2Hcpy(h_ptr.data(), ptr, size); - std::vector float_ptr(size); - for (size_t i = 0; i < size; i++) - { - float_ptr[i] = (float) h_ptr[i]; - } - - std::ofstream out(filename, std::ios::out | std::ios::binary); - TLLM_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename); - - out.write((char*) float_ptr.data(), size * sizeof(float)); -} - -template void saveToBinary(float const* ptr, const size_t size, std::string filename); -template void saveToBinary(half const* ptr, const size_t size, std::string filename); -#ifdef ENABLE_BF16 -template void saveToBinary(__nv_bfloat16 const* ptr, const size_t size, std::string filename); -#endif // ENABLE_BF16 - -template <> -void saveToBinary(int const* ptr, const size_t size, std::string filename) -{ - std::vector h_ptr(size); - cudaD2Hcpy(h_ptr.data(), ptr, size); - std::ofstream out(filename, std::ios::out | std::ios::binary); - TLLM_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename); - out.write((char*) h_ptr.data(), size * sizeof(int)); -} - -template -__global__ void fakeCast(T_IN* input_ptr, const size_t size) -{ - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) - { - T_fake_type tmp_val = (T_fake_type) ((float) input_ptr[i]); - input_ptr[i] = (T_IN) ((float) tmp_val); - } -} - -template -void invokeFakeCast(T_IN* input_ptr, const size_t size, cudaStream_t stream) -{ - dim3 block(256); - dim3 grid((size + 255) / 256); - fakeCast<<>>(input_ptr, size); -} - -#ifdef ENABLE_FP8 -__global__ void cudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) - { - dst[tid] = (float) (src[tid]); - } -} - -void invokeCudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream) -{ - cudaD2Dcpyfp82Float<<<256, 256, 0, stream>>>(dst, src, size); -} - -__global__ void cudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) - { - dst[tid] = (half) ((float) (src[tid])); - } -} - -void invokeCudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream) -{ - cudaD2Dcpyfp82Half<<<256, 256, 0, stream>>>(dst, src, size); -} - -__global__ void cudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) - { - dst[tid] = (__nv_fp8_e4m3) src[tid]; - } -} - -void invokeCudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size, cudaStream_t stream) -{ - cudaD2DcpyFloat2fp8<<<256, 256, 0, stream>>>(dst, src, size); -} - -__global__ void cudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) - { - dst[tid] = (__nv_fp8_e4m3) src[tid]; - } -} - -void invokeCudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size, cudaStream_t stream) -{ - cudaD2DcpyHalf2fp8<<<256, 256, 0, stream>>>(dst, src, size); -} - -__global__ void cudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) - { - dst[tid] = (__nv_fp8_e4m3) src[tid]; - } -} - -void invokeCudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size, cudaStream_t stream) -{ - cudaD2DcpyBfloat2fp8<<<256, 256, 0, stream>>>(dst, src, size); -} - -#endif // ENABLE_FP8 - -template -__global__ void transpose(T_OUT* dst, T_IN* src, const size_t dim0, const size_t dim1) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1; tid += blockDim.x * gridDim.x) - { - const size_t src_col_id = tid % dim1; - const size_t src_row_id = tid / dim1; - dst[src_col_id * dim0 + src_row_id] = (T_OUT) (src[tid]); - } -} - -template -void invokeInPlaceTranspose(T* data, T* workspace, const size_t dim0, const size_t dim1) -{ - // copy data to workspace, and then transpose from workspace to data - cudaD2Dcpy(workspace, data, dim0 * dim1); - transpose<<<256, 256>>>(data, workspace, dim0, dim1); -} - -#ifdef ENABLE_FP8 -template void invokeInPlaceTranspose( - __nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const size_t dim0, const size_t dim1); -#endif // ENABLE_FP8 -#ifdef ENABLE_BF16 -template void invokeInPlaceTranspose( - __nv_bfloat16* data, __nv_bfloat16* workspace, const size_t dim0, const size_t dim1); -#endif // ENABLE_BF16 -template void invokeInPlaceTranspose(float* data, float* workspace, const size_t dim0, const size_t dim1); - -template -__global__ void transpose0213( - T_OUT* dst, T_IN* src, const size_t dim0, const size_t dim1, const size_t dim2, const size_t dim3) -{ - // src permutation: [0, 1, 2, 3] - // dst permutation: [0, 2, 1, 3] - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2 * dim3; - tid += blockDim.x * gridDim.x) - { - size_t tmp_idx = tid; - const size_t dim_3_idx = tmp_idx % dim3; - tmp_idx = (tmp_idx - dim_3_idx) / dim3; - const size_t dim_2_idx = tmp_idx % dim2; - tmp_idx = (tmp_idx - dim_2_idx) / dim2; - const size_t dim_1_idx = tmp_idx % dim1; - tmp_idx = (tmp_idx - dim_1_idx) / dim1; - const size_t dim_0_idx = tmp_idx % dim0; - dst[dim_0_idx * dim1 * dim2 * dim3 + dim_2_idx * dim1 * dim3 + dim_1_idx * dim3 + dim_3_idx] = src[tid]; - } -} - -template -void invokeInPlaceTranspose0213( - T* data, T* workspace, const size_t dim0, const size_t dim1, const size_t dim2, const size_t dim3) -{ - // copy data to workspace, and then transpose from workspace to data - // Note that this kernel is used for pre-processing and not very efficient. - cudaD2Dcpy(workspace, data, dim0 * dim1 * dim2 * dim3); - transpose0213<<<256, 256>>>(data, workspace, dim0, dim1, dim2, dim3); -} - -#ifdef ENABLE_FP8 -template void invokeInPlaceTranspose0213(__nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const size_t dim0, - const size_t dim1, const size_t dim2, const size_t dim3); -#endif // ENABLE_FP8 -#ifdef ENABLE_BF16 -template void invokeInPlaceTranspose0213(__nv_bfloat16* data, __nv_bfloat16* workspace, const size_t dim0, - const size_t dim1, const size_t dim2, const size_t dim3); -#endif // ENABLE_BF16 -template void invokeInPlaceTranspose0213( - float* data, float* workspace, const size_t dim0, const size_t dim1, const size_t dim2, const size_t dim3); - -template -__global__ void transpose102(T_OUT* dst, T_IN* src, const size_t dim0, const size_t dim1, const size_t dim2) -{ - // src permutation: [0, 1, 2] - // dst permutation: [1, 0, 2] - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2; tid += blockDim.x * gridDim.x) - { - size_t tmp_idx = tid; - const size_t dim_2_idx = tmp_idx % dim2; - tmp_idx = (tmp_idx - dim_2_idx) / dim2; - const size_t dim_1_idx = tmp_idx % dim1; - tmp_idx = (tmp_idx - dim_1_idx) / dim1; - const size_t dim_0_idx = tmp_idx % dim0; - dst[dim_1_idx * dim0 * dim2 + dim_0_idx * dim2 + dim_2_idx] = src[tid]; - } -} - -template -void invokeInPlaceTranspose102(T* data, T* workspace, const size_t dim0, const size_t dim1, const size_t dim2) -{ - // copy data to workspace, and then transpose from workspace to data - // Note that this kernel is used for pre-processing and not very efficient. - cudaD2Dcpy(workspace, data, dim0 * dim1 * dim2); - transpose102<<<256, 256>>>(data, workspace, dim0, dim1, dim2); -} - -#ifdef ENABLE_FP8 -template void invokeInPlaceTranspose102( - __nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const size_t dim0, const size_t dim1, const size_t dim2); -#endif // ENABLE_FP8 -#ifdef ENABLE_BF16 -template void invokeInPlaceTranspose102( - __nv_bfloat16* data, __nv_bfloat16* workspace, const size_t dim0, const size_t dim1, const size_t dim2); -#endif // ENABLE_BF16 -template void invokeInPlaceTranspose102( - float* data, float* workspace, const size_t dim0, const size_t dim1, const size_t dim2); - -template -void __global__ multiplyScale(T* tensor, float scale, const size_t size) -{ - for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x) - { - tensor[index] = (T) (((float) tensor[index]) * scale); - } -} - -template -void invokeMultiplyScale(T* tensor, float scale, const size_t size, cudaStream_t stream) -{ - int block = 256; - int grid = (size + 255) / 256; - multiplyScale<<>>(tensor, scale, size); -} - -template void invokeMultiplyScale(float* tensor, float scale, const size_t size, cudaStream_t stream); -template void invokeMultiplyScale(half* tensor, float scale, const size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void invokeMultiplyScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream); -#endif -#ifdef ENABLE_FP8 -template void invokeMultiplyScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream); -#endif - -template -void __global__ divideScale(T* tensor, float scale, const size_t size) -{ - for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x) - { - tensor[index] = (T) (((float) tensor[index]) / scale); - } -} - -template -void invokeDivideScale(T* tensor, float scale, const size_t size, cudaStream_t stream) -{ - int block = 256; - int grid = (size + 255) / 256; - divideScale<<>>(tensor, scale, size); -} - -template void invokeDivideScale(float* tensor, float scale, const size_t size, cudaStream_t stream); -template void invokeDivideScale(half* tensor, float scale, const size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void invokeDivideScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream); -#endif -#ifdef ENABLE_FP8 -template void invokeDivideScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream); -#endif -#ifdef ENABLE_BF16 -template void invokeFakeCast(float* input_ptr, const size_t size, cudaStream_t stream); -template void invokeFakeCast<__nv_bfloat16, __nv_bfloat16>( - __nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream); -template void invokeFakeCast(half* input_ptr, const size_t size, cudaStream_t stream); -#endif -template void invokeFakeCast(float* input_ptr, const size_t size, cudaStream_t stream); -template void invokeFakeCast(float* input_ptr, const size_t size, cudaStream_t stream); -#ifdef ENABLE_FP8 -template void invokeFakeCast(float* input_ptr, const size_t size, cudaStream_t stream); -template void invokeFakeCast(half* input_ptr, const size_t size, cudaStream_t stream); -template void invokeFakeCast<__nv_bfloat16, __nv_fp8_e4m3>( - __nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream); -#endif - -size_t cuda_datatype_size(TRTLLMCudaDataType dt) -{ - static const std::unordered_map sizes{ - {TRTLLMCudaDataType::FP32, sizeof(float)}, {TRTLLMCudaDataType::FP16, sizeof(half)} -#ifdef ENABLE_BF16 - , - {TRTLLMCudaDataType::BF16, sizeof(__nv_bfloat16)} -#endif - }; - - return sizes.at(dt); -} - -template -__global__ void check_range(T const* buffer, size_t size, T min, T max, bool* d_within_range) -{ - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) - { - const T val = buffer[i]; - if (val < min || val > max) - { - *d_within_range = false; - } - } -} - -template -bool invokeCheckRange(T const* buffer, const size_t size, T min, T max, bool* d_within_range, cudaStream_t stream) -{ - cudaMemsetAsync(d_within_range, true, sizeof(bool), stream); - - dim3 block(256); - dim3 grid((size + 255) / 256); - check_range<<>>(buffer, size, min, max, d_within_range); - - bool result; - cudaD2Hcpy(&result, d_within_range, 1); - return result; -} - -template bool invokeCheckRange( - int const* buffer, const size_t size, int min, int max, bool* d_within_range, cudaStream_t stream); - -/* - * Determine the total workspace size based on a vector containing multiple variable sizes. - */ -size_t calcAlignedSize(std::vector const& sizes, const size_t ALIGN_BYTES) -{ - const size_t ALIGN_MASK = ~(ALIGN_BYTES - 1); - // Check ALIGN_BYTES is a power of 2 - assert((ALIGN_BYTES & (ALIGN_BYTES - 1)) == 0); - - size_t total = 0; - for (auto sz : sizes) - { - total += (sz + ALIGN_BYTES - 1) & ALIGN_MASK; - } - - // We add extra "ALIGN_BYTES - 1" bytes in case the start address passed to the function calcAlignedPointers() is - // not aligned. - return total + ALIGN_BYTES - 1; -} - -/* - * Given the address of the workspace and the vector containing multiple variable sizes, calculate the start addresses - * of each variable. - */ -void calcAlignedPointers( - std::vector& outPtrs, void const* p, std::vector const& sizes, size_t ALIGN_BYTES) -{ - const size_t ALIGN_MASK = ~(ALIGN_BYTES - 1); - // Check ALIGN_BYTES is a power of 2 - assert((ALIGN_BYTES & (ALIGN_BYTES - 1)) == 0); - - // In case the start address is not aligned - char* ptr = reinterpret_cast((reinterpret_cast(p) + ALIGN_BYTES - 1) & ALIGN_MASK); - - outPtrs.reserve(sizes.size()); - for (auto sz : sizes) - { - outPtrs.push_back(ptr); - ptr += (sz + ALIGN_BYTES - 1) & ALIGN_MASK; - } -} - -} // namespace common -} // namespace tensorrt_llm diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/memoryUtils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/memoryUtils.h deleted file mode 100644 index 9e413a1be..000000000 --- a/sgl-kernel/3rdparty/tensorrt_llm/common/memoryUtils.h +++ /dev/null @@ -1,292 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "tensorrt_llm/common/cudaFp8Utils.h" -#include "tensorrt_llm/common/cudaUtils.h" - -#include - -namespace tensorrt_llm -{ -namespace common -{ - -template -void deviceMalloc(T** ptr, size_t size, bool is_random_initialize = true); - -template -void deviceMemSetZero(T* ptr, size_t size); - -template - -void deviceFree(T*& ptr); - -template -void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream = 0); - -template -void cudaD2Hcpy(T* tgt, T const* src, size_t const size); - -template -void cudaH2Dcpy(T* tgt, T const* src, size_t const size); - -template -void cudaD2Dcpy(T* tgt, T const* src, size_t const size, cudaStream_t stream = NULL); - -template -void cudaAutoCpy(T* tgt, T const* src, size_t const size, cudaStream_t stream = NULL); - -template -void cudaRandomUniform(T* buffer, size_t const size); - -template -int loadWeightFromBin(T* ptr, std::vector shape, std::string filename, - TRTLLMCudaDataType model_file_type = TRTLLMCudaDataType::FP32); - -// template -// int loadWeightFromBinAndQuantizeForWeightOnly(int8_t* quantized_weight_ptr, -// T* scale_ptr, -// std::vector shape, -// std::string filename, -// TRTLLMCudaDataType model_file_type = TRTLLMCudaDataType::FP32); - -void invokeCudaD2DcpyHalf2Float(float* dst, half* src, size_t const size, cudaStream_t stream); -void invokeCudaD2DcpyFloat2Half(half* dst, float* src, size_t const size, cudaStream_t stream); -#ifdef ENABLE_FP8 -void invokeCudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, size_t const size, cudaStream_t stream); -void invokeCudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, size_t const size, cudaStream_t stream); -void invokeCudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, size_t const size, cudaStream_t stream); -void invokeCudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, size_t const size, cudaStream_t stream); -void invokeCudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, size_t const size, cudaStream_t stream); -#endif // ENABLE_FP8 -#ifdef ENABLE_BF16 -void invokeCudaD2DcpyBfloat2Float(float* dst, __nv_bfloat16* src, size_t const size, cudaStream_t stream); -#endif // ENABLE_BF16 - -template -void invokeCudaCast(T_OUT* dst, T_IN const* const src, size_t const size, cudaStream_t stream); - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -// The following functions implement conversion of multi-dimensional indices to an index in a flat array. -// The shape of the Tensor dimensions is passed as one array (`dims`), the indices are given as individual arguments. -// For examples on how to use these functions, see their tests `test_memory_utils.cu`. -// All of these functions can be evaluated at compile time by recursive template expansion. - -template -__inline__ __host__ __device__ std::enable_if_t::value, T> constexpr flat_index( - T const& acc, TDim dims, TIndex const& index) -{ - assert(index < dims[0]); - return acc * dims[0] + index; -} - -template -__inline__ __host__ __device__ std::enable_if_t::value, T> constexpr flat_index( - T const& acc, TDim dims, TIndex const& index, TIndices... indices) -{ - assert(index < dims[0]); - return flat_index(acc * dims[0] + index, dims + 1, indices...); -} - -template -__inline__ __host__ __device__ std::enable_if_t::value, T> constexpr flat_index( - [[maybe_unused]] TDim dims, T const& index) -{ - assert(index < dims[0]); - return index; -} - -template -__inline__ __host__ __device__ - std::enable_if_t::value, typename std::remove_pointer::type> constexpr flat_index( - TDim dims, TIndex const& index, TIndices... indices) -{ - assert(index < dims[0]); - return flat_index(static_cast::type>(index), dims + 1, indices...); -} - -template -__inline__ __host__ __device__ T constexpr flat_index( - std::array const& dims, TIndex const& index, TIndices... indices) -{ - static_assert(skip < N); - static_assert(sizeof...(TIndices) < N - skip, "Number of indices exceeds number of dimensions"); - return flat_index(&dims[skip], index, indices...); -} - -template -__inline__ __host__ __device__ T constexpr flat_index( - T const& acc, std::array const& dims, TIndex const& index, TIndices... indices) -{ - static_assert(skip < N); - static_assert(sizeof...(TIndices) < N - skip, "Number of indices exceeds number of dimensions"); - return flat_index(acc, &dims[skip], index, indices...); -} - -template -__inline__ __host__ __device__ T constexpr flat_index(T const (&dims)[N], TIndex const& index, TIndices... indices) -{ - static_assert(skip < N); - static_assert(sizeof...(TIndices) < N - skip, "Number of indices exceeds number of dimensions"); - return flat_index(static_cast(dims) + skip, index, indices...); -} - -template -__inline__ __host__ __device__ T constexpr flat_index( - T const& acc, T const (&dims)[N], TIndex const& index, TIndices... indices) -{ - static_assert(skip < N); - static_assert(sizeof...(TIndices) < N - skip, "Number of indices exceeds number of dimensions"); - return flat_index(acc, static_cast(dims) + skip, index, indices...); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -// These are simpler functions for multi-dimensional index conversion. Indices and dimensions are passed as individual -// arguments. These functions are more suitable for usage inside kernels than the corresponding flat_index functions -// which require arrays as arguments. Usage examples can be found in `test_memory_utils.cu`. The functions can be -// evaluated at compile time. - -template -__inline__ __host__ __device__ T constexpr flat_index2(TIndex const& index_0, TIndex const& index_1, T const& dim_1) -{ - assert(index_1 < dim_1); - return index_0 * dim_1 + index_1; -} - -template -__inline__ __host__ __device__ T constexpr flat_index3( - TIndex const& index_0, TIndex const& index_1, TIndex const& index_2, T const& dim_1, T const& dim_2) -{ - assert(index_2 < dim_2); - return flat_index2(index_0, index_1, dim_1) * dim_2 + index_2; -} - -template -__inline__ __host__ __device__ T constexpr flat_index4(TIndex const& index_0, TIndex const& index_1, - TIndex const& index_2, TIndex const& index_3, T const& dim_1, T const& dim_2, T const& dim_3) -{ - assert(index_3 < dim_3); - return flat_index3(index_0, index_1, index_2, dim_1, dim_2) * dim_3 + index_3; -} - -template -__inline__ __host__ __device__ T constexpr flat_index5(TIndex const& index_0, TIndex const& index_1, - TIndex const& index_2, TIndex const& index_3, TIndex const& index_4, T const& dim_1, T const& dim_2, T const& dim_3, - T const& dim_4) -{ - assert(index_4 < dim_4); - return flat_index4(index_0, index_1, index_2, index_3, dim_1, dim_2, dim_3) * dim_4 + index_4; -} - -template -__inline__ __host__ __device__ T constexpr flat_index_strided3( - TIndex const& index_0, TIndex const& index_1, TIndex const& index_2, T const& stride_1, T const& stride_2) -{ - assert(index_1 < stride_1 / stride_2); - assert(index_2 < stride_2); - return index_0 * stride_1 + index_1 * stride_2 + index_2; -} - -template -__inline__ __host__ __device__ T constexpr flat_index_strided4(TIndex const& index_0, TIndex const& index_1, - TIndex const& index_2, TIndex const& index_3, T const& stride_1, T const& stride_2, T const& stride_3) -{ - assert(index_1 < stride_1 / stride_2); - assert(index_2 < stride_2 / stride_3); - assert(index_3 < stride_3); - return index_0 * stride_1 + index_1 * stride_2 + index_2 * stride_3 + index_3; -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template -void invokeInPlaceTranspose(T* data, T* workspace, size_t const dim0, size_t const dim1); - -template -void invokeInPlaceTranspose0213( - T* data, T* workspace, size_t const dim0, size_t const dim1, size_t const dim2, size_t const dim3); - -template -void invokeInPlaceTranspose102(T* data, T* workspace, size_t const dim0, size_t const dim1, size_t const dim2); - -template -void invokeMultiplyScale(T* tensor, float scale, size_t const size, cudaStream_t stream); - -template -void invokeDivideScale(T* tensor, float scale, size_t const size, cudaStream_t stream); - -template -void invokeCudaD2DcpyConvert(T_OUT* tgt, const T_IN* src, size_t const size, cudaStream_t stream = 0); - -template -void invokeCudaD2DScaleCpyConvert( - T_OUT* tgt, const T_IN* src, float const* scale, bool invert_scale, size_t const size, cudaStream_t stream = 0); - -inline bool checkIfFileExist(std::string const& file_path) -{ - std::ifstream in(file_path, std::ios::in | std::ios::binary); - if (in.is_open()) - { - in.close(); - return true; - } - return false; -} - -template -void saveToBinary(T const* ptr, size_t const size, std::string filename); - -template -void invokeFakeCast(T_IN* input_ptr, size_t const size, cudaStream_t stream); - -size_t cuda_datatype_size(TRTLLMCudaDataType dt); - -template -bool invokeCheckRange(T const* buffer, size_t const size, T min, T max, bool* d_within_range, cudaStream_t stream); - -constexpr size_t DEFAULT_ALIGN_BYTES = 256; - -size_t calcAlignedSize(std::vector const& sizes, size_t ALIGN_BYTES = DEFAULT_ALIGN_BYTES); -void calcAlignedPointers(std::vector& outPtrs, void const* p, std::vector const& sizes, - size_t ALIGN_BYTES = DEFAULT_ALIGN_BYTES); - -struct AlignedPointersUnpacker -{ - template - void operator()(T*&... outPtrs) - { - assert(sizeof...(T) == alignedPointers.size()); - auto it = alignedPointers.begin(); - ((outPtrs = static_cast(*it++)), ...); - } - - std::vector alignedPointers; -}; - -AlignedPointersUnpacker inline calcAlignedPointers( - void const* p, std::vector const& sizes, size_t ALIGN_BYTES = DEFAULT_ALIGN_BYTES) -{ - AlignedPointersUnpacker unpacker{}; - calcAlignedPointers(unpacker.alignedPointers, p, sizes, ALIGN_BYTES); - return unpacker; -} - -} // namespace common -} // namespace tensorrt_llm diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/mpiUtils.cpp b/sgl-kernel/3rdparty/tensorrt_llm/common/mpiUtils.cpp deleted file mode 100644 index dbdaca4ee..000000000 --- a/sgl-kernel/3rdparty/tensorrt_llm/common/mpiUtils.cpp +++ /dev/null @@ -1,588 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include "tensorrt_llm/common/mpiUtils.h" - -#include "tensorrt_llm/common/assert.h" -#include "tensorrt_llm/common/logger.h" -#include "tensorrt_llm/runtime/common.h" -#include "tensorrt_llm/runtime/iBuffer.h" - -#include -#include -#include -#include -#include -#ifndef _WIN32 -#include -#endif - -// We rely on SizeType32 being int32_t in some places with weak type checking, -// i.e. we're passing void ptr to some function. To prevent mysterious errors -// in the future, we trigger a compilation error here if SizeType32 isn't int32_t. -static_assert(std::is_same::value); - -namespace tensorrt_llm::mpi -{ - -MPI_Datatype getMpiDtype(MpiType dtype) -{ -#if ENABLE_MULTI_DEVICE - static std::unordered_map const dtype_map{ - {MpiType::kBYTE, MPI_BYTE}, - {MpiType::kHALF, MPI_UINT16_T}, - {MpiType::kFLOAT, MPI_FLOAT}, - {MpiType::kDOUBLE, MPI_DOUBLE}, - {MpiType::kBOOL, MPI_C_BOOL}, - {MpiType::kINT8, MPI_INT8_T}, - {MpiType::kUINT8, MPI_UINT8_T}, - {MpiType::kINT32, MPI_INT32_T}, - {MpiType::kUINT32, MPI_UINT32_T}, - {MpiType::kINT64, MPI_INT64_T}, - {MpiType::kUINT64, MPI_UINT64_T}, - {MpiType::kFP8, MPI_UINT8_T}, - {MpiType::kBF16, MPI_UINT16_T}, - {MpiType::kCHAR, MPI_CHAR}, - }; - return dtype_map.at(dtype); -#else - TLLM_THROW("Multi device support is disabled."); -#endif -} - -MPI_Op getMpiOp(MpiOp op) -{ -#if ENABLE_MULTI_DEVICE - static std::unordered_map const op_map{ - {MpiOp::NULLOP, MPI_OP_NULL}, - {MpiOp::MAX, MPI_MAX}, - {MpiOp::MIN, MPI_MIN}, - {MpiOp::SUM, MPI_SUM}, - {MpiOp::PROD, MPI_PROD}, - {MpiOp::LAND, MPI_LAND}, - {MpiOp::BAND, MPI_BAND}, - {MpiOp::LOR, MPI_LOR}, - {MpiOp::BOR, MPI_BOR}, - {MpiOp::LXOR, MPI_LXOR}, - {MpiOp::BXOR, MPI_BXOR}, - {MpiOp::MINLOC, MPI_MINLOC}, - {MpiOp::MAXLOC, MPI_MAXLOC}, - {MpiOp::REPLACE, MPI_REPLACE}, - }; - return op_map.at(op); -#else - TLLM_THROW("Multi device support is disabled."); -#endif // ENABLE_MULTI_DEVICE -} - -namespace -{ - -bool mpiInitialized = false; -std::recursive_mutex mpiMutex; - -MpiComm initLocalSession() -{ -#if ENABLE_MULTI_DEVICE - MPI_Comm localComm = nullptr; - MPI_Comm_split_type(COMM_SESSION, OMPI_COMM_TYPE_HOST, COMM_SESSION.getRank(), MPI_INFO_NULL, &localComm); - MpiComm localSession{localComm, false}; -#else - MpiComm localSession{COMM_SESSION, false}; -#endif // ENABLE_MULTI_DEVICE - return localSession; -} - -} // namespace - -std::vector getWorldRanks(MpiComm const& comm) -{ -#if ENABLE_MULTI_DEVICE - MPI_Group group = nullptr; - MPI_Group worldGroup = nullptr; - - MPICHECK(MPI_Comm_group(MPI_COMM_WORLD, &worldGroup)); - MPICHECK(MPI_Comm_group(comm, &group)); - - int groupSize = 0; - MPICHECK(MPI_Group_size(group, &groupSize)); - std::vector ranks(groupSize); - std::vector worldRanks(groupSize); - std::iota(ranks.begin(), ranks.end(), 0); - - MPICHECK(MPI_Group_translate_ranks(group, groupSize, ranks.data(), worldGroup, worldRanks.data())); - MPICHECK(MPI_Group_free(&group)); - MPICHECK(MPI_Group_free(&worldGroup)); -#else - std::vector worldRanks{0}; -#endif - return worldRanks; -} - -void initialize(MpiThreadSupport threadMode, bool forwardAbortToParent) -{ - // double-checked locking - if (mpiInitialized) - { - return; - } - std::lock_guard lk(mpiMutex); - if (mpiInitialized) - { - return; - } -#if ENABLE_MULTI_DEVICE - int initialized = 0; - TLLM_MPI_CHECK(MPI_Initialized(&initialized)); - if (!initialized) - { - TLLM_LOG_INFO("Initializing MPI with thread mode %d", threadMode); - int providedMode = 0; - auto requiredMode = static_cast(threadMode); - MPICHECK(MPI_Init_thread(nullptr, nullptr, requiredMode, &providedMode)); - TLLM_CHECK_WITH_INFO(providedMode >= requiredMode, "MPI_Init_thread failed"); - std::atexit([]() { MPI_Finalize(); }); - - /* - * We only catch SIGABRT and SIGSEGV because most, of not all errors in the worker will cause one of these 2 - * signals. Signals like SIGINT and SIGTERM should be issued to the parent and should terminate MPI workers - * correctly. - */ - for (int sig : {SIGABRT, SIGSEGV}) - { - __sighandler_t previousHandler = nullptr; - if (forwardAbortToParent) - { - previousHandler = std::signal(sig, - [](int signal) - { -#ifndef _WIN32 - pid_t parentProcessId = getppid(); - kill(parentProcessId, SIGKILL); -#endif - MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); - }); - } - else - { - previousHandler = std::signal(sig, [](int signal) { MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); }); - } - TLLM_CHECK_WITH_INFO(previousHandler != SIG_ERR, "Signal handler setup failed"); - } - - // ensure local MPI communicator is initialized - MpiComm::localSession(); - TLLM_LOG_INFO("Initialized MPI"); - } -#endif // ENABLE_MULTI_DEVICE - mpiInitialized = true; -} - -void MpiComm::barrier() const -{ -#if ENABLE_MULTI_DEVICE - MPICHECK(MPI_Barrier(mComm)); -#else - TLLM_THROW("Multi device support is disabled."); -#endif // ENABLE_MULTI_DEVICE -} - -#if ENABLE_MULTI_DEVICE -template >>> -size_t invokeChunked(TMpiFunc func, TBase* buffer, size_t size, MPI_Datatype dtype, TArgs... args) -{ - constexpr auto maxP1 = static_cast(std::numeric_limits::max()) + 1; - if (TLLM_LIKELY(size < maxP1)) - { - MPICHECK(func(buffer, size, dtype, args...)); - return 1; - } - - constexpr size_t alignment = 256; - int elementSize = 1; - MPICHECK(MPI_Type_size(dtype, &elementSize)); - elementSize = std::min(elementSize, alignment); - - // We cap at max alignment-bytes chunks that can be sent at once. - auto const step = maxP1 - (alignment / elementSize); - - using TCast = std::conditional_t, uint8_t const, uint8_t>; - size_t count = 0; - while (size != 0) - { - auto currentStep = static_cast(std::min(size, step)); - MPICHECK(func(buffer, currentStep, dtype, args...)); - size -= currentStep; - size_t diff = static_cast(currentStep) * elementSize; - buffer = static_cast(buffer) + diff; - ++count; - } - - return count; -} -#endif // ENABLE_MULTI_DEVICE - -std::shared_ptr MpiComm::bcastAsync(void* buffer, size_t size, MpiType dtype, int root) const -{ - std::shared_ptr r = std::make_shared(); -#if ENABLE_MULTI_DEVICE - invokeChunked(MPI_Ibcast, buffer, size, getMpiDtype(dtype), root, mComm, &r->mRequest); -#else - TLLM_THROW("Multi device support is disabled."); -#endif // ENABLE_MULTI_DEVICE - return r; -} - -std::shared_ptr MpiComm::bcastAsync(runtime::IBuffer& buf, int root) const -{ - TLLM_CHECK(buf.getMemoryType() != runtime::MemoryType::kGPU); - return bcastAsync(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, root); -} - -void MpiComm::bcast(void* buffer, size_t size, MpiType dtype, int root) const -{ -#if ENABLE_MULTI_DEVICE - invokeChunked(MPI_Bcast, buffer, size, getMpiDtype(dtype), root, mComm); -#else - TLLM_THROW("Multi device support is disabled."); -#endif // ENABLE_MULTI_DEVICE -} - -void MpiComm::bcast(runtime::IBuffer& buf, int root) const -{ - bcast(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, root); -} - -std::shared_ptr MpiComm::sendAsync(void const* buffer, size_t size, MpiType dtype, int dest, int tag) const -{ - TLLM_LOG_DEBUG("start MPI_Isend with size %d", size); - std::shared_ptr r = std::make_shared(); -#if ENABLE_MULTI_DEVICE - invokeChunked(MPI_Isend, buffer, size, getMpiDtype(dtype), dest, tag, mComm, &r->mRequest); -#else - TLLM_THROW("Multi device support is disabled."); -#endif - TLLM_LOG_DEBUG("end MPI_Isend with size %d", size); - return r; -} - -std::shared_ptr MpiComm::sendAsync(runtime::IBuffer const& buf, int dest, int tag) const -{ - return sendAsync(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, dest, tag); -} - -void MpiComm::send(void const* buffer, size_t size, MpiType dtype, int dest, int tag) const -{ - TLLM_LOG_DEBUG("start MPI_Send with size %d", size); -#if ENABLE_MULTI_DEVICE - invokeChunked(MPI_Send, buffer, size, getMpiDtype(dtype), dest, tag, mComm); -#else - TLLM_THROW("Multi device support is disabled."); -#endif // ENABLE_MULTI_DEVICE - TLLM_LOG_DEBUG("end MPI_Send with size %d", size); -} - -void MpiComm::send(runtime::IBuffer const& buf, int dest, int tag) const -{ - send(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, dest, tag); -} - -MPI_Status MpiComm::recv(void* buffer, size_t size, MpiType dtype, int source, int tag) const -{ - TLLM_LOG_DEBUG("start MPI_Recv with size %d", size); - MPI_Status status{}; -#if ENABLE_MULTI_DEVICE - invokeChunked(MPI_Recv, buffer, size, getMpiDtype(dtype), source, tag, mComm, &status); -#else - TLLM_THROW("Multi device support is disabled."); -#endif // ENABLE_MULTI_DEVICE - TLLM_LOG_DEBUG("end MPI_Recv with size %d", size); - return status; -} - -MPI_Status MpiComm::recv(runtime::IBuffer& buf, int source, int tag) const -{ - return recv(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, source, tag); -} - -MpiComm MpiComm::split(int color, int key) const -{ - MPI_Comm splitComm = nullptr; -#if ENABLE_MULTI_DEVICE - MPICHECK(MPI_Comm_split(mComm, color, key, &splitComm)); -#else - TLLM_THROW("Multi device support is disabled."); -#endif // ENABLE_MULTI_DEVICE - return MpiComm{splitComm, true}; -} - -void MpiComm::allreduce(void const* sendbuf, void* recvbuf, int count, MpiType dtype, MpiOp op) const -{ -#if ENABLE_MULTI_DEVICE - MPICHECK(MPI_Allreduce(sendbuf, recvbuf, count, getMpiDtype(dtype), getMpiOp(op), mComm)); -#else - TLLM_THROW("Multi device support is disabled."); -#endif // ENABLE_MULTI_DEVICE -} - -void MpiComm::allgather(void const* sendbuf, void* recvbuf, int count, MpiType dtype) const -{ -#if ENABLE_MULTI_DEVICE - MPICHECK(MPI_Allgather(sendbuf, count, getMpiDtype(dtype), recvbuf, count, getMpiDtype(dtype), mComm)); -#else - TLLM_THROW("Multi device support is disabled."); -#endif // ENABLE_MULTI_DEVICE -} - -void MpiComm::allgatherv(void const* sendbuf, int sendcount, MpiType sendtype, void* recvbuf, - std::vector const& recvcounts, std::vector const& displs, MpiType recvtype) const -{ -#if ENABLE_MULTI_DEVICE - MPICHECK(MPI_Allgatherv(sendbuf, sendcount, getMpiDtype(sendtype), recvbuf, recvcounts.data(), displs.data(), - getMpiDtype(recvtype), mComm)); - -#else - TLLM_THROW("Multi device support is disabled."); -#endif // ENABLE_MULTI_DEVICE -} - -void MpiComm::mprobe(int source, int tag, MPI_Message* msg, MPI_Status* status) const -{ -#if ENABLE_MULTI_DEVICE - MPICHECK(MPI_Mprobe(source, tag, mComm, msg, status)); -#else - TLLM_THROW("Multi device support is disabled."); -#endif // ENABLE_MULTI_DEVICE -} - -bool MpiComm::improbe(int source, int tag, MPI_Message* msg, MPI_Status* status) const -{ -#if ENABLE_MULTI_DEVICE - int flag{0}; - MPICHECK(MPI_Improbe(source, tag, mComm, &flag, msg, status)); - return flag != 0; -#else - TLLM_THROW("Multi device support is disabled."); - return false; -#endif -} - -bool MpiComm::iprobe(int source, int tag, MPI_Status* status) const -{ -#if ENABLE_MULTI_DEVICE - int flag{0}; - MPICHECK(MPI_Iprobe(source, tag, mComm, &flag, status)); - return flag != 0; -#else - TLLM_THROW("Multi device support is disabled."); - return false; -#endif -} - -void MpiComm::recvPoll(int source, int tag, int periodMs) const -{ - MPI_Status status; - while (!iprobe(source, tag, &status)) - { - std::this_thread::sleep_for(std::chrono::milliseconds(periodMs)); - } -} - -int MpiComm::getRank() const -{ - int rank = 0; -#if ENABLE_MULTI_DEVICE - MPICHECK(MPI_Comm_rank(mComm, &rank)); -#endif - return rank; -} - -int MpiComm::getSize() const -{ - int world_size = 1; -#if ENABLE_MULTI_DEVICE - MPICHECK(MPI_Comm_size(mComm, &world_size)); -#endif - return world_size; -} - -MpiComm const& MpiComm::world() -{ - TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - static MpiComm commWorld{MPI_COMM_WORLD, false}; - initialize(); - TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); - return commWorld; -} - -MpiComm& MpiComm::mutableSession() -{ - TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - static MpiComm commSession{MPI_COMM_WORLD, false}; - initialize(); - TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); - return commSession; -} - -MpiComm& MpiComm::mutableLocalSession() -{ - TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - static MpiComm localSession = initLocalSession(); - TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); - return localSession; -} - -void MpiComm::refreshLocalSession() -{ -#if ENABLE_MULTI_DEVICE - static std::mutex mutex; - std::unique_lock lock(mutex); - auto initSessionRanks = getWorldRanks(MpiComm::session()); - auto localSessionRanks = getWorldRanks(MpiComm::localSession()); - - // Add to intersectionRanks in order of initSessionRanks - std::vector intersectionRanks; - std::unordered_set localSessionRanksSet(localSessionRanks.begin(), localSessionRanks.end()); - for (auto rank : initSessionRanks) - { - if (localSessionRanksSet.find(rank) != localSessionRanksSet.end()) - { - intersectionRanks.push_back(rank); - } - } - - MPI_Group worldGroup = nullptr; - MPICHECK(MPI_Comm_group(MPI_COMM_WORLD, &worldGroup)); - MPI_Group localGroup = nullptr; - MPICHECK(MPI_Group_incl(worldGroup, intersectionRanks.size(), intersectionRanks.data(), &localGroup)); - MPI_Comm localComm = nullptr; - MPICHECK(MPI_Comm_create_group(MPI_COMM_WORLD, localGroup, intersectionRanks.front(), &localComm)); - MpiComm::mutableLocalSession().mFreeComm = true; - MpiComm::mutableLocalSession() = MpiComm{localComm, false}; - TLLM_LOG_INFO("Refreshed the MPI local session"); -#endif // ENABLE_MULTI_DEVICE -} - -MpiComm::MpiComm(MPI_Comm g, bool freeComm) - : mComm{g} - , mFreeComm{freeComm} -{ - TLLM_CHECK(mComm != MPI_COMM_NULL); -} - -MpiComm::~MpiComm() noexcept -{ -#if ENABLE_MULTI_DEVICE - if (mFreeComm && mComm) - { - if (MPI_Comm_free(&mComm) != MPI_SUCCESS) - { - TLLM_LOG_ERROR("MPI_Comm_free failed"); - } - } -#endif // ENABLE_MULTI_DEVICE -} - -MpiComm::MpiComm(MpiComm&& comm) noexcept - : mComm{comm.mComm} - , mFreeComm{comm.mFreeComm} -{ - comm.mFreeComm = false; -} - -MpiComm& MpiComm::operator=(MpiComm&& comm) noexcept -{ - this->~MpiComm(); - mComm = comm.mComm; - mFreeComm = comm.mFreeComm; - comm.mFreeComm = false; - return *this; -} - -MpiWaitThread::MpiWaitThread(std::string name, std::function funcWait, std::function funcSetup) - : mName{name.c_str()} - , mFuncWait{funcWait} - , mFuncSetup{funcSetup} -{ - TLLM_LOG_TRACE("%s: %s start", mName.c_str(), __PRETTY_FUNCTION__); - mThread = std::make_unique(&MpiWaitThread::sideThread, this); - TLLM_LOG_TRACE("%s: %s stop", mName.c_str(), __PRETTY_FUNCTION__); -} - -MpiWaitThread::~MpiWaitThread() -{ - TLLM_LOG_TRACE("%s: %s start", mName.c_str(), __PRETTY_FUNCTION__); - waitStop(); - mShouldExit.store(true); - notifyStart(); - mThread->join(); - mThread.reset(nullptr); - TLLM_LOG_TRACE("%s: %s stop", mName.c_str(), __PRETTY_FUNCTION__); -} - -void MpiWaitThread::sideThread() -{ - if (mFuncSetup) - { - mFuncSetup(); - } - while (!mShouldExit.load()) - { - notifyStop(); - waitStart(); - mFuncWait(); - } -} - -void MpiWaitThread::waitStart() -{ - TLLM_LOG_TRACE("%s: %s start", mName.c_str(), __PRETTY_FUNCTION__); - std::unique_lock lock(mMutex); - mCondVar.wait(lock, [this] { return mRunning; }); - TLLM_LOG_TRACE("%s: %s stop", mName.c_str(), __PRETTY_FUNCTION__); -} - -void MpiWaitThread::waitStop() -{ - TLLM_LOG_TRACE("%s: %s start", mName.c_str(), __PRETTY_FUNCTION__); - std::unique_lock lock(mMutex); - mCondVar.wait(lock, [this] { return !mRunning; }); - TLLM_LOG_TRACE("%s: %s stop", mName.c_str(), __PRETTY_FUNCTION__); -} - -void MpiWaitThread::notifyStart() -{ - TLLM_LOG_TRACE("%s: %s start", mName.c_str(), __PRETTY_FUNCTION__); - std::lock_guard lock(mMutex); - mRunning = true; - mCondVar.notify_one(); - TLLM_LOG_TRACE("%s: %s stop", mName.c_str(), __PRETTY_FUNCTION__); -} - -void MpiWaitThread::notifyStop() -{ - TLLM_LOG_TRACE("%s: %s start", mName.c_str(), __PRETTY_FUNCTION__); - std::lock_guard lock(mMutex); - mRunning = false; - mCondVar.notify_one(); - TLLM_LOG_TRACE("%s: %s stop", mName.c_str(), __PRETTY_FUNCTION__); -} - -} // namespace tensorrt_llm::mpi diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/nvtxUtils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/nvtxUtils.h deleted file mode 100644 index 0a9d51975..000000000 --- a/sgl-kernel/3rdparty/tensorrt_llm/common/nvtxUtils.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include - -namespace tensorrt_llm::common::nvtx -{ -inline nvtx3::color nextColor() -{ -#ifndef NVTX_DISABLE - constexpr std::array kColors{nvtx3::color{0xff00ff00}, nvtx3::color{0xff0000ff}, nvtx3::color{0xffffff00}, - nvtx3::color{0xffff00ff}, nvtx3::color{0xff00ffff}, nvtx3::color{0xffff0000}, nvtx3::color{0xffffffff}}; - constexpr auto numColors = kColors.size(); - - static thread_local std::size_t colorId = 0; - auto const color = kColors[colorId]; - colorId = colorId + 1 >= numColors ? 0 : colorId + 1; - return color; -#else - return nvtx3::color{0}; -#endif -} - -} // namespace tensorrt_llm::common::nvtx - -#define NVTX3_SCOPED_RANGE_WITH_NAME(range, name) \ - ::nvtx3::scoped_range range(::tensorrt_llm::common::nvtx::nextColor(), name) -#define NVTX3_SCOPED_RANGE(range) NVTX3_SCOPED_RANGE_WITH_NAME(range##_range, #range) diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/opUtils.cpp b/sgl-kernel/3rdparty/tensorrt_llm/common/opUtils.cpp deleted file mode 100644 index 39aefda48..000000000 --- a/sgl-kernel/3rdparty/tensorrt_llm/common/opUtils.cpp +++ /dev/null @@ -1,323 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "tensorrt_llm/common/opUtils.h" -#include "tensorrt_llm/common/mpiUtils.h" - -#include "cuda.h" -#include -#include -#include -#include -#include -#include -#include - -#ifdef _MSC_VER -#define FN_NAME __FUNCTION__ -#else -#define FN_NAME __func__ -#endif - -#if ENABLE_MULTI_DEVICE - -std::unordered_map* getDtypeMap() -{ - static std::unordered_map dtypeMap = {{nvinfer1::DataType::kFLOAT, ncclFloat32}, - {nvinfer1::DataType::kHALF, ncclFloat16}, {nvinfer1::DataType::kBF16, ncclBfloat16}}; - return &dtypeMap; -} - -namespace -{ - -// Get NCCL unique ID for a group of ranks. -ncclUniqueId getUniqueId(std::set const& group) noexcept -{ - auto const rank = COMM_SESSION.getRank(); - TLLM_LOG_TRACE("%s start for rank %d", __PRETTY_FUNCTION__, rank); - ncclUniqueId id; - if (rank == *group.begin()) - { - NCCLCHECK(ncclGetUniqueId(&id)); - for (auto it = std::next(std::begin(group), 1); it != group.end(); ++it) - { - COMM_SESSION.sendValue(id, *it, 0); - } - } - else - { - COMM_SESSION.recvValue(id, *group.begin(), 0); - } - TLLM_LOG_TRACE("%s stop for rank %d", __PRETTY_FUNCTION__, rank); - return id; -} -} // namespace - -std::shared_ptr getComm(std::set const& group) -{ - auto const rank = COMM_SESSION.getRank(); - TLLM_LOG_TRACE("%s start for rank %d", __PRETTY_FUNCTION__, rank); - static std::map, std::shared_ptr> commMap; - static std::mutex mutex; - std::lock_guard lock(mutex); - std::ostringstream oss; - int index = 0; - for (auto const& rank : group) - { - if (index != 0) - { - oss << ","; - } - oss << rank; - index++; - } - auto groupStr = oss.str(); - auto it = commMap.find(group); - if (it != commMap.end()) - { - auto ncclComm = it->second; - TLLM_LOG_TRACE("NCCL comm for group(%s) is cached for rank %d", groupStr.c_str(), rank); - return ncclComm; - } - - TLLM_LOG_TRACE("Init NCCL comm for group(%s) for rank %d", groupStr.c_str(), rank); - ncclUniqueId id = getUniqueId(group); - int groupRank = 0; - for (auto const& currentRank : group) - { - if (rank == currentRank) - break; - ++groupRank; - } - TLLM_CHECK(groupRank < group.size()); - std::shared_ptr ncclComm(new ncclComm_t, - [](ncclComm_t* comm) - { - ncclCommDestroy(*comm); - delete comm; - }); - NCCLCHECK(ncclCommInitRank(ncclComm.get(), group.size(), id, groupRank)); - commMap[group] = ncclComm; - TLLM_LOG_TRACE("%s stop for rank %d", __PRETTY_FUNCTION__, rank); - return ncclComm; -} -#endif // ENABLE_MULTI_DEVICE - -void const* tensorrt_llm::common::getCommSessionHandle() -{ -#if ENABLE_MULTI_DEVICE - return &COMM_SESSION; -#else - return nullptr; -#endif // ENABLE_MULTI_DEVICE -} - -namespace -{ - -// Get current cuda context, a default context will be created if there is no context. -inline CUcontext getCurrentCudaCtx() -{ - CUcontext ctx{}; - CUresult err = cuCtxGetCurrent(&ctx); - if (err == CUDA_ERROR_NOT_INITIALIZED || ctx == nullptr) - { - TLLM_CUDA_CHECK(cudaFree(nullptr)); - err = cuCtxGetCurrent(&ctx); - } - TLLM_CHECK(err == CUDA_SUCCESS); - return ctx; -} - -// Helper to create per-cuda-context singleton managed by std::shared_ptr. -// Unlike conventional singletons, singleton created with this will be released -// when not needed, instead of on process exit. -// Objects of this class shall always be declared static / global, and shall never own CUDA -// resources. -template -class PerCudaCtxSingletonCreator -{ -public: - using CreatorFunc = std::function()>; - using DeleterFunc = std::function; - - // creator returning std::unique_ptr is by design. - // It forces separation of memory for T and memory for control blocks. - // So when T is released, but we still have observer weak_ptr in mObservers, the T mem block can be released. - // creator itself must not own CUDA resources. Only the object it creates can. - PerCudaCtxSingletonCreator(CreatorFunc creator, DeleterFunc deleter) - : mCreator{std::move(creator)} - , mDeleter{std::move(deleter)} - { - } - - std::shared_ptr operator()() - { - std::lock_guard lk{mMutex}; - CUcontext ctx{getCurrentCudaCtx()}; - std::shared_ptr result = mObservers[ctx].lock(); - if (result == nullptr) - { - // Create the resource and register with an observer. - result = std::shared_ptr{mCreator().release(), - [this, ctx](T* obj) - { - if (obj == nullptr) - { - return; - } - mDeleter(obj); - - // Clears observer to avoid growth of mObservers, in case users creates/destroys cuda contexts - // frequently. - std::shared_ptr observedObjHolder; // Delay destroy to avoid dead lock. - std::lock_guard lk{mMutex}; - // Must check observer again because another thread may created new instance for this ctx just - // before we lock mMutex. We can't infer that the observer is stale from the fact that obj is - // destroyed, because shared_ptr ref-count checking and observer removing are not in one atomic - // operation, and the observer may be changed to observe another instance. - observedObjHolder = mObservers.at(ctx).lock(); - if (observedObjHolder == nullptr) - { - mObservers.erase(ctx); - } - }}; - mObservers.at(ctx) = result; - } - return result; - } - -private: - CreatorFunc mCreator; - DeleterFunc mDeleter; - mutable std::mutex mMutex; - // CUDA resources are per-context. - std::unordered_map> mObservers; -}; - -template -class PerThreadSingletonCreator -{ -public: - using CreatorFunc = std::function()>; - using DeleterFunc = std::function; - - // creator returning std::unique_ptr is by design. - // It forces separation of memory for T and memory for control blocks. - // So when T is released, but we still have observer weak_ptr in mObservers, the T mem block can be released. - // creator itself must not own CUDA resources. Only the object it creates can. - PerThreadSingletonCreator(CreatorFunc creator, DeleterFunc deleter) - : mCreator{std::move(creator)} - , mDeleter{std::move(deleter)} - { - } - - std::shared_ptr operator()() - { - std::lock_guard lk{mMutex}; - - std::thread::id thread = std::this_thread::get_id(); - std::shared_ptr result = mObservers[thread].lock(); - - if (result == nullptr) - { - // Create the resource and register with an observer. - result = std::shared_ptr{mCreator().release(), - [this, thread](T* obj) - { - if (obj == nullptr) - { - return; - } - mDeleter(obj); - - // Clears observer to avoid growth of mObservers, in case users creates/destroys cuda contexts - // frequently. - std::shared_ptr observedObjHolder; // Delay destroy to avoid dead lock. - std::lock_guard lk{mMutex}; - // Must check observer again because another thread may created new instance for this ctx just - // before we lock mMutex. We can't infer that the observer is stale from the fact that obj is - // destroyed, because shared_ptr ref-count checking and observer removing are not in one atomic - // operation, and the observer may be changed to observe another instance. - observedObjHolder = mObservers.at(thread).lock(); - if (observedObjHolder == nullptr) - { - mObservers.erase(thread); - } - }}; - mObservers.at(thread) = result; - } - return result; - } - -private: - CreatorFunc mCreator; - DeleterFunc mDeleter; - mutable std::mutex mMutex; - // CUDA resources are per-thread. - std::unordered_map> mObservers; -}; - -} // namespace - -std::shared_ptr getCublasHandle() -{ - static PerThreadSingletonCreator creator( - []() -> auto - { - auto handle = std::unique_ptr(new cublasHandle_t); - TLLM_CUDA_CHECK(cublasCreate(handle.get())); - return handle; - }, - [](cublasHandle_t* handle) - { - TLLM_CUDA_CHECK(cublasDestroy(*handle)); - delete handle; - }); - return creator(); -} - -std::shared_ptr getCublasLtHandle() -{ - static PerThreadSingletonCreator creator( - []() -> auto - { - auto handle = std::unique_ptr(new cublasLtHandle_t); - TLLM_CUDA_CHECK(cublasLtCreate(handle.get())); - return handle; - }, - [](cublasLtHandle_t* handle) - { - TLLM_CUDA_CHECK(cublasLtDestroy(*handle)); - delete handle; - }); - return creator(); -} - -std::shared_ptr getCublasMMWrapper(std::shared_ptr cublasHandle, - std::shared_ptr cublasltHandle, cudaStream_t stream, void* workspace) -{ - static PerThreadSingletonCreator creator( - [cublasHandle, cublasltHandle, stream, workspace]() -> auto - { - auto wrapper = std::unique_ptr( - new tensorrt_llm::common::CublasMMWrapper(cublasHandle, cublasltHandle, stream, workspace)); - return wrapper; - }, - [](tensorrt_llm::common::CublasMMWrapper* wrapper) { delete wrapper; }); - return creator(); -} diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/opUtils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/opUtils.h deleted file mode 100644 index 4e278e5cf..000000000 --- a/sgl-kernel/3rdparty/tensorrt_llm/common/opUtils.h +++ /dev/null @@ -1,215 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "tensorrt_llm/common/cublasMMWrapper.h" -#include "tensorrt_llm/common/workspace.h" - -#include -#include -#include -#include -#if ENABLE_MULTI_DEVICE -#include -#endif // ENABLE_MULTI_DEVICE - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace tensorrt_llm::common -{ - -// Write values into buffer -template -void write(char*& buffer, T const& val) -{ - std::memcpy(buffer, &val, sizeof(T)); - buffer += sizeof(T); -} - -// Read values from buffer -template -void read(char const*& buffer, T& val) -{ - std::memcpy(&val, buffer, sizeof(T)); - buffer += sizeof(T); -} - -// Like std::unique_ptr, but does not prevent generation of default copy constructor when used as class members. -// The copy constructor produces nullptr. So the plugin default copy constructor will not really copy this, and -// your clone() implementation is responsible for initializing such data members. -// With this we can simplify clone() implementation when there are many data members including at least one unique_ptr. -template > -class UniqPtrWNullCopy : public std::unique_ptr -{ -public: - using std::unique_ptr::unique_ptr; - - // for compatibility with std::make_unique - explicit UniqPtrWNullCopy(std::unique_ptr&& src) - : std::unique_ptr::unique_ptr{std::move(src)} - { - } - - // copy constructor produces nullptr - UniqPtrWNullCopy(UniqPtrWNullCopy const&) - : std::unique_ptr::unique_ptr{} - { - } -}; - -// for testing only -void const* getCommSessionHandle(); -} // namespace tensorrt_llm::common - -inline bool isBuilding() -{ - auto constexpr key = "IS_BUILDING"; - auto const val = getenv(key); - return val != nullptr && std::string(val) == "1"; -} - -#if ENABLE_MULTI_DEVICE -#define NCCLCHECK(cmd) \ - do \ - { \ - ncclResult_t r = cmd; \ - if (r != ncclSuccess) \ - { \ - printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, ncclGetErrorString(r)); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) - -std::unordered_map* getDtypeMap(); - -std::shared_ptr getComm(std::set const& group); - -#endif // ENABLE_MULTI_DEVICE - -//! To save GPU memory, all the plugins share the same cublas and cublasLt handle globally. -//! Get cublas and cublasLt handle for current cuda context -std::shared_ptr getCublasHandle(); -std::shared_ptr getCublasLtHandle(); -std::shared_ptr getCublasMMWrapper(std::shared_ptr cublasHandle, - std::shared_ptr cublasltHandle, cudaStream_t stream, void* workspace); - -#ifndef DEBUG - -#define PLUGIN_CHECK(status) \ - do \ - { \ - if (status != 0) \ - abort(); \ - } while (0) - -#define ASSERT_PARAM(exp) \ - do \ - { \ - if (!(exp)) \ - return STATUS_BAD_PARAM; \ - } while (0) - -#define ASSERT_FAILURE(exp) \ - do \ - { \ - if (!(exp)) \ - return STATUS_FAILURE; \ - } while (0) - -#define CSC(call, err) \ - do \ - { \ - cudaError_t cudaStatus = call; \ - if (cudaStatus != cudaSuccess) \ - { \ - return err; \ - } \ - } while (0) - -#define DEBUG_PRINTF(...) \ - do \ - { \ - } while (0) - -#else - -#define ASSERT_PARAM(exp) \ - do \ - { \ - if (!(exp)) \ - { \ - fprintf(stderr, "Bad param - " #exp ", %s:%d\n", __FILE__, __LINE__); \ - return STATUS_BAD_PARAM; \ - } \ - } while (0) - -#define ASSERT_FAILURE(exp) \ - do \ - { \ - if (!(exp)) \ - { \ - fprintf(stderr, "Failure - " #exp ", %s:%d\n", __FILE__, __LINE__); \ - return STATUS_FAILURE; \ - } \ - } while (0) - -#define CSC(call, err) \ - do \ - { \ - cudaError_t cudaStatus = call; \ - if (cudaStatus != cudaSuccess) \ - { \ - printf("%s %d CUDA FAIL %s\n", __FILE__, __LINE__, cudaGetErrorString(cudaStatus)); \ - return err; \ - } \ - } while (0) - -#define PLUGIN_CHECK(status) \ - { \ - if (status != 0) \ - { \ - DEBUG_PRINTF("%s %d CUDA FAIL %s\n", __FILE__, __LINE__, cudaGetErrorString(status)); \ - abort(); \ - } \ - } - -#define DEBUG_PRINTF(...) \ - do \ - { \ - printf(__VA_ARGS__); \ - } while (0) - -#endif // DEBUG - -#define NVML_CHECK(cmd) \ - do \ - { \ - nvmlReturn_t r = cmd; \ - if (r != NVML_SUCCESS) \ - { \ - printf("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__, nvmlErrorString(r)); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/quantization.h b/sgl-kernel/3rdparty/tensorrt_llm/common/quantization.h new file mode 100644 index 000000000..052d9c8c8 --- /dev/null +++ b/sgl-kernel/3rdparty/tensorrt_llm/common/quantization.h @@ -0,0 +1,358 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace tensorrt_llm +{ +namespace common +{ + +class QuantMode +{ + // [WARNING] KEEP BELOW DEFINITION IN SYNC WITH tensorrt_llm/quantization/mode.py +public: + using BaseType = std::uint32_t; + + explicit constexpr QuantMode(BaseType value) noexcept + : mValue{value} + { + } + + QuantMode() noexcept = default; + + constexpr QuantMode(QuantMode const&) noexcept = default; + + constexpr QuantMode& operator=(QuantMode const& other) noexcept = default; + + static constexpr QuantMode none() noexcept + { + return QuantMode(BaseType(0)); + } + + static constexpr QuantMode int4Weights() noexcept + { + return QuantMode(BaseType(1u) << 0); + } + + static constexpr QuantMode int8Weights() noexcept + { + return QuantMode(BaseType(1u) << 1); + } + + static constexpr QuantMode activations() noexcept + { + return QuantMode(BaseType(1u) << 2); + } + + static constexpr QuantMode perChannelScaling() noexcept + { + return QuantMode(BaseType(1u) << 3); + } + + static constexpr QuantMode perTokenScaling() noexcept + { + return QuantMode(BaseType(1u) << 4); + } + + static constexpr QuantMode perGroupScaling() noexcept + { + return QuantMode(BaseType(1u) << 5); + } + + static constexpr QuantMode int8KvCache() noexcept + { + return QuantMode(BaseType(1u) << 6); + } + + static constexpr QuantMode fp8KvCache() noexcept + { + return QuantMode(BaseType(1u) << 7); + } + + static constexpr QuantMode fp8Qdq() noexcept + { + return QuantMode(BaseType(1u) << 8); + } + + static constexpr QuantMode fp8RowWise() noexcept + { + return QuantMode(BaseType(1u) << 3 | BaseType(1u) << 4 | BaseType(1u) << 9); + } + + static constexpr QuantMode w4a8QServe() noexcept + { + return QuantMode(BaseType(1u) << 10); + } + + constexpr BaseType value() const noexcept + { + return mValue; + } + + constexpr bool isSet(QuantMode const& mode) const noexcept + { + return (mValue & mode.value()) == mode.value(); + } + + constexpr bool hasInt4Weights() const noexcept + { + return isSet(int4Weights()); + } + + constexpr bool hasInt8Weights() const noexcept + { + return isSet(int8Weights()); + } + + constexpr bool hasActivations() const noexcept + { + return isSet(activations()); + } + + constexpr bool hasPerChannelScaling() const noexcept + { + return isSet(perChannelScaling()); + } + + constexpr bool hasPerTokenScaling() const noexcept + { + return isSet(perTokenScaling()); + } + + constexpr bool hasPerGroupScaling() const noexcept + { + return isSet(perGroupScaling()); + } + + constexpr bool hasStaticActivationScaling() const noexcept + { + return !hasPerTokenScaling(); + } + + constexpr bool hasInt8KvCache() const noexcept + { + return isSet(int8KvCache()); + } + + constexpr bool hasFp8KvCache() const noexcept + { + return isSet(fp8KvCache()); + } + + constexpr bool hasFp8Qdq() const noexcept + { + return isSet(fp8Qdq()); + } + + constexpr bool hasFp8RowWise() const noexcept + { + return isSet(fp8RowWise()); + } + + constexpr bool hasKvCacheQuant() const noexcept + { + return hasInt8KvCache() || hasFp8KvCache(); + } + + static constexpr QuantMode fromDescription(bool quantizeWeights = false, bool quantizeActivations = false, + bool perToken = false, bool perChannel = false, bool perGroup = false, bool useInt4Weights = false, + bool useInt8KvCache = false, bool useFp8KvCache = false, bool useFp8Qdq = false, bool useFp8RowWise = false, + bool useW4a8QServe = false) + { + QuantMode quantMode{}; + if (quantizeWeights) + { + if (useInt4Weights) + quantMode += int4Weights(); + else + quantMode += int8Weights(); + } + + if (quantizeActivations) + { + quantMode += activations(); + } + + if (perChannel) + { + quantMode += QuantMode::perChannelScaling(); + } + if (perToken) + { + quantMode += QuantMode::perTokenScaling(); + } + if (perGroup) + { + quantMode += QuantMode::perGroupScaling(); + } + + if (useInt8KvCache) + { + quantMode += int8KvCache(); + } + + if (useFp8KvCache) + { + quantMode += fp8KvCache(); + } + + if (useFp8Qdq) + { + quantMode += fp8Qdq(); + } + + if (useFp8RowWise) + { + quantMode += fp8RowWise(); + } + + if (useW4a8QServe) + { + quantMode += w4a8QServe(); + } + + return quantMode; + } + + static constexpr QuantMode useSmoothQuant(bool perToken = false, bool perChannel = false) + { + return fromDescription(true, true, perToken, perChannel); + } + + static constexpr QuantMode useQServe(bool perGroup) + { + return fromDescription(true, true, false, false, perGroup, true, false, false, false, false, true); + } + + static constexpr QuantMode useWeightOnly(bool useInt4Weights = false, bool perGroup = false) + { + return fromDescription(true, false, false, false, perGroup, useInt4Weights); + } + + static QuantMode const fromQuantAlgo( + std::optional quantAlgo = std::nullopt, std::optional kvCacheQuantAlgo = std::nullopt) + { + QuantMode quantMode{}; + if (quantAlgo == "W8A16") + { + quantMode = useWeightOnly(false, false); + } + else if (quantAlgo == "W4A16") + { + quantMode = useWeightOnly(true, false); + } + else if (quantAlgo == "W4A16_AWQ") + { + quantMode = useWeightOnly(true, true); + } + else if (quantAlgo == "W4A8_AWQ") + { + quantMode = useWeightOnly(true, true); + } + else if (quantAlgo == "W4A8_QSERVE_PER_GROUP") + { + quantMode = useQServe(false); + } + else if (quantAlgo == "W4A8_QSERVE_PER_CHANNEL") + { + quantMode = useQServe(true); + } + else if (quantAlgo == "W4A16_GPTQ") + { + quantMode = useWeightOnly(true, true); + } + else if (quantAlgo == "W8A8_SQ_PER_CHANNEL") + { + quantMode = useSmoothQuant(false, true); + } + else if (quantAlgo == "W8A8_SQ_PER_TENSOR_PLUGIN") + { + quantMode = useSmoothQuant(false, false); + } + else if (quantAlgo == "W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN") + { + quantMode = useSmoothQuant(true, true); + } + else if (quantAlgo == "W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN") + { + quantMode = useSmoothQuant(false, true); + } + else if (quantAlgo == "W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN") + { + quantMode = useSmoothQuant(true, false); + } + else if (quantAlgo == "FP8") + { + quantMode = fromDescription(false, false, false, false, false, false, false, false, true); + } + else if (quantAlgo == "FP8_ROWWISE") + { + quantMode = fromDescription(false, false, true, true, false, false, false, false, false, true); + } + + if (kvCacheQuantAlgo == "INT8") + { + quantMode += int8KvCache(); + } + else if (kvCacheQuantAlgo == "FP8") + { + quantMode += fp8KvCache(); + } + + return quantMode; + } + + constexpr QuantMode operator+(QuantMode const& other) const noexcept + { + return QuantMode(mValue | other.mValue); + } + + constexpr QuantMode& operator+=(QuantMode const& other) noexcept + { + return *this = *this + other; + } + + constexpr QuantMode operator-(QuantMode const& other) const noexcept + { + return QuantMode(mValue & ~other.mValue); + } + + constexpr QuantMode& operator-=(QuantMode const& other) noexcept + { + return *this = *this - other; + } + + constexpr bool operator==(QuantMode const& other) const noexcept + { + return mValue == other.mValue; + } + + constexpr bool operator!=(QuantMode const& other) const noexcept + { + return !(*this == other); + } + +private: + BaseType mValue{0}; +}; + +} // namespace common +} // namespace tensorrt_llm diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/stlUtils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/stlUtils.h deleted file mode 100644 index 9cda9fa0d..000000000 --- a/sgl-kernel/3rdparty/tensorrt_llm/common/stlUtils.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace tensorrt_llm::common::stl_utils -{ - -template -constexpr TOutputIt basicInclusiveScan(TInputIt first, TInputIt last, TOutputIt dFirst, TBinOp op) -{ - if (first != last) - { - auto val = *first; - while (true) - { - *dFirst = val; - ++dFirst; - ++first; - if (first == last) - { - break; - } - val = op(std::move(val), *first); - } - } - return dFirst; -} - -template -constexpr TOutputIt inclusiveScan(TInputIt first, TInputIt last, TOutputIt dFirst) -{ -#if defined(__GNUC__) && __GNUC__ <= 8 - return basicInclusiveScan(first, last, dFirst, std::plus<>{}); -#else - return std::inclusive_scan(first, last, dFirst); -#endif -} - -template -constexpr TOutputIt basicExclusiveScan(TInputIt first, TInputIt last, TOutputIt dFirst, T init, TBinOp op) -{ - if (first != last) - { - while (true) - { - T tmp{op(init, *first)}; - *dFirst = init; - ++dFirst; - ++first; - if (first == last) - { - break; - } - init = std::move(tmp); - } - } - return dFirst; -} - -template -constexpr TOutputIt exclusiveScan(TInputIt first, TInputIt last, TOutputIt dFirst, T init) -{ -#if defined(__GNUC__) && __GNUC__ <= 8 - return basicExclusiveScan(first, last, dFirst, std::move(init), std::plus<>{}); -#else - return std::exclusive_scan(first, last, dFirst, std::move(init)); -#endif -} - -template -struct HasOperatorOutput : std::false_type -{ -}; - -template -struct HasOperatorOutput() << std::declval()))>> - : std::true_type -{ -}; - -template -std::string toString(T const& t, typename std::enable_if_t::value, int> = 0) -{ - std::ostringstream oss; - oss << t; - return oss.str(); -} - -template -std::string toString(std::optional const& t, typename std::enable_if_t::value, int> = 0) -{ - std::ostringstream oss; - if (t) - { - oss << t.value(); - } - else - { - oss << "None"; - } - return oss.str(); -} - -} // namespace tensorrt_llm::common::stl_utils diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/stringUtils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/stringUtils.h new file mode 100644 index 000000000..9c5ecde98 --- /dev/null +++ b/sgl-kernel/3rdparty/tensorrt_llm/common/stringUtils.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#if ENABLE_BF16 +#include +#endif // ENABLE_BF16 +#include + +#include // std::make_unique +#include // std::stringstream +#include +#include +#include + +namespace tensorrt_llm::common +{ +#if ENABLE_BF16 +static inline std::basic_ostream& operator<<(std::basic_ostream& stream, __nv_bfloat16 const& val) +{ + stream << __bfloat162float(val); + return stream; +} +#endif // ENABLE_BF16 + +static inline std::basic_ostream& operator<<(std::basic_ostream& stream, __half const& val) +{ + stream << __half2float(val); + return stream; +} + +inline std::string fmtstr(std::string const& s) +{ + return s; +} + +inline std::string fmtstr(std::string&& s) +{ + return s; +} + +#if defined(_MSC_VER) +std::string fmtstr(char const* format, ...); +#else +std::string fmtstr(char const* format, ...) __attribute__((format(printf, 1, 2))); +#endif + +// __PRETTY_FUNCTION__ is used for neat debugging printing but is not supported on Windows +// The alternative is __FUNCSIG__, which is similar but not identical +#if defined(_WIN32) +#define __PRETTY_FUNCTION__ __FUNCSIG__ +#endif + +auto constexpr kDefaultDelimiter = ", "; + +template +inline TStream& arr2outCasted(TStream& out, T* arr, size_t size, char const* delim = kDefaultDelimiter) +{ + out << "("; + if (size > 0) + { + for (size_t i = 0; i < size - 1; ++i) + { + out << static_cast(arr[i]) << delim; + } + out << static_cast(arr[size - 1]); + } + out << ")"; + return out; +} + +template +inline TStream& arr2out(TStream& out, T* arr, size_t size, char const* delim = kDefaultDelimiter) +{ + return arr2outCasted(out, arr, size, delim); +} + +template +inline std::string arr2str(T* arr, size_t size, char const* delim = kDefaultDelimiter) +{ + std::stringstream ss; + return arr2out(ss, arr, size, delim).str(); +} + +template +inline std::string vec2str(std::vector const& vec, char const* delim = kDefaultDelimiter) +{ + return arr2str(vec.data(), vec.size(), delim); +} + +inline bool strStartsWith(std::string const& str, std::string const& prefix) +{ + return str.rfind(prefix, 0) == 0; +} + +/// @brief Split a string into a set of strings using a delimiter +std::unordered_set str2set(std::string const& input, char delimiter); + +} // namespace tensorrt_llm::common diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/timestampUtils.cpp b/sgl-kernel/3rdparty/tensorrt_llm/common/timestampUtils.cpp deleted file mode 100644 index c00041abd..000000000 --- a/sgl-kernel/3rdparty/tensorrt_llm/common/timestampUtils.cpp +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include "tensorrt_llm/common/timestampUtils.h" - -namespace tensorrt_llm::common -{ - -std::string getCurrentTimestamp() -{ - auto now = std::chrono::system_clock::now(); - auto now_t = std::chrono::system_clock::to_time_t(now); - auto tm = *std::localtime(&now_t); - - auto epoch_to_now = now.time_since_epoch(); - auto seconds = std::chrono::duration_cast(epoch_to_now); - auto us = std::chrono::duration_cast(epoch_to_now - seconds); - - std::ostringstream stream; - stream << std::put_time(&tm, "%m-%d-%Y %H:%M:%S"); - stream << "." << std::setfill('0') << std::setw(6) << us.count(); - return stream.str(); -} - -} // namespace tensorrt_llm::common diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/timestampUtils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/tllmException.h similarity index 50% rename from sgl-kernel/3rdparty/tensorrt_llm/common/timestampUtils.h rename to sgl-kernel/3rdparty/tensorrt_llm/common/tllmException.h index f52f23028..47e0e63d3 100644 --- a/sgl-kernel/3rdparty/tensorrt_llm/common/timestampUtils.h +++ b/sgl-kernel/3rdparty/tensorrt_llm/common/tllmException.h @@ -14,12 +14,35 @@ * limitations under the License. */ +#pragma once + +#include +#include +#include #include +#define NEW_TLLM_EXCEPTION(...) \ + tensorrt_llm::common::TllmException(__FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__)) + namespace tensorrt_llm::common { -/// @brief Get the current timestamp in the format "MM-DD-YYYY HH:MM:SS:uuuuuu" -std::string getCurrentTimestamp(); +class TllmException : public std::runtime_error +{ +public: + static auto constexpr MAX_FRAMES = 128; + + explicit TllmException(char const* file, std::size_t line, std::string const& msg); + + ~TllmException() noexcept override; + + [[nodiscard]] std::string getTrace() const; + + static std::string demangle(char const* name); + +private: + std::array mCallstack{}; + int mNbFrames; +}; } // namespace tensorrt_llm::common