keep the parts needed for moe_kernels (#3218)
This commit is contained in:
@@ -1,22 +0,0 @@
|
||||
#
|
||||
# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
|
||||
# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
# use this file except in compliance with the License. You may obtain a copy of
|
||||
# the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations under
|
||||
# the License.
|
||||
#
|
||||
file(GLOB SRCS *.cpp)
|
||||
file(GLOB CU_SRCS *.cu)
|
||||
|
||||
add_library(common_src OBJECT ${SRCS} ${CU_SRCS})
|
||||
set_property(TARGET common_src PROPERTY POSITION_INDEPENDENT_CODE ON)
|
||||
set_property(TARGET common_src PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
|
||||
0
sgl-kernel/3rdparty/tensorrt_llm/common/assert.cpp
vendored
Executable file → Normal file
0
sgl-kernel/3rdparty/tensorrt_llm/common/assert.cpp
vendored
Executable file → Normal file
92
sgl-kernel/3rdparty/tensorrt_llm/common/assert.h
vendored
Normal file
92
sgl-kernel/3rdparty/tensorrt_llm/common/assert.h
vendored
Normal file
@@ -0,0 +1,92 @@
|
||||
/*
|
||||
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/stringUtils.h"
|
||||
#include "tensorrt_llm/common/tllmException.h"
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
{
|
||||
[[noreturn]] inline void throwRuntimeError(char const* const file, int const line, std::string const& info = "")
|
||||
{
|
||||
throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info.c_str()));
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
|
||||
class DebugConfig
|
||||
{
|
||||
public:
|
||||
static bool isCheckDebugEnabled();
|
||||
};
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define TLLM_LIKELY(x) (__assume((x) == 1), (x))
|
||||
#define TLLM_UNLIKELY(x) (__assume((x) == 0), (x))
|
||||
#else
|
||||
#define TLLM_LIKELY(x) __builtin_expect((x), 1)
|
||||
#define TLLM_UNLIKELY(x) __builtin_expect((x), 0)
|
||||
#endif
|
||||
|
||||
#define TLLM_CHECK(val) \
|
||||
do \
|
||||
{ \
|
||||
TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0) \
|
||||
: tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
|
||||
} while (0)
|
||||
|
||||
#define TLLM_CHECK_WITH_INFO(val, info, ...) \
|
||||
do \
|
||||
{ \
|
||||
TLLM_LIKELY(static_cast<bool>(val)) \
|
||||
? ((void) 0) \
|
||||
: tensorrt_llm::common::throwRuntimeError( \
|
||||
__FILE__, __LINE__, tensorrt_llm::common::fmtstr(info, ##__VA_ARGS__)); \
|
||||
} while (0)
|
||||
|
||||
#define TLLM_CHECK_DEBUG(val) \
|
||||
do \
|
||||
{ \
|
||||
if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \
|
||||
{ \
|
||||
TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0) \
|
||||
: tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define TLLM_CHECK_DEBUG_WITH_INFO(val, info, ...) \
|
||||
do \
|
||||
{ \
|
||||
if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \
|
||||
{ \
|
||||
TLLM_LIKELY(static_cast<bool>(val)) \
|
||||
? ((void) 0) \
|
||||
: tensorrt_llm::common::throwRuntimeError( \
|
||||
__FILE__, __LINE__, tensorrt_llm::common::fmtstr(info, ##__VA_ARGS__)); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define TLLM_THROW(...) \
|
||||
do \
|
||||
{ \
|
||||
throw NEW_TLLM_EXCEPTION(__VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
#define TLLM_WRAP(ex) \
|
||||
NEW_TLLM_EXCEPTION("%s: %s", tensorrt_llm::common::TllmException::demangle(typeid(ex).name()).c_str(), ex.what())
|
||||
187
sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.cpp
vendored
Normal file
187
sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.cpp
vendored
Normal file
@@ -0,0 +1,187 @@
|
||||
/*
|
||||
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#define CUDA_LIB_NAME "cuda"
|
||||
|
||||
#if defined(_WIN32)
|
||||
#include <windows.h>
|
||||
#define dllOpen(name) LoadLibrary("nv" name ".dll")
|
||||
#define dllClose(handle) FreeLibrary(static_cast<HMODULE>(handle))
|
||||
#define dllGetSym(handle, name) static_cast<void*>(GetProcAddress(static_cast<HMODULE>(handle), name))
|
||||
#else // For non-Windows platforms
|
||||
#include <dlfcn.h>
|
||||
#define dllOpen(name) dlopen("lib" name ".so.1", RTLD_LAZY)
|
||||
#define dllClose(handle) dlclose(handle)
|
||||
#define dllGetSym(handle, name) dlsym(handle, name)
|
||||
#endif // defined(_WIN32)
|
||||
|
||||
#include "cudaDriverWrapper.h"
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include <cstdio>
|
||||
#include <cuda.h>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
{
|
||||
|
||||
std::shared_ptr<CUDADriverWrapper> CUDADriverWrapper::getInstance()
|
||||
{
|
||||
static std::mutex mutex;
|
||||
static std::weak_ptr<CUDADriverWrapper> instance;
|
||||
std::shared_ptr<CUDADriverWrapper> result = instance.lock();
|
||||
if (result)
|
||||
{
|
||||
return result;
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
result = instance.lock();
|
||||
if (!result)
|
||||
{
|
||||
result = std::shared_ptr<CUDADriverWrapper>(new CUDADriverWrapper());
|
||||
instance = result;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
CUDADriverWrapper::CUDADriverWrapper()
|
||||
: handle(dllOpen(CUDA_LIB_NAME))
|
||||
{
|
||||
|
||||
TLLM_CHECK_WITH_INFO(handle != nullptr, "CUDA driver library is not open correctly.");
|
||||
|
||||
auto load_sym = [](void* handle, char const* name)
|
||||
{
|
||||
void* ret = dllGetSym(handle, name);
|
||||
return ret;
|
||||
};
|
||||
|
||||
*reinterpret_cast<void**>(&_cuGetErrorName) = load_sym(handle, "cuGetErrorName");
|
||||
*reinterpret_cast<void**>(&_cuGetErrorMessage) = load_sym(handle, "cuGetErrorMessage");
|
||||
*reinterpret_cast<void**>(&_cuFuncSetAttribute) = load_sym(handle, "cuFuncSetAttribute");
|
||||
*reinterpret_cast<void**>(&_cuLinkComplete) = load_sym(handle, "cuLinkComplete");
|
||||
*reinterpret_cast<void**>(&_cuModuleUnload) = load_sym(handle, "cuModuleUnload");
|
||||
*reinterpret_cast<void**>(&_cuLinkDestroy) = load_sym(handle, "cuLinkDestroy");
|
||||
*reinterpret_cast<void**>(&_cuModuleLoadData) = load_sym(handle, "cuModuleLoadData");
|
||||
*reinterpret_cast<void**>(&_cuLinkCreate) = load_sym(handle, "cuLinkCreate_v2");
|
||||
*reinterpret_cast<void**>(&_cuModuleGetFunction) = load_sym(handle, "cuModuleGetFunction");
|
||||
*reinterpret_cast<void**>(&_cuModuleGetGlobal) = load_sym(handle, "cuModuleGetGlobal_v2");
|
||||
*reinterpret_cast<void**>(&_cuLinkAddFile) = load_sym(handle, "cuLinkAddFile_v2");
|
||||
*reinterpret_cast<void**>(&_cuLinkAddData) = load_sym(handle, "cuLinkAddData_v2");
|
||||
*reinterpret_cast<void**>(&_cuLaunchCooperativeKernel) = load_sym(handle, "cuLaunchCooperativeKernel");
|
||||
*reinterpret_cast<void**>(&_cuLaunchKernel) = load_sym(handle, "cuLaunchKernel");
|
||||
*reinterpret_cast<void**>(&_cuTensorMapEncodeTiled) = load_sym(handle, "cuTensorMapEncodeTiled");
|
||||
*reinterpret_cast<void**>(&_cuMemcpyDtoH) = load_sym(handle, "cuMemcpyDtoH_v2");
|
||||
}
|
||||
|
||||
CUDADriverWrapper::~CUDADriverWrapper()
|
||||
{
|
||||
dllClose(handle);
|
||||
}
|
||||
|
||||
CUresult CUDADriverWrapper::cuGetErrorName(CUresult error, char const** pStr) const
|
||||
{
|
||||
return (*_cuGetErrorName)(error, pStr);
|
||||
}
|
||||
|
||||
CUresult CUDADriverWrapper::cuGetErrorMessage(CUresult error, char const** pStr) const
|
||||
{
|
||||
return (*_cuGetErrorMessage)(error, pStr);
|
||||
}
|
||||
|
||||
CUresult CUDADriverWrapper::cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) const
|
||||
{
|
||||
return (*_cuFuncSetAttribute)(hfunc, attrib, value);
|
||||
}
|
||||
|
||||
CUresult CUDADriverWrapper::cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) const
|
||||
{
|
||||
return (*_cuLinkComplete)(state, cubinOut, sizeOut);
|
||||
}
|
||||
|
||||
CUresult CUDADriverWrapper::cuModuleUnload(CUmodule hmod) const
|
||||
{
|
||||
return (*_cuModuleUnload)(hmod);
|
||||
}
|
||||
|
||||
CUresult CUDADriverWrapper::cuLinkDestroy(CUlinkState state) const
|
||||
{
|
||||
return (*_cuLinkDestroy)(state);
|
||||
}
|
||||
|
||||
CUresult CUDADriverWrapper::cuModuleLoadData(CUmodule* module, void const* image) const
|
||||
{
|
||||
return (*_cuModuleLoadData)(module, image);
|
||||
}
|
||||
|
||||
CUresult CUDADriverWrapper::cuLinkCreate(
|
||||
unsigned int numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) const
|
||||
{
|
||||
return (*_cuLinkCreate)(numOptions, options, optionValues, stateOut);
|
||||
}
|
||||
|
||||
CUresult CUDADriverWrapper::cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, char const* name) const
|
||||
{
|
||||
return (*_cuModuleGetFunction)(hfunc, hmod, name);
|
||||
}
|
||||
|
||||
CUresult CUDADriverWrapper::cuModuleGetGlobal(CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, char const* name) const
|
||||
{
|
||||
return (*_cuModuleGetGlobal)(dptr, bytes, hmod, name);
|
||||
}
|
||||
|
||||
CUresult CUDADriverWrapper::cuLinkAddFile(CUlinkState state, CUjitInputType type, char const* path,
|
||||
unsigned int numOptions, CUjit_option* options, void** optionValues) const
|
||||
{
|
||||
return (*_cuLinkAddFile)(state, type, path, numOptions, options, optionValues);
|
||||
}
|
||||
|
||||
CUresult CUDADriverWrapper::cuLinkAddData(CUlinkState state, CUjitInputType type, void* data, size_t size,
|
||||
char const* name, unsigned int numOptions, CUjit_option* options, void** optionValues) const
|
||||
{
|
||||
return (*_cuLinkAddData)(state, type, data, size, name, numOptions, options, optionValues);
|
||||
}
|
||||
|
||||
CUresult CUDADriverWrapper::cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
|
||||
unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
|
||||
unsigned int sharedMemBytes, CUstream hStream, void** kernelParams) const
|
||||
{
|
||||
return (*_cuLaunchCooperativeKernel)(
|
||||
f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams);
|
||||
}
|
||||
|
||||
CUresult CUDADriverWrapper::cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
|
||||
unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
|
||||
unsigned int sharedMemBytes, CUstream hStream, void** kernelParams, void** extra) const
|
||||
{
|
||||
return (*_cuLaunchKernel)(
|
||||
f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra);
|
||||
}
|
||||
|
||||
CUresult CUDADriverWrapper::cuTensorMapEncodeTiled(CUtensorMap* tensorMap, CUtensorMapDataType tensorDataType,
|
||||
cuuint32_t tensorRank, void* globalAddress, cuuint64_t const* globalDim, cuuint64_t const* globalStrides,
|
||||
cuuint32_t const* boxDim, cuuint32_t const* elementStrides, CUtensorMapInterleave interleave,
|
||||
CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill) const
|
||||
{
|
||||
return (*_cuTensorMapEncodeTiled)(tensorMap, tensorDataType, tensorRank, globalAddress, globalDim, globalStrides,
|
||||
boxDim, elementStrides, interleave, swizzle, l2Promotion, oobFill);
|
||||
}
|
||||
|
||||
CUresult CUDADriverWrapper::cuMemcpyDtoH(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount) const
|
||||
{
|
||||
return (*_cuMemcpyDtoH)(dstHost, srcDevice, ByteCount);
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
138
sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.h
vendored
Normal file
138
sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.h
vendored
Normal file
@@ -0,0 +1,138 @@
|
||||
/*
|
||||
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef CUDA_DRIVER_WRAPPER_H
|
||||
#define CUDA_DRIVER_WRAPPER_H
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include <cstdio>
|
||||
#include <cuda.h>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
{
|
||||
|
||||
class CUDADriverWrapper
|
||||
{
|
||||
public:
|
||||
static std::shared_ptr<CUDADriverWrapper> getInstance();
|
||||
|
||||
~CUDADriverWrapper();
|
||||
CUDADriverWrapper(CUDADriverWrapper const&) = delete;
|
||||
CUDADriverWrapper operator=(CUDADriverWrapper const&) = delete;
|
||||
CUDADriverWrapper(CUDADriverWrapper&&) = delete;
|
||||
CUDADriverWrapper operator=(CUDADriverWrapper&&) = delete;
|
||||
|
||||
CUresult cuGetErrorName(CUresult error, char const** pStr) const;
|
||||
|
||||
CUresult cuGetErrorMessage(CUresult error, char const** pStr) const;
|
||||
|
||||
CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) const;
|
||||
|
||||
CUresult cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) const;
|
||||
|
||||
CUresult cuModuleUnload(CUmodule hmod) const;
|
||||
|
||||
CUresult cuLinkDestroy(CUlinkState state) const;
|
||||
|
||||
CUresult cuModuleLoadData(CUmodule* module, void const* image) const;
|
||||
|
||||
CUresult cuLinkCreate(
|
||||
unsigned int numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) const;
|
||||
|
||||
CUresult cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, char const* name) const;
|
||||
|
||||
CUresult cuModuleGetGlobal(CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, char const* name) const;
|
||||
|
||||
CUresult cuLinkAddFile(CUlinkState state, CUjitInputType type, char const* path, unsigned int numOptions,
|
||||
CUjit_option* options, void** optionValues) const;
|
||||
|
||||
CUresult cuLinkAddData(CUlinkState state, CUjitInputType type, void* data, size_t size, char const* name,
|
||||
unsigned int numOptions, CUjit_option* options, void** optionValues) const;
|
||||
|
||||
CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
|
||||
unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
|
||||
unsigned int sharedMemBytes, CUstream hStream, void** kernelParams) const;
|
||||
|
||||
CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
|
||||
unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes,
|
||||
CUstream hStream, void** kernelParams, void** extra) const;
|
||||
|
||||
CUresult cuTensorMapEncodeTiled(CUtensorMap* tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank,
|
||||
void* globalAddress, cuuint64_t const* globalDim, cuuint64_t const* globalStrides, cuuint32_t const* boxDim,
|
||||
cuuint32_t const* elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle,
|
||||
CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill) const;
|
||||
|
||||
CUresult cuMemcpyDtoH(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount) const;
|
||||
|
||||
private:
|
||||
void* handle;
|
||||
CUDADriverWrapper();
|
||||
|
||||
CUresult (*_cuGetErrorName)(CUresult, char const**);
|
||||
CUresult (*_cuGetErrorMessage)(CUresult, char const**);
|
||||
CUresult (*_cuFuncSetAttribute)(CUfunction, CUfunction_attribute, int);
|
||||
CUresult (*_cuLinkComplete)(CUlinkState, void**, size_t*);
|
||||
CUresult (*_cuModuleUnload)(CUmodule);
|
||||
CUresult (*_cuLinkDestroy)(CUlinkState);
|
||||
CUresult (*_cuLinkCreate)(unsigned int, CUjit_option*, void**, CUlinkState*);
|
||||
CUresult (*_cuModuleLoadData)(CUmodule*, void const*);
|
||||
CUresult (*_cuModuleGetFunction)(CUfunction*, CUmodule, char const*);
|
||||
CUresult (*_cuModuleGetGlobal)(CUdeviceptr*, size_t*, CUmodule, char const*);
|
||||
CUresult (*_cuLinkAddFile)(CUlinkState, CUjitInputType, char const*, unsigned int, CUjit_option*, void**);
|
||||
CUresult (*_cuLinkAddData)(
|
||||
CUlinkState, CUjitInputType, void*, size_t, char const*, unsigned int, CUjit_option*, void**);
|
||||
CUresult (*_cuLaunchCooperativeKernel)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int,
|
||||
unsigned int, unsigned int, unsigned int, CUstream, void**);
|
||||
CUresult (*_cuLaunchKernel)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
|
||||
unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes,
|
||||
CUstream hStream, void** kernelParams, void** extra);
|
||||
CUresult (*_cuTensorMapEncodeTiled)(CUtensorMap* tensorMap, CUtensorMapDataType tensorDataType,
|
||||
cuuint32_t tensorRank, void* globalAddress, cuuint64_t const* globalDim, cuuint64_t const* globalStrides,
|
||||
cuuint32_t const* boxDim, cuuint32_t const* elementStrides, CUtensorMapInterleave interleave,
|
||||
CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
|
||||
CUresult (*_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount);
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
void checkDriver(
|
||||
T result, CUDADriverWrapper const& wrap, char const* const func, char const* const file, int const line)
|
||||
{
|
||||
if (result)
|
||||
{
|
||||
char const* errorName = nullptr;
|
||||
char const* errorMsg = nullptr;
|
||||
wrap.cuGetErrorName(result, &errorName);
|
||||
wrap.cuGetErrorMessage(result, &errorMsg);
|
||||
throw TllmException(
|
||||
file, line, fmtstr("[TensorRT-LLM][ERROR] CUDA driver error in %s: %s: %s", func, errorName, errorMsg));
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
|
||||
/*
|
||||
* Macros compliant with TensorRT coding conventions
|
||||
*/
|
||||
#define TLLM_CU_CHECK(stat) \
|
||||
do \
|
||||
{ \
|
||||
tensorrt_llm::common::checkDriver( \
|
||||
(stat), *tensorrt_llm::common::CUDADriverWrapper::getInstance(), #stat, __FILE__, __LINE__); \
|
||||
} while (0)
|
||||
|
||||
#endif // CUDA_DRIVER_WRAPPER_H
|
||||
239
sgl-kernel/3rdparty/tensorrt_llm/common/cudaFp8Utils.h
vendored
Normal file
239
sgl-kernel/3rdparty/tensorrt_llm/common/cudaFp8Utils.h
vendored
Normal file
@@ -0,0 +1,239 @@
|
||||
/*
|
||||
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef ENABLE_FP8
|
||||
#include <cuda_fp8.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#define FP8_MHA
|
||||
#define FUSE_GEMM_ACT
|
||||
#define FP8_GEMM_OUTPUT_QUANT_DISABLE
|
||||
|
||||
#ifdef FUSE_GEMM_ACT
|
||||
#define USE_QGMMA
|
||||
#endif
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
namespace common
|
||||
{
|
||||
|
||||
constexpr float FP8_E4M3_MAX = 448.0f;
|
||||
|
||||
enum QuantizeMode
|
||||
{
|
||||
PER_CHANNEL,
|
||||
PER_TENSOR,
|
||||
PER_CHANNEL_WEIGHT_PER_TENSOR_ACT,
|
||||
PER_TOKEN,
|
||||
};
|
||||
|
||||
// Packed Data Type
|
||||
typedef struct __CUDA_ALIGN__(32)
|
||||
{
|
||||
float array[8];
|
||||
} float8;
|
||||
|
||||
typedef struct __CUDA_ALIGN__(16)
|
||||
{
|
||||
half array[8];
|
||||
} half8;
|
||||
|
||||
typedef struct __CUDA_ALIGN__(8)
|
||||
{
|
||||
half2 array[2];
|
||||
} half2_2;
|
||||
|
||||
typedef struct __CUDA_ALIGN__(8)
|
||||
{
|
||||
half array[4];
|
||||
} half_4;
|
||||
|
||||
#ifdef ENABLE_BF16
|
||||
typedef struct __CUDA_ALIGN__(4)
|
||||
{
|
||||
__nv_bfloat16 array[2];
|
||||
} __nv_bfloat16_2;
|
||||
|
||||
typedef struct __CUDA_ALIGN__(8)
|
||||
{
|
||||
__nv_bfloat162 x, y;
|
||||
} __nv_bfloat162_2_xy;
|
||||
|
||||
typedef struct __CUDA_ALIGN__(8)
|
||||
{
|
||||
__nv_bfloat16 array[4];
|
||||
} __nv_bfloat164;
|
||||
|
||||
typedef struct __CUDA_ALIGN__(8)
|
||||
{
|
||||
__nv_bfloat162 array[2];
|
||||
} __nv_bfloat162_2;
|
||||
|
||||
typedef struct __CUDA_ALIGN__(16)
|
||||
{
|
||||
__nv_bfloat16 array[8];
|
||||
} __nv_bfloat168;
|
||||
|
||||
typedef struct __CUDA_ALIGN__(16)
|
||||
{
|
||||
__nv_bfloat162 array[4];
|
||||
} __nv_bfloat162_4;
|
||||
|
||||
typedef struct __CUDA_ALIGN__(32)
|
||||
{
|
||||
__nv_bfloat16 array[16];
|
||||
} __nv_bfloat1616;
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_FP8
|
||||
typedef struct __CUDA_ALIGN__(2)
|
||||
{
|
||||
__nv_fp8_e4m3 array[2];
|
||||
} __nv_fp8_2_e4m3;
|
||||
|
||||
typedef struct __CUDA_ALIGN__(4)
|
||||
{
|
||||
__nv_fp8_e4m3 array[4];
|
||||
} __nv_fp8_4_e4m3;
|
||||
|
||||
typedef struct __CUDA_ALIGN__(4)
|
||||
{
|
||||
__nv_fp8x2_e4m3 array[2];
|
||||
} __nv_fp8x2_x2_e4m3;
|
||||
|
||||
typedef struct __CUDA_ALIGN__(8)
|
||||
{
|
||||
__nv_fp8_e4m3 array[8];
|
||||
} __nv_fp8_8_e4m3;
|
||||
|
||||
typedef struct __CUDA_ALIGN__(8)
|
||||
{
|
||||
__nv_fp8x2_e4m3 array[4];
|
||||
} __nv_fp8x2_x4_e4m3;
|
||||
|
||||
typedef struct __CUDA_ALIGN__(16)
|
||||
{
|
||||
__nv_fp8_e4m3 array[16];
|
||||
} __nv_fp8x16_e4m3;
|
||||
#endif
|
||||
|
||||
// only BF16 and FP8
|
||||
template <typename T, int PACK_SIZE>
|
||||
struct PackType
|
||||
{
|
||||
using type = float;
|
||||
};
|
||||
|
||||
#ifdef ENABLE_BF16
|
||||
template <>
|
||||
struct PackType<__nv_bfloat16, 2>
|
||||
{
|
||||
using type = __nv_bfloat16_2;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct PackType<__nv_bfloat16, 4>
|
||||
{
|
||||
using type = __nv_bfloat164;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct PackType<__nv_bfloat16, 8>
|
||||
{
|
||||
using type = __nv_bfloat168;
|
||||
};
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_FP8
|
||||
template <>
|
||||
struct PackType<__nv_fp8_e4m3, 2>
|
||||
{
|
||||
using type = __nv_fp8_2_e4m3;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct PackType<__nv_fp8_e4m3, 4>
|
||||
{
|
||||
using type = __nv_fp8_4_e4m3;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct PackType<__nv_fp8_e4m3, 8>
|
||||
{
|
||||
using type = __nv_fp8_8_e4m3;
|
||||
};
|
||||
#endif
|
||||
|
||||
__inline__ __device__ void fp8x4_e4m3_to_bfloat2(__nv_bfloat162* out1, __nv_bfloat162* out2, __nv_fp8x4_e4m3 const* in)
|
||||
{
|
||||
const char4 tmp_val = reinterpret_cast<char4 const*>(in)[0];
|
||||
*out1 = __nv_bfloat162((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.x)[0],
|
||||
(float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.y)[0]);
|
||||
*out2 = __nv_bfloat162((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.z)[0],
|
||||
(float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.w)[0]);
|
||||
}
|
||||
|
||||
__inline__ __device__ __nv_bfloat162 fp8x2_e4m3_to_bfloat2(__nv_fp8x2_e4m3 const* in)
|
||||
{
|
||||
const char2 tmp_val = reinterpret_cast<char2 const*>(in)[0];
|
||||
__nv_bfloat162 out = __nv_bfloat162((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.x)[0],
|
||||
(float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.y)[0]);
|
||||
return out;
|
||||
}
|
||||
|
||||
__inline__ __device__ void fp8x4_e4m3_to_half2(half2* out1, half2* out2, __nv_fp8x4_e4m3 const* in)
|
||||
{
|
||||
const char4 tmp_val = reinterpret_cast<char4 const*>(in)[0];
|
||||
*out1 = half2((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.x)[0],
|
||||
(float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.y)[0]);
|
||||
*out2 = half2((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.z)[0],
|
||||
(float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.w)[0]);
|
||||
}
|
||||
|
||||
__inline__ __device__ half2 fp8x2_e4m3_to_half2(__nv_fp8x2_e4m3 const* in)
|
||||
{
|
||||
const char2 tmp_val = reinterpret_cast<char2 const*>(in)[0];
|
||||
half2 out = half2((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.x)[0],
|
||||
(float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.y)[0]);
|
||||
return out;
|
||||
}
|
||||
|
||||
template <typename T_OUT, typename T_S, typename T_IN>
|
||||
void invokeQuantizeMatrix(T_OUT* output, T_S const* input_qua_amax_ptr, T_IN const* input, int64_t numel, int64_t lda,
|
||||
QuantizeMode quantize_mode, cudaStream_t stream);
|
||||
|
||||
template <typename T_OUT, typename T_S, typename T_IN>
|
||||
void invokeDequantizeMatrix(T_OUT* output, T_S const* input_qua_amax_ptr, T_IN const* input, int64_t numel, int64_t lda,
|
||||
QuantizeMode quantize_mode, cudaStream_t stream);
|
||||
|
||||
template <typename T_FAKE, typename T_OUT, typename T_IN>
|
||||
void invokeFakeQuantize(T_OUT* dst, const T_IN* src, const int64_t numel, cudaStream_t stream);
|
||||
|
||||
template <typename T_S, typename T_W>
|
||||
void invokeComputeFP8QuantizeScale(T_S* quant_ptr, const T_W* weights, const int64_t k, const int64_t lda,
|
||||
QuantizeMode quantize_mode, cudaStream_t stream);
|
||||
|
||||
template <typename T_OUT, typename T_S, typename T_IN>
|
||||
void invokeComputeScalesAndQuantizeMatrix(T_OUT* output, T_S* quant_ptr, const T_IN* weights, const int64_t numel,
|
||||
const int64_t lda, QuantizeMode quantize_mode, cudaStream_t stream);
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
#endif // ENABLE_FP8
|
||||
@@ -1,84 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/cudaProfilerUtils.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include "tensorrt_llm/common/stringUtils.h"
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
std::tuple<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIterationIndexesImpl(
|
||||
std::string const& envVarName)
|
||||
{
|
||||
auto envVarVal = std::getenv(envVarName.c_str());
|
||||
auto envVarValStr = std::string{envVarVal != nullptr ? envVarVal : ""};
|
||||
auto values = tensorrt_llm::common::str2set(envVarValStr, ',');
|
||||
std::unordered_set<int32_t> startSet;
|
||||
std::unordered_set<int32_t> endSet;
|
||||
for (std::string const& value : values)
|
||||
{
|
||||
size_t dashIdx = value.find("-");
|
||||
if (dashIdx != std::string::npos)
|
||||
{
|
||||
int32_t start = std::stoi(value.substr(0, dashIdx));
|
||||
startSet.insert(start);
|
||||
int32_t end = std::stoi(value.substr(dashIdx + 1));
|
||||
endSet.insert(end);
|
||||
}
|
||||
else
|
||||
{
|
||||
int32_t start_end = std::stoi(value);
|
||||
startSet.insert(start_end);
|
||||
endSet.insert(start_end);
|
||||
}
|
||||
}
|
||||
|
||||
return std::make_pair(startSet, endSet);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
{
|
||||
|
||||
std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIterationIndexes(
|
||||
std::string const& envVarName, std::optional<std::string> const& legacyEnvVarName)
|
||||
{
|
||||
auto [profileIterIdxs, stopIterIdxs] = populateIterationIndexesImpl(envVarName);
|
||||
|
||||
// If empty, try to use legacy env var name
|
||||
if (legacyEnvVarName && profileIterIdxs.empty() && stopIterIdxs.empty())
|
||||
{
|
||||
std::tie(profileIterIdxs, stopIterIdxs) = populateIterationIndexesImpl(legacyEnvVarName.value());
|
||||
|
||||
if (!profileIterIdxs.empty() || !stopIterIdxs.empty())
|
||||
{
|
||||
TLLM_LOG_WARNING(
|
||||
"Using deprecated environment variable %s to specify cudaProfiler start and stop iterations. "
|
||||
"Please "
|
||||
"use %s "
|
||||
"instead.",
|
||||
legacyEnvVarName.value().c_str(), envVarName.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
return std::make_pair(profileIterIdxs, stopIterIdxs);
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
641
sgl-kernel/3rdparty/tensorrt_llm/common/cudaUtils.h
vendored
Normal file
641
sgl-kernel/3rdparty/tensorrt_llm/common/cudaUtils.h
vendored
Normal file
@@ -0,0 +1,641 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/cudaBf16Wrapper.h"
|
||||
#include "tensorrt_llm/common/cudaDriverWrapper.h"
|
||||
#include "tensorrt_llm/common/cudaFp8Utils.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include "tensorrt_llm/common/tllmException.h"
|
||||
#include <algorithm>
#include <cinttypes>
#include <cmath>
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <driver_types.h>
#include <fstream>
#include <iostream>
#include <memory>
#include <optional>
#include <string>
|
||||
#ifndef _WIN32 // Linux
|
||||
#include <sys/sysinfo.h>
|
||||
#endif // not WIN32
|
||||
#include <vector>
|
||||
#ifdef _WIN32 // Windows
|
||||
#include <windows.h>
|
||||
#undef ERROR // A Windows header file defines ERROR as 0, but it's used in our logger.h enum. Logging breaks without
|
||||
// this undef.
|
||||
#endif // WIN32
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
{
|
||||
|
||||
// workspace for cublas gemm : 32MB
#define CUBLAS_WORKSPACE_SIZE 33554432

// Four packed fp16 values. NOTE(review): alignment is 4 bytes, not 8 —
// presumably intentional for the consumers of this type; confirm before
// changing, since it affects vectorized load/store codegen.
typedef struct __align__(4)
{
    half x, y, z, w;
}

half4;

/* **************************** type definition ***************************** */

// Element-type tags used when selecting cuBLAS GEMM configurations.
enum CublasDataType
{
    FLOAT_DATATYPE = 0,
    HALF_DATATYPE = 1,
    BFLOAT16_DATATYPE = 2,
    INT8_DATATYPE = 3,
    FP8_DATATYPE = 4
};

// TensorRT-LLM's own element-type enumeration (kept distinct from the
// cuBLAS-oriented CublasDataType above; same value ordering).
enum TRTLLMCudaDataType
{
    FP32 = 0,
    FP16 = 1,
    BF16 = 2,
    INT8 = 3,
    FP8 = 4
};

// Compute-precision selector for operations.
enum class OperationType
{
    FP32,
    FP16,
    BF16,
    INT8,
    FP8
};
|
||||
|
||||
/* **************************** debug tools ********************************* */
|
||||
/* **************************** debug tools ********************************* */
// Translate a CUDA runtime error code into its human-readable message.
static char const* _cudaGetErrorEnum(cudaError_t error)
{
    return cudaGetErrorString(error);
}
|
||||
|
||||
// Map a cuBLAS status code to the matching enumerator name.
// Returns "<unknown>" for values outside the known set (future-proofing
// against status codes added by newer cuBLAS releases).
static char const* _cudaGetErrorEnum(cublasStatus_t error)
{
    switch (error)
    {
    case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
    case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
    case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR";
    }
    return "<unknown>";
}
|
||||
|
||||
// Throw a TllmException (tagged with the call site) when a CUDA/cuBLAS API
// call returns a non-success status. `func` is the stringified expression;
// `file`/`line` locate the call site (normally supplied by check_cuda_error).
template <typename T>
void check(T result, char const* const func, char const* const file, int const line)
{
    // Non-zero status codes convert to true for the supported status types.
    if (result)
    {
        throw TllmException(
            file, line, fmtstr("[TensorRT-LLM][ERROR] CUDA runtime error in %s: %s", func, _cudaGetErrorEnum(result)));
    }
}
|
||||
|
||||
// Like check(), but treats every status in `validReturns` as acceptable.
// Used on teardown paths where e.g. cudaErrorCudartUnloading must be ignored.
template <typename T>
void checkEx(T result, std::initializer_list<T> const& validReturns, char const* const func, char const* const file,
    int const line)
{
    // Throw only when the status is not in the accepted set.
    bool const accepted = std::any_of(
        std::begin(validReturns), std::end(validReturns), [&result](T const& t) { return t == result; });
    if (!accepted)
    {
        throw TllmException(
            file, line, fmtstr("[TensorRT-LLM][ERROR] CUDA runtime error in %s: %s", func, _cudaGetErrorEnum(result)));
    }
}
|
||||
|
||||
// Convenience wrappers: capture the expression text and call site for check().
#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)
// Variant for callers that forward a file/line captured elsewhere.
#define check_cuda_error_2(val, file, line) check((val), #val, file, line)
|
||||
|
||||
// Read CUDA_LAUNCH_BLOCKING once and cache the answer for the process
// lifetime: true for "1", false for "0", nullopt when unset or any other
// value. The first call fixes the result; later env changes are ignored.
inline std::optional<bool> isCudaLaunchBlocking()
{
    static std::optional<bool> const cached = []() -> std::optional<bool>
    {
        char const* env = std::getenv("CUDA_LAUNCH_BLOCKING");
        if (env == nullptr)
        {
            return std::nullopt;
        }
        std::string const value{env};
        if (value == "1")
        {
            return true;
        }
        if (value == "0")
        {
            return false;
        }
        return std::nullopt;
    }();
    return cached;
}
|
||||
|
||||
inline bool doCheckError()
|
||||
{
|
||||
auto const cudaLaunchBlocking = isCudaLaunchBlocking();
|
||||
#ifndef NDEBUG
|
||||
bool const checkError = cudaLaunchBlocking.value_or(true);
|
||||
#else
|
||||
bool const checkError = cudaLaunchBlocking.value_or(false);
|
||||
#endif
|
||||
|
||||
return checkError;
|
||||
}
|
||||
|
||||
// When error checking is enabled (see doCheckError), synchronize the device
// and surface any pending asynchronous CUDA error at the given call site.
inline void syncAndCheck(char const* const file, int const line)
{
    if (doCheckError())
    {
        cudaDeviceSynchronize();
        check(cudaGetLastError(), "cudaGetLastError", file, line);
    }
}

// Call-site macro so failures report the user's file/line, not this header's.
#define sync_check_cuda_error() tensorrt_llm::common::syncAndCheck(__FILE__, __LINE__)

// Debug helper: print the enclosing function's name to stdout.
#define PRINT_FUNC_NAME_()                                                                                             \
    do                                                                                                                 \
    {                                                                                                                  \
        std::cout << "[TensorRT-LLM][CALL] " << __FUNCTION__ << " " << std::endl;                                      \
    } while (0)
|
||||
|
||||
// clang-format off

// packed_type<T>::type is the natural 2-wide packed pairing of scalar T.
template<typename T> struct packed_type;
template <> struct packed_type<float> { using type = float; }; // we don't need to pack float by default
template <> struct packed_type<half> { using type = half2; };

#ifdef ENABLE_BF16
template<>
struct packed_type<__nv_bfloat16> {
    using type = __nv_bfloat162;
};
#endif

#ifdef ENABLE_FP8
template<>
struct packed_type<__nv_fp8_e4m3> {
    using type = __nv_fp8x2_e4m3;
};
#endif

// num_elems<T>::value is the number of scalar lanes carried by vector type T.
template<typename T> struct num_elems;
template <> struct num_elems<float> { static constexpr int value = 1; };
template <> struct num_elems<float2> { static constexpr int value = 2; };
template <> struct num_elems<float4> { static constexpr int value = 4; };
template <> struct num_elems<half> { static constexpr int value = 1; };
template <> struct num_elems<half2> { static constexpr int value = 2; };
#ifdef ENABLE_BF16
template <> struct num_elems<__nv_bfloat16> { static constexpr int value = 1; };
template <> struct num_elems<__nv_bfloat162> { static constexpr int value = 2; };
#endif
#ifdef ENABLE_FP8
template <> struct num_elems<__nv_fp8_e4m3> { static constexpr int value = 1; };
template <> struct num_elems<__nv_fp8x2_e4m3> { static constexpr int value = 2; };
#endif

// packed_as<T, num>::type re-vectorizes T to `num` lanes
// (e.g. packed_as<half, 2> -> half2 and packed_as<half2, 1> -> half).
template<typename T, int num> struct packed_as;
template<typename T> struct packed_as<T, 1> { using type = T; };
template<> struct packed_as<half, 2> { using type = half2; };
template<> struct packed_as<float, 2> { using type = float2; };
template<> struct packed_as<int8_t, 2> { using type = int16_t; };
template<> struct packed_as<int32_t, 2> { using type = int2; };
template<> struct packed_as<half2, 1> { using type = half; };
template<> struct packed_as<float2, 1> { using type = float; };
#ifdef ENABLE_BF16
template<> struct packed_as<__nv_bfloat16, 2> { using type = __nv_bfloat162; };
template<> struct packed_as<__nv_bfloat162, 1> { using type = __nv_bfloat16; };
#endif
#ifdef ENABLE_FP8
template<> struct packed_as<__nv_fp8_e4m3, 2> { using type = __nv_fp8x2_e4m3; };
template<> struct packed_as<__nv_fp8x2_e4m3, 1> { using type = __nv_fp8_e4m3; };
template<> struct packed_as<__nv_fp8_e5m2, 2> { using type = __nv_fp8x2_e5m2; };
template<> struct packed_as<__nv_fp8x2_e5m2, 1> { using type = __nv_fp8_e5m2; };
#endif

// Element-wise (vector-vector) and broadcast (vector-scalar) float2 math
// for device code.
inline __device__ float2 operator*(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
inline __device__ float2 operator+(float2 a, float2 b) { return make_float2(a.x + b.x, a.y + b.y); }
inline __device__ float2 operator-(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }

inline __device__ float2 operator*(float2 a, float b) { return make_float2(a.x * b, a.y * b); }
inline __device__ float2 operator+(float2 a, float b) { return make_float2(a.x + b, a.y + b); }
inline __device__ float2 operator-(float2 a, float b) { return make_float2(a.x - b, a.y - b); }

// clang-format on
|
||||
|
||||
// Compile-time map from C++ scalar type to the cudaDataType_t enumerator.
// The primary template is intentionally empty: using an unsupported type
// fails at compile time instead of silently picking a wrong enum.
template <typename T>
struct CudaDataType
{
};

template <>
struct CudaDataType<float>
{
    static constexpr cudaDataType_t value = cudaDataType::CUDA_R_32F;
};

template <>
struct CudaDataType<half>
{
    static constexpr cudaDataType_t value = cudaDataType::CUDA_R_16F;
};

#ifdef ENABLE_BF16
template <>
struct CudaDataType<__nv_bfloat16>
{
    static constexpr cudaDataType_t value = cudaDataType::CUDA_R_16BF;
};
#endif
|
||||
|
||||
// Compute capability of the current device encoded as major*10 + minor
// (e.g. 90 for SM90). Throws via check_cuda_error on CUDA API failure.
inline int getSMVersion()
{
    int device{-1};
    check_cuda_error(cudaGetDevice(&device));
    int sm_major = 0;
    int sm_minor = 0;
    check_cuda_error(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device));
    check_cuda_error(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device));
    return sm_major * 10 + sm_minor;
}
|
||||
|
||||
inline int getDevice()
|
||||
{
|
||||
int current_dev_id = 0;
|
||||
check_cuda_error(cudaGetDevice(¤t_dev_id));
|
||||
return current_dev_id;
|
||||
}
|
||||
|
||||
// Number of CUDA devices visible to this process.
inline int getDeviceCount()
{
    int count = 0;
    check_cuda_error(cudaGetDeviceCount(&count));
    return count;
}

/// @brief Identifies the memory type of the given pointer.
template <typename T>
cudaMemoryType getPtrCudaMemoryType(T* ptr)
{
    cudaPointerAttributes attributes{};
    check_cuda_error(cudaPointerGetAttributes(&attributes, ptr));
    return attributes.type;
}
|
||||
|
||||
/// Get the memory info
/// \param useUvm When true, report host (system) memory instead of device
///               memory, since UVM-backed KV cache resides in system RAM.
/// \return The free and total amount of memory in bytes
inline std::tuple<size_t, size_t> getDeviceMemoryInfo(bool const useUvm)
{
    if (useUvm)
    {
        size_t freeSysMem = 0;
        size_t totalSysMem = 0;
#ifndef _WIN32 // Linux
        // sysinfo reports RAM in units of mem_unit bytes.
        struct sysinfo info
        {
        };

        sysinfo(&info);
        totalSysMem = info.totalram * info.mem_unit;
        freeSysMem = info.freeram * info.mem_unit;
#else  // Windows
        MEMORYSTATUSEX memInfo;
        memInfo.dwLength = sizeof(memInfo);
        GlobalMemoryStatusEx(&memInfo);
        totalSysMem = memInfo.ullTotalPhys;
        freeSysMem = memInfo.ullAvailPhys;
#endif // WIN32

        TLLM_LOG_INFO("Using UVM based system memory for KV cache, total memory %0.2f GB, available memory %0.2f GB",
            ((double) totalSysMem / 1e9), ((double) freeSysMem / 1e9));
        return {freeSysMem, totalSysMem};
    }

    // Non-UVM path: report device-local memory as seen by the CUDA runtime.
    size_t free = 0;
    size_t total = 0;
    check_cuda_error(cudaMemGetInfo(&free, &total));
    TLLM_LOG_DEBUG("Using GPU memory for KV cache, total memory %0.2f GB, available memory %0.2f GB",
        ((double) total / 1e9), ((double) free / 1e9));
    return {free, total};
}
|
||||
|
||||
/// @brief Gets the memory allocation granularity for the current device.
///
/// @return size_t The size of the smallest difference in memory size supported by the current device.
inline size_t getAllocationGranularity()
{
    auto const currentDevice = getDevice();
    ::CUmemAllocationProp prop = {};

    // Pinned device-local allocation on the current device, no export handle —
    // the configuration whose granularity cuMemCreate() callers need.
    prop.type = ::CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = ::CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = currentDevice;
    prop.requestedHandleTypes = ::CU_MEM_HANDLE_TYPE_NONE;

    // Get the minimum granularity supported for allocation with cuMemCreate()
    size_t granularity = 0;
    TLLM_CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
    return granularity;
}
|
||||
|
||||
// Number of streaming multiprocessors on the current device.
inline int getMultiProcessorCount()
{
    int device_id = 0;
    int multi_processor_count = 0;
    check_cuda_error(cudaGetDevice(&device_id));
    check_cuda_error(cudaDeviceGetAttribute(&multi_processor_count, cudaDevAttrMultiProcessorCount, device_id));
    return multi_processor_count;
}

// Maximum opt-in shared memory per block (bytes) on the current device.
inline int getMaxSharedMemoryPerBlockOptin()
{
    int device_id = 0;
    int max_shared_memory_per_block = 0;
    check_cuda_error(cudaGetDevice(&device_id));
    check_cuda_error(
        cudaDeviceGetAttribute(&max_shared_memory_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id));
    return max_shared_memory_per_block;
}
|
||||
|
||||
// Integer ceiling division after converting both operands to size_t:
// the smallest q such that q * n >= a.
template <typename T1, typename T2>
inline size_t divUp(const T1& a, const T2& n)
{
    size_t const numerator = static_cast<size_t>(a);
    size_t const denominator = static_cast<size_t>(n);
    return (numerator + denominator - 1) / denominator;
}
|
||||
|
||||
// Round `a` up to the nearest multiple of `n`.
// Note: divUp returns size_t, so the product is computed in size_t and
// truncated back to int on return.
inline int roundUp(int a, int n)
{
    return divUp(a, n) * n;
}
|
||||
|
||||
// constexpr ceiling division for integral operands; usable in constant
// expressions (e.g. grid-size computations known at compile time).
template <typename T, typename U, typename = std::enable_if_t<std::is_integral<T>::value>,
    typename = std::enable_if_t<std::is_integral<U>::value>>
auto constexpr ceilDiv(T numerator, U denominator)
{
    auto const biased = numerator + denominator - 1;
    return biased / denominator;
}
|
||||
|
||||
template <typename T>
|
||||
void printAbsMean(T const* buf, uint64_t size, cudaStream_t stream, std::string name = "")
|
||||
{
|
||||
if (buf == nullptr)
|
||||
{
|
||||
TLLM_LOG_WARNING("%s is an nullptr, skip!", name.c_str());
|
||||
return;
|
||||
}
|
||||
cudaDeviceSynchronize();
|
||||
check_cuda_error(cudaGetLastError());
|
||||
T* h_tmp = new T[size];
|
||||
cudaMemcpyAsync(h_tmp, buf, sizeof(T) * size, cudaMemcpyDeviceToHost, stream);
|
||||
cudaDeviceSynchronize();
|
||||
check_cuda_error(cudaGetLastError());
|
||||
double sum = 0.0f;
|
||||
uint64_t zero_count = 0;
|
||||
float max_val = -1e10;
|
||||
bool find_inf = false;
|
||||
for (uint64_t i = 0; i < size; i++)
|
||||
{
|
||||
if (std::isinf((float) (h_tmp[i])))
|
||||
{
|
||||
find_inf = true;
|
||||
continue;
|
||||
}
|
||||
sum += abs((double) h_tmp[i]);
|
||||
if ((float) h_tmp[i] == 0.0f)
|
||||
{
|
||||
zero_count++;
|
||||
}
|
||||
max_val = max_val > abs(float(h_tmp[i])) ? max_val : abs(float(h_tmp[i]));
|
||||
}
|
||||
TLLM_LOG_INFO("%20s size: %u, abs mean: %f, abs sum: %f, abs max: %f, find inf: %s", name.c_str(), size, sum / size,
|
||||
sum, max_val, find_inf ? "true" : "false");
|
||||
delete[] h_tmp;
|
||||
cudaDeviceSynchronize();
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
// Copy `size` elements from a device buffer to the host and print them to
// `strm` as comma-separated floats (wrapped 10 per line when printing to
// stdout).
template <typename T>
void printToStream(T const* result, int const size, FILE* strm)
{
    bool const split_rows = (strm == stdout);
    if (result == nullptr)
    {
        TLLM_LOG_WARNING("It is an nullptr, skip! \n")
        return;
    }
    T* tmp = reinterpret_cast<T*>(malloc(sizeof(T) * size));
    check_cuda_error(cudaMemcpy(tmp, result, sizeof(T) * size, cudaMemcpyDeviceToHost));
    for (int i = 0; i < size; ++i)
    {
        fprintf(strm, "%f, ", static_cast<float>(tmp[i]));
        if (split_rows && ((i + 1) % 10) == 0)
            fprintf(strm, "\n");
    }
    // Ensure the output always ends with a newline.
    if (!split_rows || (size % 10) != 0)
    {
        fprintf(strm, "\n");
    }
    free(tmp);
}
|
||||
|
||||
// Convenience wrapper: print a device buffer to stdout.
template <typename T>
void printToScreen(T const* result, int const size)
{
    printToStream(result, size, stdout);
}

// Print an r x c matrix stored row-major with a row pitch of `stride`
// elements (stride >= c).
template <typename T>
void print2dToStream(T const* result, int const r, int const c, int const stride, FILE* strm)
{
    if (result == nullptr)
    {
        TLLM_LOG_WARNING("It is an nullptr, skip! \n")
        return;
    }
    for (int ri = 0; ri < r; ++ri)
    {
        T const* ptr = result + ri * stride;
        printToStream(ptr, c, strm);
    }
    fprintf(strm, "\n");
}

// 2-D variant of printToScreen.
template <typename T>
void print2dToScreen(T const* result, int const r, int const c, int const stride)
{
    print2dToStream(result, r, c, stride, stdout);
}

// Dump a 2-D device buffer to a text file; silently does nothing when the
// file cannot be opened.
template <typename T>
void print2dToFile(std::string fname, T const* result, int const r, int const c, int const stride)
{
    FILE* fp = fopen(fname.c_str(), "wt");
    if (fp != nullptr)
    {
        print2dToStream(result, r, c, stride, fp);
        fclose(fp);
    }
}
|
||||
|
||||
// Fixed-width (%7.3f) scalar printers used by printMatrix below; one
// overload of print_element_ per supported element type so the template can
// dispatch uniformly.
inline void print_float_(float x)
{
    printf("%7.3f ", x);
}

inline void print_element_(float x)
{
    print_float_(x);
}

inline void print_element_(half x)
{
    print_float_((float) x);
}

#ifdef ENABLE_BF16
inline void print_element_(__nv_bfloat16 x)
{
    print_float_((float) x);
}
#endif

#ifdef ENABLE_FP8
inline void print_element_(__nv_fp8_e4m3 x)
{
    print_float_((float) x);
}
#endif

// Integer overloads use the fixed-width PRI* macros to match the argument
// width exactly.
inline void print_element_(uint32_t ul)
{
    printf("%7" PRIu32, ul);
}

inline void print_element_(uint64_t ull)
{
    printf("%7" PRIu64, ull);
}

inline void print_element_(int32_t il)
{
    printf("%7" PRId32, il);
}

inline void print_element_(int64_t ill)
{
    printf("%7" PRId64, ill);
}
|
||||
|
||||
// Debug print of an m x k matrix (row pitch `stride` elements) with a
// column-index header row and a row-index column. When is_device_ptr is
// true the data is first staged to a host buffer via cudaMemcpy.
template <typename T>
inline void printMatrix(T const* ptr, int m, int k, int stride, bool is_device_ptr)
{
    T* tmp;
    if (is_device_ptr)
    {
        // k < stride ; stride = col-dimension.
        tmp = reinterpret_cast<T*>(malloc(m * stride * sizeof(T)));
        check_cuda_error(cudaMemcpy(tmp, ptr, sizeof(T) * m * stride, cudaMemcpyDeviceToHost));
        cudaDeviceSynchronize();
    }
    else
    {
        // Host pointer: print in place; const_cast is safe because tmp is
        // only ever read below.
        tmp = const_cast<T*>(ptr);
    }

    // ii == -1 is the header iteration that prints column indices.
    for (int ii = -1; ii < m; ++ii)
    {
        if (ii >= 0)
        {
            printf("%07d ", ii);
        }
        else
        {
            printf(" ");
        }

        for (int jj = 0; jj < k; jj += 1)
        {
            if (ii >= 0)
            {
                print_element_(tmp[ii * stride + jj]);
            }
            else
            {
                printf("%7d ", jj);
            }
        }
        printf("\n");
    }
    if (is_device_ptr)
    {
        free(tmp);
    }
}
|
||||
|
||||
// Explicit instantiation definitions for the printMatrix debug helper.
// NOTE(review): explicit instantiation definitions in a header normally risk
// multiple-definition issues when the header is included from several TUs —
// presumably this vendored header is only included where needed; confirm
// before wider reuse.
template void printMatrix(float const* ptr, int m, int k, int stride, bool is_device_ptr);
template void printMatrix(half const* ptr, int m, int k, int stride, bool is_device_ptr);
#ifdef ENABLE_BF16
template void printMatrix(__nv_bfloat16 const* ptr, int m, int k, int stride, bool is_device_ptr);
#endif
#ifdef ENABLE_FP8
template void printMatrix(__nv_fp8_e4m3 const* ptr, int m, int k, int stride, bool is_device_ptr);
#endif
template void printMatrix(uint32_t const* ptr, int m, int k, int stride, bool is_device_ptr);
template void printMatrix(uint64_t const* ptr, int m, int k, int stride, bool is_device_ptr);
template void printMatrix(int const* ptr, int m, int k, int stride, bool is_device_ptr);
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
|
||||
/*
 * Macros compliant with TensorRT coding conventions
 */
// Statement-like wrapper around check(): reports the stringified expression
// plus the call site on failure.
#define TLLM_CUDA_CHECK(stat)                                                                                          \
    do                                                                                                                 \
    {                                                                                                                  \
        tensorrt_llm::common::check((stat), #stat, __FILE__, __LINE__);                                                \
    } while (0)

// We use singleton memory pool and the order of destructors depends on the compiler implementation. We find that the
// cudaFree/cudaFreeHost is called after cudaruntime destruction on Windows. There will be an cudaErrorCudartUnloading
// error. However, it is safe to ignore this error because the cuda runtime is already exited, we are no more worried
// about the memory leaks.
#define TLLM_CUDA_CHECK_FREE_RESOURCE(stat)                                                                            \
    do                                                                                                                 \
    {                                                                                                                  \
        tensorrt_llm::common::checkEx((stat), {cudaSuccess, cudaErrorCudartUnloading}, #stat, __FILE__, __LINE__);     \
    } while (0)
|
||||
@@ -1,36 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
namespace tensorrt_llm::utils::customAllReduceUtils
|
||||
{
|
||||
|
||||
constexpr size_t NUM_POINTERS_PER_RANK = 7;

// WARNING: MUST BE KEPT IN SYNC with tensorrt_llm/plugin/plugin.py
// Workspace budget for the custom all-reduce: small world sizes (<= 2 ranks)
// get a larger buffer than bigger worlds.
inline size_t getMaxRequiredWorkspaceSize(int worldSize) noexcept
{
    constexpr size_t kSmallWorldBytes = 16 * 1000 * 1000;
    constexpr size_t kLargeWorldBytes = 8 * 1000 * 1000;
    return worldSize <= 2 ? kSmallWorldBytes : kLargeWorldBytes;
}
|
||||
|
||||
} // namespace tensorrt_llm::utils::customAllReduceUtils
|
||||
214
sgl-kernel/3rdparty/tensorrt_llm/common/envUtils.cpp
vendored
214
sgl-kernel/3rdparty/tensorrt_llm/common/envUtils.cpp
vendored
@@ -1,214 +0,0 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "envUtils.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/logger.h"
#include <cerrno>
#include <cstdlib>
#include <limits>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
{
|
||||
|
||||
// Read a positive 32-bit integer from the environment.
// Returns nullopt when the variable is unset, not a valid integer, out of
// range, or <= 0.
std::optional<int32_t> getIntEnv(char const* name)
{
    char const* const env = std::getenv(name);
    if (env == nullptr)
    {
        return std::nullopt;
    }
    // The previous std::stoi(env) threw std::invalid_argument/out_of_range on
    // malformed input; env vars are untrusted, so parse defensively instead.
    char* end = nullptr;
    errno = 0;
    long const parsed = std::strtol(env, &end, 10);
    if (end == env || errno == ERANGE || parsed <= 0 || parsed > std::numeric_limits<int32_t>::max())
    {
        return std::nullopt;
    }
    return {static_cast<int32_t>(parsed)};
}
|
||||
|
||||
// Returns true if the env variable exists and is set to "1"
static bool getBoolEnv(char const* name)
{
    char const* const value = std::getenv(name);
    if (value == nullptr)
    {
        return false;
    }
    // Exactly the single character '1' counts; "10", "true", etc. do not.
    return value[0] == '1' && value[1] == '\0';
}
|
||||
|
||||
// XQA kernels (optimized kernels for generation phase).
// Cached on first call: any positive TRTLLM_FORCE_XQA value forces XQA
// kernel selection (getIntEnv returns nullopt for unset/invalid/<=0).
bool forceXQAKernels()
{
    static bool const forceXQA = (getIntEnv("TRTLLM_FORCE_XQA").value_or(0) != 0);
    return forceXQA;
}
|
||||
|
||||
// Tri-state XQA JIT switch: nullopt when TRTLLM_ENABLE_XQA_JIT is unset,
// true when it is exactly "1", false for any other value. The environment is
// read once on the first call and the result cached for the process lifetime.
std::optional<bool> getEnvEnableXQAJIT()
{
    static std::optional<bool> const cached = []() -> std::optional<bool>
    {
        char const* raw = std::getenv("TRTLLM_ENABLE_XQA_JIT");
        if (raw == nullptr)
        {
            return std::nullopt;
        }
        return raw[0] == '1' && raw[1] == '\0';
    }();
    return cached;
}
|
||||
|
||||
// Tune the number of blocks per sequence for accuracy/performance purpose.
// True only when TRTLLM_ENABLE_MMHA_MULTI_BLOCK_DEBUG is exactly "1";
// read once and cached.
bool getEnvMmhaMultiblockDebug()
{
    static bool const forceMultiblock = []()
    {
        char const* raw = std::getenv("TRTLLM_ENABLE_MMHA_MULTI_BLOCK_DEBUG");
        return raw != nullptr && raw[0] == '1' && raw[1] == '\0';
    }();
    return forceMultiblock;
}
|
||||
|
||||
int getEnvMmhaBlocksPerSequence()
|
||||
{
|
||||
static bool init = false;
|
||||
static int mmhaBlocksPerSequence = 0;
|
||||
if (!init)
|
||||
{
|
||||
init = true;
|
||||
char const* mmhaBlocksPerSequenceEnv = std::getenv("TRTLLM_MMHA_BLOCKS_PER_SEQUENCE");
|
||||
if (mmhaBlocksPerSequenceEnv)
|
||||
{
|
||||
mmhaBlocksPerSequence = std::atoi(mmhaBlocksPerSequenceEnv);
|
||||
if (mmhaBlocksPerSequence <= 0)
|
||||
{
|
||||
TLLM_LOG_WARNING("Invalid value for TRTLLM_MMHA_BLOCKS_PER_SEQUENCE. Will use default values instead!");
|
||||
}
|
||||
}
|
||||
}
|
||||
return mmhaBlocksPerSequence;
|
||||
}
|
||||
|
||||
int getEnvMmhaKernelBlockSize()
|
||||
{
|
||||
static bool init = false;
|
||||
static int mmhaKernelBlockSize = 0;
|
||||
if (!init)
|
||||
{
|
||||
init = true;
|
||||
char const* mmhaKernelBlockSizeEnv = std::getenv("TRTLLM_MMHA_KERNEL_BLOCK_SIZE");
|
||||
if (mmhaKernelBlockSizeEnv)
|
||||
{
|
||||
mmhaKernelBlockSize = std::atoi(mmhaKernelBlockSizeEnv);
|
||||
if (mmhaKernelBlockSize <= 0)
|
||||
{
|
||||
TLLM_LOG_WARNING("Invalid value for TRTLLM_MMHA_KERNEL_BLOCK_SIZE. Will use default values instead!");
|
||||
}
|
||||
}
|
||||
}
|
||||
return mmhaKernelBlockSize;
|
||||
}
|
||||
|
||||
// Whether Programmatic Dependent Launch is enabled: requires SM90+ and
// TRTLLM_ENABLE_PDL=1. Cached on first call.
bool getEnvEnablePDL()
{
    static bool init = false;
    static bool enablePDL = false;
    if (!init)
    {
        init = true;
        // PDL only available when arch >= 90
        if (getSMVersion() >= 90)
        {
            // PDL will be enabled by setting the env variables `TRTLLM_ENABLE_PDL` to `1`
            enablePDL = getBoolEnv("TRTLLM_ENABLE_PDL");
        }
    }
    return enablePDL;
}
|
||||
|
||||
// True when TRTLLM_USE_UCX_KVCACHE=1: transfer KV cache over UCX.
bool getEnvUseUCXKvCache()
{
    static bool const useUCXKVCache = getBoolEnv("TRTLLM_USE_UCX_KVCACHE");
    return useUCXKVCache;
}
|
||||
|
||||
// Name of the network interface UCX should bind to (TRTLLM_UCX_INTERFACE),
// or an empty string when the variable is unset. Read once and cached.
std::string getEnvUCXInterface()
{
    static std::string const cachedInterface = []() -> std::string
    {
        char const* raw = std::getenv("TRTLLM_UCX_INTERFACE");
        return raw != nullptr ? std::string{raw} : std::string{};
    }();
    return cachedInterface;
}
|
||||
|
||||
// True when TRTLLM_DISAGG_LAYERWISE=1.
bool getEnvDisaggLayerwise()
{
    static bool const disaggLayerwise = getBoolEnv("TRTLLM_DISAGG_LAYERWISE");
    return disaggLayerwise;
}

// True when TRTLLM_PARALLEL_CACHE_SEND=1.
bool getEnvParallelCacheSend()
{
    static bool const parallelCacheSend = getBoolEnv("TRTLLM_PARALLEL_CACHE_SEND");
    return parallelCacheSend;
}

// True when TRTLLM_REQUEST_KV_CACHE_SERIAL=1.
bool getEnvRequestKVCacheSerial()
{
    static bool const requestKVCacheSerial = getBoolEnv("TRTLLM_REQUEST_KV_CACHE_SERIAL");
    return requestKVCacheSerial;
}

// True when TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP=1.
bool getEnvDisableKVCacheTransferOverlap()
{
    static bool const disableKVCacheTransferOverlap = getBoolEnv("TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP");
    return disableKVCacheTransferOverlap;
}

// True when TRTLLM_DISABLE_KVCACHE_RECEIVE_PARALLEL=1.
bool getEnvDisableReceiveKVCacheParallel()
{
    static bool const disableReceiveParallel = getBoolEnv("TRTLLM_DISABLE_KVCACHE_RECEIVE_PARALLEL");
    return disableReceiveParallel;
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
@@ -1,60 +0,0 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
namespace tensorrt_llm::common
{
// Useful when you want to inject some debug code controllable with env var.
// Parses the named variable as a positive int32; nullopt when unset or <= 0.
std::optional<int32_t> getIntEnv(char const* name);

// XQA kernels (optimized kernels for generation phase).
bool forceXQAKernels();

// Whether XQA JIT is enabled.
//
// Returns the value of TRTLLM_ENABLE_XQA_JIT env var. If such env var doesn't exist, std::nullopt is returned.
std::optional<bool> getEnvEnableXQAJIT();

// Tune the number of blocks per sequence for accuracy/performance purpose.
bool getEnvMmhaMultiblockDebug();

// Override for MMHA blocks per sequence; 0 means "use default".
int getEnvMmhaBlocksPerSequence();

// Override for the MMHA kernel block size; 0 means "use default".
int getEnvMmhaKernelBlockSize();

// Whether PDL is enabled.
bool getEnvEnablePDL();

// True when TRTLLM_USE_UCX_KVCACHE=1.
bool getEnvUseUCXKvCache();

// Network interface name for UCX (TRTLLM_UCX_INTERFACE); empty when unset.
std::string getEnvUCXInterface();

// True when TRTLLM_DISAGG_LAYERWISE=1.
bool getEnvDisaggLayerwise();

// True when TRTLLM_PARALLEL_CACHE_SEND=1.
bool getEnvParallelCacheSend();

// True when TRTLLM_REQUEST_KV_CACHE_SERIAL=1.
bool getEnvRequestKVCacheSerial();

// True when TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP=1.
bool getEnvDisableKVCacheTransferOverlap();

// True when TRTLLM_DISABLE_KVCACHE_RECEIVE_PARALLEL=1.
bool getEnvDisableReceiveKVCacheParallel();

} // namespace tensorrt_llm::common
|
||||
190
sgl-kernel/3rdparty/tensorrt_llm/common/logger.h
vendored
Normal file
190
sgl-kernel/3rdparty/tensorrt_llm/common/logger.h
vendored
Normal file
@@ -0,0 +1,190 @@
|
||||
/*
|
||||
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/stringUtils.h"
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
{
|
||||
|
||||
class Logger
|
||||
{
|
||||
|
||||
// On Windows, the file wingdi.h is included which has
|
||||
// #define ERROR 0
|
||||
// This breaks everywhere ERROR is used in the Level enum
|
||||
#ifdef _WIN32
|
||||
#undef ERROR
|
||||
#endif // _WIN32
|
||||
|
||||
public:
|
||||
enum Level
|
||||
{
|
||||
TRACE = 0,
|
||||
DEBUG = 10,
|
||||
INFO = 20,
|
||||
WARNING = 30,
|
||||
ERROR = 40
|
||||
};
|
||||
|
||||
static Logger* getLogger();
|
||||
|
||||
Logger(Logger const&) = delete;
|
||||
void operator=(Logger const&) = delete;
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
template <typename... Args>
|
||||
void log(Level level, char const* format, Args const&... args);
|
||||
|
||||
template <typename... Args>
|
||||
void log(Level level, int rank, char const* format, Args const&... args);
|
||||
#else
|
||||
template <typename... Args>
|
||||
void log(Level level, char const* format, Args const&... args) __attribute__((format(printf, 3, 0)));
|
||||
|
||||
template <typename... Args>
|
||||
void log(Level level, int rank, char const* format, Args const&... args) __attribute__((format(printf, 4, 0)));
|
||||
#endif
|
||||
|
||||
template <typename... Args>
|
||||
void log(Level level, std::string const& format, Args const&... args)
|
||||
{
|
||||
return log(level, format.c_str(), args...);
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
void log(Level const level, int const rank, std::string const& format, Args const&... args)
|
||||
{
|
||||
return log(level, rank, format.c_str(), args...);
|
||||
}
|
||||
|
||||
void log(std::exception const& ex, Level level = Level::ERROR);
|
||||
|
||||
Level getLevel() const
|
||||
{
|
||||
return level_;
|
||||
}
|
||||
|
||||
void setLevel(Level const level)
|
||||
{
|
||||
level_ = level;
|
||||
log(INFO, "Set logger level to %s", getLevelName(level));
|
||||
}
|
||||
|
||||
bool isEnabled(Level const level) const
|
||||
{
|
||||
return level_ <= level;
|
||||
}
|
||||
|
||||
private:
|
||||
static auto constexpr kPREFIX = "[TensorRT-LLM]";
|
||||
|
||||
#ifndef NDEBUG
|
||||
Level const DEFAULT_LOG_LEVEL = DEBUG;
|
||||
#else
|
||||
Level const DEFAULT_LOG_LEVEL = INFO;
|
||||
#endif
|
||||
Level level_ = DEFAULT_LOG_LEVEL;
|
||||
|
||||
Logger(); // NOLINT(modernize-use-equals-delete)
|
||||
|
||||
static inline char const* getLevelName(Level const level)
|
||||
{
|
||||
switch (level)
|
||||
{
|
||||
case TRACE: return "TRACE";
|
||||
case DEBUG: return "DEBUG";
|
||||
case INFO: return "INFO";
|
||||
case WARNING: return "WARNING";
|
||||
case ERROR: return "ERROR";
|
||||
}
|
||||
|
||||
TLLM_THROW("Unknown log level: %d", level);
|
||||
}
|
||||
|
||||
static inline std::string getPrefix(Level const level)
|
||||
{
|
||||
return fmtstr("%s[%s] ", kPREFIX, getLevelName(level));
|
||||
}
|
||||
|
||||
static inline std::string getPrefix(Level const level, int const rank)
|
||||
{
|
||||
return fmtstr("%s[%s][%d] ", kPREFIX, getLevelName(level), rank);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename... Args>
|
||||
void Logger::log(Logger::Level level, char const* format, Args const&... args)
|
||||
{
|
||||
if (isEnabled(level))
|
||||
{
|
||||
auto const fmt = getPrefix(level) + format;
|
||||
auto& out = level_ < WARNING ? std::cout : std::cerr;
|
||||
if constexpr (sizeof...(args) > 0)
|
||||
{
|
||||
out << fmtstr(fmt.c_str(), args...);
|
||||
}
|
||||
else
|
||||
{
|
||||
out << fmt;
|
||||
}
|
||||
out << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
void Logger::log(Logger::Level const level, int const rank, char const* format, Args const&... args)
|
||||
{
|
||||
if (isEnabled(level))
|
||||
{
|
||||
auto const fmt = getPrefix(level, rank) + format;
|
||||
auto& out = level_ < WARNING ? std::cout : std::cerr;
|
||||
if constexpr (sizeof...(args) > 0)
|
||||
{
|
||||
out << fmtstr(fmt.c_str(), args...);
|
||||
}
|
||||
else
|
||||
{
|
||||
out << fmt;
|
||||
}
|
||||
out << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
#define TLLM_LOG(level, ...) \
|
||||
do \
|
||||
{ \
|
||||
auto* const logger = tensorrt_llm::common::Logger::getLogger(); \
|
||||
if (logger->isEnabled(level)) \
|
||||
{ \
|
||||
logger->log(level, __VA_ARGS__); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define TLLM_LOG_TRACE(...) TLLM_LOG(tensorrt_llm::common::Logger::TRACE, __VA_ARGS__)
|
||||
#define TLLM_LOG_DEBUG(...) TLLM_LOG(tensorrt_llm::common::Logger::DEBUG, __VA_ARGS__)
|
||||
#define TLLM_LOG_INFO(...) TLLM_LOG(tensorrt_llm::common::Logger::INFO, __VA_ARGS__)
|
||||
#define TLLM_LOG_WARNING(...) TLLM_LOG(tensorrt_llm::common::Logger::WARNING, __VA_ARGS__)
|
||||
#define TLLM_LOG_ERROR(...) TLLM_LOG(tensorrt_llm::common::Logger::ERROR, __VA_ARGS__)
|
||||
#define TLLM_LOG_EXCEPTION(ex, ...) tensorrt_llm::common::Logger::getLogger()->log(ex, ##__VA_ARGS__)
|
||||
} // namespace tensorrt_llm::common
|
||||
@@ -1,37 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
namespace common
|
||||
{
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename T>
|
||||
inline __device__ __host__ T divUp(T m, T n)
|
||||
{
|
||||
return (m + n - 1) / n;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
@@ -1,906 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include "tensorrt_llm/common/memoryUtils.h"
|
||||
|
||||
#include <curand_kernel.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
namespace common
|
||||
{
|
||||
|
||||
template <typename T>
|
||||
void deviceMalloc(T** ptr, size_t size, bool is_random_initialize)
|
||||
{
|
||||
check_cuda_error(cudaMalloc((void**) (ptr), sizeof(T) * size));
|
||||
if (is_random_initialize)
|
||||
{
|
||||
cudaRandomUniform(*ptr, size);
|
||||
}
|
||||
}
|
||||
|
||||
template void deviceMalloc(float** ptr, size_t size, bool is_random_initialize);
|
||||
template void deviceMalloc(half** ptr, size_t size, bool is_random_initialize);
|
||||
#ifdef ENABLE_BF16
|
||||
template void deviceMalloc(__nv_bfloat16** ptr, size_t size, bool is_random_initialize);
|
||||
#endif
|
||||
template void deviceMalloc(uint16_t** ptr, size_t size, bool is_random_initialize);
|
||||
template void deviceMalloc(int** ptr, size_t size, bool is_random_initialize);
|
||||
template void deviceMalloc(bool** ptr, size_t size, bool is_random_initialize);
|
||||
template void deviceMalloc(char** ptr, size_t size, bool is_random_initialize);
|
||||
template void deviceMalloc(int8_t** ptr, size_t size, bool is_random_initialize);
|
||||
#ifdef ENABLE_FP8
|
||||
template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, bool is_random_initialize);
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
void deviceMemSetZero(T* ptr, size_t size)
|
||||
{
|
||||
check_cuda_error(cudaMemset(static_cast<void*>(ptr), 0, sizeof(T) * size));
|
||||
}
|
||||
|
||||
template void deviceMemSetZero(float* ptr, size_t size);
|
||||
template void deviceMemSetZero(half* ptr, size_t size);
|
||||
template void deviceMemSetZero(int* ptr, size_t size);
|
||||
template void deviceMemSetZero(uint32_t* ptr, size_t size);
|
||||
template void deviceMemSetZero(bool* ptr, size_t size);
|
||||
#ifdef ENABLE_FP8
|
||||
template void deviceMemSetZero(__nv_fp8_e4m3* ptr, size_t size);
|
||||
#endif
|
||||
#ifdef ENABLE_BF16
|
||||
template void deviceMemSetZero(__nv_bfloat16* ptr, size_t size);
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
void deviceFree(T*& ptr)
|
||||
{
|
||||
if (ptr != NULL)
|
||||
{
|
||||
check_cuda_error(cudaFree(ptr));
|
||||
ptr = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
template void deviceFree(float*& ptr);
|
||||
template void deviceFree(half*& ptr);
|
||||
#ifdef ENABLE_BF16
|
||||
template void deviceFree(__nv_bfloat16*& ptr);
|
||||
#endif
|
||||
template void deviceFree(unsigned short*& ptr);
|
||||
template void deviceFree(int*& ptr);
|
||||
template void deviceFree(bool*& ptr);
|
||||
template void deviceFree(char*& ptr);
|
||||
template void deviceFree(int8_t*& ptr);
|
||||
#ifdef ENABLE_FP8
|
||||
template void deviceFree(__nv_fp8_e4m3*& ptr);
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream)
|
||||
{
|
||||
T* arr = new T[size];
|
||||
std::fill(arr, arr + size, value);
|
||||
check_cuda_error(cudaMemcpyAsync(devptr, arr, sizeof(T) * size, cudaMemcpyHostToDevice, stream));
|
||||
delete[] arr;
|
||||
}
|
||||
|
||||
template void deviceFill(float* devptr, size_t size, float value, cudaStream_t stream);
|
||||
template void deviceFill(half* devptr, size_t size, half value, cudaStream_t stream);
|
||||
#ifdef ENABLE_BF16
|
||||
template void deviceFill(__nv_bfloat16* devptr, size_t size, __nv_bfloat16 value, cudaStream_t stream);
|
||||
#endif
|
||||
template void deviceFill(int* devptr, size_t size, int value, cudaStream_t stream);
|
||||
template void deviceFill(bool* devptr, size_t size, bool value, cudaStream_t stream);
|
||||
|
||||
template <typename T>
|
||||
void cudaD2Hcpy(T* tgt, T const* src, const size_t size)
|
||||
{
|
||||
check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
template void cudaD2Hcpy(float* tgt, float const* src, size_t size);
|
||||
template void cudaD2Hcpy(half* tgt, half const* src, size_t size);
|
||||
#ifdef ENABLE_BF16
|
||||
template void cudaD2Hcpy(__nv_bfloat16* tgt, __nv_bfloat16 const* src, size_t size);
|
||||
#endif
|
||||
template void cudaD2Hcpy(int* tgt, int const* src, size_t size);
|
||||
template void cudaD2Hcpy(bool* tgt, bool const* src, size_t size);
|
||||
#ifdef ENABLE_FP8
|
||||
template void cudaD2Hcpy(__nv_fp8_e4m3* tgt, __nv_fp8_e4m3 const* src, size_t size);
|
||||
#endif
|
||||
template void cudaD2Hcpy(unsigned long long* tgt, unsigned long long const* src, size_t size);
|
||||
template void cudaD2Hcpy(unsigned int* tgt, unsigned int const* src, size_t size);
|
||||
template void cudaD2Hcpy(int8_t* tgt, int8_t const* src, size_t size);
|
||||
|
||||
template <typename T>
|
||||
void cudaH2Dcpy(T* tgt, T const* src, const size_t size)
|
||||
{
|
||||
check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
template void cudaH2Dcpy(float* tgt, float const* src, size_t size);
|
||||
template void cudaH2Dcpy(half* tgt, half const* src, size_t size);
|
||||
#ifdef ENABLE_BF16
|
||||
template void cudaH2Dcpy(__nv_bfloat16* tgt, __nv_bfloat16 const* src, size_t size);
|
||||
#endif
|
||||
template void cudaH2Dcpy(int* tgt, int const* src, size_t size);
|
||||
template void cudaH2Dcpy(bool* tgt, bool const* src, size_t size);
|
||||
#ifdef ENABLE_FP8
|
||||
template void cudaH2Dcpy(__nv_fp8_e4m3* tgt, __nv_fp8_e4m3 const* src, size_t size);
|
||||
#endif
|
||||
template void cudaH2Dcpy(unsigned long long* tgt, unsigned long long const* src, size_t size);
|
||||
template void cudaH2Dcpy(unsigned int* tgt, unsigned int const* src, size_t size);
|
||||
template void cudaH2Dcpy(int8_t* tgt, int8_t const* src, size_t size);
|
||||
|
||||
template <typename T>
|
||||
void cudaD2Dcpy(T* tgt, T const* src, const size_t size, cudaStream_t stream)
|
||||
{
|
||||
check_cuda_error(cudaMemcpyAsync(tgt, src, sizeof(T) * size, cudaMemcpyDeviceToDevice, stream));
|
||||
}
|
||||
|
||||
template void cudaD2Dcpy(float* tgt, float const* src, size_t size, cudaStream_t stream);
|
||||
template void cudaD2Dcpy(half* tgt, half const* src, size_t size, cudaStream_t stream);
|
||||
#ifdef ENABLE_BF16
|
||||
template void cudaD2Dcpy(__nv_bfloat16* tgt, __nv_bfloat16 const* src, size_t size, cudaStream_t stream);
|
||||
#endif
|
||||
template void cudaD2Dcpy(int* tgt, int const* src, size_t size, cudaStream_t stream);
|
||||
template void cudaD2Dcpy(bool* tgt, bool const* src, size_t size, cudaStream_t stream);
|
||||
template void cudaD2Dcpy(int8_t* tgt, int8_t const* src, size_t size, cudaStream_t stream);
|
||||
#ifdef ENABLE_FP8
|
||||
template void cudaD2Dcpy(__nv_fp8_e4m3* tgt, __nv_fp8_e4m3 const* src, size_t size, cudaStream_t stream);
|
||||
#endif
|
||||
template void cudaD2Dcpy(unsigned long long* tgt, unsigned long long const* src, size_t size, cudaStream_t stream);
|
||||
|
||||
template <typename T_OUT, typename T_IN>
|
||||
__global__ void cudaCast(T_OUT* dst, T_IN* src, const size_t size)
|
||||
{
|
||||
for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
|
||||
{
|
||||
dst[tid] = (T_OUT) ((float) (src[tid]));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T_OUT, typename T_IN>
|
||||
void invokeCudaCast(T_OUT* dst, T_IN const* const src, const size_t size, cudaStream_t stream)
|
||||
{
|
||||
cudaCast<<<256, 256, 0, stream>>>(dst, src, size);
|
||||
}
|
||||
|
||||
template void invokeCudaCast(float* dst, half const* const src, const size_t size, cudaStream_t stream);
|
||||
#ifdef ENABLE_BF16
|
||||
template void invokeCudaCast(float* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaCast(__nv_bfloat16* dst, float const* const src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaCast(__nv_bfloat16* dst, half const* const src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaCast(half* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream);
|
||||
#endif
|
||||
#ifdef ENABLE_FP8
|
||||
template void invokeCudaCast(float* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaCast(
|
||||
__nv_bfloat16* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaCast(half* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaCast(__nv_fp8_e4m3* dst, float const* const src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaCast(
|
||||
__nv_fp8_e4m3* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaCast(__nv_fp8_e4m3* dst, half const* const src, const size_t size, cudaStream_t stream);
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
void cudaAutoCpy(T* tgt, T const* src, const size_t size, cudaStream_t stream)
|
||||
{
|
||||
if (stream != NULL)
|
||||
{
|
||||
check_cuda_error(cudaMemcpyAsync(tgt, src, sizeof(T) * size, cudaMemcpyDefault, stream));
|
||||
}
|
||||
else
|
||||
{
|
||||
check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDefault));
|
||||
}
|
||||
}
|
||||
|
||||
template void cudaAutoCpy(float* tgt, float const* src, size_t size, cudaStream_t stream);
|
||||
template void cudaAutoCpy(half* tgt, half const* src, size_t size, cudaStream_t stream);
|
||||
#ifdef ENABLE_BF16
|
||||
template void cudaAutoCpy(__nv_bfloat16* tgt, __nv_bfloat16 const* src, size_t size, cudaStream_t stream);
|
||||
#endif
|
||||
template void cudaAutoCpy(int* tgt, int const* src, size_t size, cudaStream_t stream);
|
||||
template void cudaAutoCpy(bool* tgt, bool const* src, size_t size, cudaStream_t stream);
|
||||
template void cudaAutoCpy(int8_t* tgt, int8_t const* src, size_t size, cudaStream_t stream);
|
||||
template void cudaAutoCpy(uint8_t* tgt, uint8_t const* src, size_t size, cudaStream_t stream);
|
||||
template void cudaAutoCpy(uint32_t* tgt, uint32_t const* src, size_t size, cudaStream_t stream);
|
||||
template void cudaAutoCpy(unsigned long long* tgt, unsigned long long const* src, size_t size, cudaStream_t stream);
|
||||
template void cudaAutoCpy(unsigned long* tgt, unsigned long const* src, size_t size, cudaStream_t stream);
|
||||
template void cudaAutoCpy(char* tgt, char const* src, size_t size, cudaStream_t stream);
|
||||
|
||||
template void cudaAutoCpy(float const** tgt, float const* const* src, size_t size, cudaStream_t stream);
|
||||
template void cudaAutoCpy(half const** tgt, half const* const* src, size_t size, cudaStream_t stream);
|
||||
#ifdef ENABLE_BF16
|
||||
template void cudaAutoCpy(__nv_bfloat16 const** tgt, __nv_bfloat16 const* const* src, size_t size, cudaStream_t stream);
|
||||
#endif
|
||||
template void cudaAutoCpy(int const** tgt, int const* const* src, size_t size, cudaStream_t stream);
|
||||
template void cudaAutoCpy(bool const** tgt, bool const* const* src, size_t size, cudaStream_t stream);
|
||||
template void cudaAutoCpy(int8_t const** tgt, int8_t const* const* src, size_t size, cudaStream_t stream);
|
||||
template void cudaAutoCpy(
|
||||
unsigned long long const** tgt, unsigned long long const* const* src, size_t size, cudaStream_t stream);
|
||||
|
||||
template <typename T>
|
||||
__global__ void cuda_random_uniform_kernel(T* buffer, const size_t size, int const seq_offset)
|
||||
{
|
||||
const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
curandState_t local_state;
|
||||
curand_init((unsigned long long int) 1337, idx + seq_offset, 0, &local_state);
|
||||
for (size_t index = idx; index < size; index += blockDim.x * gridDim.x)
|
||||
{
|
||||
buffer[index] = (T) (curand_uniform(&local_state) * 0.2f - 0.1f);
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
__global__ void cuda_random_uniform_kernel<int>(int* buffer, const size_t size, int const seq_offset)
|
||||
{
|
||||
const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
curandState_t local_state;
|
||||
curand_init((float) 1337.f, idx + seq_offset, 0, &local_state);
|
||||
for (size_t index = idx; index < size; index += blockDim.x * gridDim.x)
|
||||
{
|
||||
buffer[index] = curand(&local_state);
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
__global__ void cuda_random_uniform_kernel<bool>(bool* buffer, const size_t size, int const seq_offset)
|
||||
{
|
||||
const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
curandState_t local_state;
|
||||
curand_init((float) 1337.f, idx + seq_offset, 0, &local_state);
|
||||
for (size_t index = idx; index < size; index += blockDim.x * gridDim.x)
|
||||
{
|
||||
buffer[index] = (curand(&local_state) % 2 == 0);
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
__global__ void cuda_random_uniform_kernel<char>(char* buffer, const size_t size, int const seq_offset)
|
||||
{
|
||||
const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
curandState_t local_state;
|
||||
curand_init((float) 1337.f, idx + seq_offset, 0, &local_state);
|
||||
for (size_t index = idx; index < size; index += blockDim.x * gridDim.x)
|
||||
{
|
||||
buffer[index] = curand(&local_state) % 0xFF;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void cudaRandomUniform(T* buffer, const size_t size)
|
||||
{
|
||||
static int seq_offset = 0;
|
||||
cuda_random_uniform_kernel<T><<<256, 256>>>(buffer, size, seq_offset);
|
||||
seq_offset += 256 * 256;
|
||||
}
|
||||
|
||||
template void cudaRandomUniform(float* buffer, const size_t size);
|
||||
template void cudaRandomUniform(half* buffer, const size_t size);
|
||||
#ifdef ENABLE_BF16
|
||||
template void cudaRandomUniform(__nv_bfloat16* buffer, const size_t size);
|
||||
#endif
|
||||
template void cudaRandomUniform(int* buffer, const size_t size);
|
||||
template void cudaRandomUniform(bool* buffer, const size_t size);
|
||||
template void cudaRandomUniform(char* buffer, const size_t size);
|
||||
#ifdef ENABLE_FP8
|
||||
template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size);
|
||||
#endif
|
||||
|
||||
// loads data from binary file. If it succeeds, returns a non-empty vector. If loading fails or
|
||||
// the product of the elements in shape is 0, this function will return an empty vector.
|
||||
template <typename T>
|
||||
std::vector<T> loadWeightFromBinHelper(std::vector<size_t> shape, std::string filename)
|
||||
{
|
||||
if (shape.size() > 2)
|
||||
{
|
||||
printf("[ERROR] shape should have less than two dims \n");
|
||||
return std::vector<T>();
|
||||
}
|
||||
size_t dim0 = shape[0], dim1 = 1;
|
||||
if (shape.size() == 2)
|
||||
{
|
||||
dim1 = shape[1];
|
||||
}
|
||||
size_t size = dim0 * dim1;
|
||||
if (size == 0)
|
||||
{
|
||||
TLLM_LOG_WARNING("shape is zero, skip loading weight from file %s \n", filename.c_str());
|
||||
return std::vector<T>();
|
||||
}
|
||||
|
||||
std::vector<T> host_array(size);
|
||||
std::ifstream in(filename, std::ios::in | std::ios::binary);
|
||||
if (!in.is_open())
|
||||
{
|
||||
TLLM_LOG_WARNING("file %s cannot be opened, loading model fails! \n", filename.c_str());
|
||||
return std::vector<T>();
|
||||
}
|
||||
|
||||
size_t loaded_data_size = sizeof(T) * size;
|
||||
in.seekg(0, in.end);
|
||||
in.seekg(0, in.beg);
|
||||
|
||||
TLLM_LOG_DEBUG("Read " + std::to_string(loaded_data_size) + " bytes from " + filename);
|
||||
in.read((char*) host_array.data(), loaded_data_size);
|
||||
|
||||
size_t in_get_size = in.gcount();
|
||||
if (in_get_size != loaded_data_size)
|
||||
{
|
||||
TLLM_LOG_WARNING("file %s only has %ld, but request %ld, loading model fails! \n", filename.c_str(),
|
||||
in_get_size, loaded_data_size);
|
||||
return std::vector<T>();
|
||||
}
|
||||
in.close();
|
||||
// If we succeed, return an array with values.
|
||||
return host_array;
|
||||
}
|
||||
|
||||
template <typename T, typename T_IN>
|
||||
int loadWeightFromBinFunc(T* ptr, std::vector<size_t> shape, std::string filename)
|
||||
{
|
||||
std::vector<T_IN> host_array = loadWeightFromBinHelper<T_IN>(shape, filename);
|
||||
|
||||
if (host_array.empty())
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (std::is_same<T, T_IN>::value == true)
|
||||
{
|
||||
cudaH2Dcpy(ptr, (T*) host_array.data(), host_array.size());
|
||||
}
|
||||
else
|
||||
{
|
||||
T_IN* ptr_2 = nullptr;
|
||||
deviceMalloc(&ptr_2, host_array.size(), false);
|
||||
cudaH2Dcpy(ptr_2, host_array.data(), host_array.size());
|
||||
invokeCudaD2DcpyConvert(ptr, ptr_2, host_array.size());
|
||||
deviceFree(ptr_2);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
template int loadWeightFromBinFunc<float, float>(float* ptr, std::vector<size_t> shape, std::string filename);
|
||||
template int loadWeightFromBinFunc<half, float>(half* ptr, std::vector<size_t> shape, std::string filename);
|
||||
template int loadWeightFromBinFunc<float, half>(float* ptr, std::vector<size_t> shape, std::string filename);
|
||||
template int loadWeightFromBinFunc<half, half>(half* ptr, std::vector<size_t> shape, std::string filename);
|
||||
template int loadWeightFromBinFunc<int8_t, int8_t>(int8_t* ptr, std::vector<size_t> shape, std::string filename);
|
||||
#ifdef ENABLE_BF16
|
||||
template int loadWeightFromBinFunc<__nv_bfloat16, float>(
|
||||
__nv_bfloat16* ptr, std::vector<size_t> shape, std::string filename);
|
||||
template int loadWeightFromBinFunc<__nv_bfloat16, half>(
|
||||
__nv_bfloat16* ptr, std::vector<size_t> shape, std::string filename);
|
||||
template int loadWeightFromBinFunc<float, __nv_bfloat16>(float* ptr, std::vector<size_t> shape, std::string filename);
|
||||
template int loadWeightFromBinFunc<half, __nv_bfloat16>(half* ptr, std::vector<size_t> shape, std::string filename);
|
||||
template int loadWeightFromBinFunc<__nv_bfloat16, __nv_bfloat16>(
|
||||
__nv_bfloat16* ptr, std::vector<size_t> shape, std::string filename);
|
||||
#endif // ENABLE_BF16
|
||||
template int loadWeightFromBinFunc<int, int>(int* ptr, std::vector<size_t> shape, std::string filename);
|
||||
#ifdef ENABLE_FP8
|
||||
template int loadWeightFromBinFunc<__nv_fp8_e4m3, float>(
|
||||
__nv_fp8_e4m3* ptr, std::vector<size_t> shape, std::string filename);
|
||||
#endif // ENABLE_FP8
|
||||
|
||||
template <typename T>
|
||||
int loadWeightFromBin(T* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type)
|
||||
{
|
||||
switch (model_file_type)
|
||||
{
|
||||
case TRTLLMCudaDataType::FP32: loadWeightFromBinFunc<T, float>(ptr, shape, filename); break;
|
||||
case TRTLLMCudaDataType::FP16: loadWeightFromBinFunc<T, half>(ptr, shape, filename); break;
|
||||
case TRTLLMCudaDataType::INT8: loadWeightFromBinFunc<T, int8_t>(ptr, shape, filename); break;
|
||||
#ifdef ENABLE_BF16
|
||||
case TRTLLMCudaDataType::BF16: loadWeightFromBinFunc<T, __nv_bfloat16>(ptr, shape, filename); break;
|
||||
#endif
|
||||
#ifdef ENABLE_FP8
|
||||
case TRTLLMCudaDataType::FP8: loadWeightFromBinFunc<T, float>(ptr, shape, filename); break;
|
||||
#endif
|
||||
default: TLLM_LOG_ERROR("Does not support TRTLLMCudaDataType=%d", model_file_type); TLLM_CHECK(false);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <>
|
||||
int loadWeightFromBin(int* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type)
|
||||
{
|
||||
loadWeightFromBinFunc<int, int>(ptr, shape, filename);
|
||||
return 0;
|
||||
}
|
||||
|
||||
template int loadWeightFromBin(
|
||||
float* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
|
||||
template int loadWeightFromBin(
|
||||
half* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
|
||||
template int loadWeightFromBin(
|
||||
int8_t* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
|
||||
#ifdef ENABLE_BF16
|
||||
template int loadWeightFromBin(
|
||||
__nv_bfloat16* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
|
||||
#endif
|
||||
#ifdef ENABLE_FP8
|
||||
template int loadWeightFromBin(
|
||||
__nv_fp8_e4m3* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
|
||||
#endif
|
||||
template int loadWeightFromBin(
|
||||
int* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
|
||||
|
||||
template <typename T_IN, typename T_OUT>
|
||||
__global__ void cudaD2DcpyConvert(T_OUT* dst, const T_IN* src, const size_t size)
|
||||
{
|
||||
for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
|
||||
{
|
||||
dst[tid] = cuda_cast<T_OUT>(src[tid]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T_IN, typename T_OUT>
|
||||
void invokeCudaD2DcpyConvert(T_OUT* tgt, const T_IN* src, const size_t size, cudaStream_t stream)
|
||||
{
|
||||
cudaD2DcpyConvert<<<256, 256, 0, stream>>>(tgt, src, size);
|
||||
}
|
||||
|
||||
template void invokeCudaD2DcpyConvert(int8_t* tgt, float const* src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaD2DcpyConvert(float* tgt, int8_t const* src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaD2DcpyConvert(float* tgt, int const* src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaD2DcpyConvert(half* tgt, int const* src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaD2DcpyConvert(float* tgt, float const* src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaD2DcpyConvert(half* tgt, float const* src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaD2DcpyConvert(float* tgt, half const* src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaD2DcpyConvert(uint32_t* tgt, int const* src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaD2DcpyConvert(int* tgt, uint32_t const* src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaD2DcpyConvert(int* tgt, float const* src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaD2DcpyConvert(int* tgt, half const* src, const size_t size, cudaStream_t stream);
|
||||
|
||||
#ifdef ENABLE_BF16
|
||||
template void invokeCudaD2DcpyConvert(__nv_bfloat16* tgt, float const* src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaD2DcpyConvert(__nv_bfloat16* tgt, int const* src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaD2DcpyConvert(float* tgt, __nv_bfloat16 const* src, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaD2DcpyConvert(int* tgt, __nv_bfloat16 const* src, const size_t size, cudaStream_t stream);
|
||||
#endif // ENABLE_BF16
|
||||
|
||||
template <typename T_IN, typename T_OUT>
|
||||
__global__ void cudaD2DScaleCpyConvert(
|
||||
T_OUT* dst, const T_IN* src, float const* scale, bool invert_scale, const size_t size)
|
||||
{
|
||||
float const scale_value = invert_scale ? 1.0f / scale[0] : scale[0];
|
||||
for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
|
||||
{
|
||||
dst[tid] = cuda_cast<T_OUT>(cuda_cast<float>(src[tid]) * scale_value);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T_IN, typename T_OUT>
|
||||
void invokeCudaD2DScaleCpyConvert(
|
||||
T_OUT* tgt, const T_IN* src, float const* scale, bool invert_scale, const size_t size, cudaStream_t stream)
|
||||
{
|
||||
cudaD2DScaleCpyConvert<<<256, 256, 0, stream>>>(tgt, src, scale, invert_scale, size);
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
template void invokeCudaD2DScaleCpyConvert(float* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const float* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaD2DScaleCpyConvert(half* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const half* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
|
||||
#ifdef ENABLE_BF16
|
||||
template void invokeCudaD2DScaleCpyConvert(__nv_bfloat16* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
|
||||
template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const __nv_bfloat16* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
|
||||
#endif // ENABLE_BF16
|
||||
#ifdef ENABLE_FP8
|
||||
template void invokeCudaD2DScaleCpyConvert(float* tgt, const __nv_fp8_e4m3* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
|
||||
#endif // ENABLE_FP8
|
||||
// clang-format on
|
||||
|
||||
// Convenience wrappers for the two most common dtype-converting D2D copies;
// both delegate to the generic invokeCudaD2DcpyConvert.
void invokeCudaD2DcpyHalf2Float(float* dst, half* src, const size_t size, cudaStream_t stream)
{
    invokeCudaD2DcpyConvert(dst, src, size, stream);
}

void invokeCudaD2DcpyFloat2Half(half* dst, float* src, const size_t size, cudaStream_t stream)
{
    invokeCudaD2DcpyConvert(dst, src, size, stream);
}
|
||||
|
||||
template <typename T>
|
||||
void saveToBinary(T const* ptr, const size_t size, std::string filename)
|
||||
{
|
||||
|
||||
std::vector<T> h_ptr(size);
|
||||
cudaD2Hcpy(h_ptr.data(), ptr, size);
|
||||
std::vector<float> float_ptr(size);
|
||||
for (size_t i = 0; i < size; i++)
|
||||
{
|
||||
float_ptr[i] = (float) h_ptr[i];
|
||||
}
|
||||
|
||||
std::ofstream out(filename, std::ios::out | std::ios::binary);
|
||||
TLLM_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename);
|
||||
|
||||
out.write((char*) float_ptr.data(), size * sizeof(float));
|
||||
}
|
||||
|
||||
// Explicit instantiations of the generic float-dump overload.
template void saveToBinary(float const* ptr, const size_t size, std::string filename);
template void saveToBinary(half const* ptr, const size_t size, std::string filename);
#ifdef ENABLE_BF16
template void saveToBinary(__nv_bfloat16 const* ptr, const size_t size, std::string filename);
#endif // ENABLE_BF16
|
||||
|
||||
// Specialization for int buffers: dumps the raw int32 values directly,
// without the float conversion performed by the generic overload.
template <>
void saveToBinary(int const* ptr, const size_t size, std::string filename)
{
    std::vector<int> h_ptr(size);
    cudaD2Hcpy(h_ptr.data(), ptr, size);
    std::ofstream out(filename, std::ios::out | std::ios::binary);
    TLLM_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename);
    out.write((char*) h_ptr.data(), size * sizeof(int));
}
|
||||
|
||||
// In-place "fake quantization": each element is round-tripped through
// T_fake_type (via float) to emulate the precision loss of that type while
// keeping the buffer in its original dtype T_IN.
template <typename T_IN, typename T_fake_type>
__global__ void fakeCast(T_IN* input_ptr, const size_t size)
{
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x)
    {
        T_fake_type tmp_val = (T_fake_type) ((float) input_ptr[i]);
        input_ptr[i] = (T_IN) ((float) tmp_val);
    }
}

// Launch wrapper for fakeCast; one thread per element (256-thread blocks).
template <typename T_IN, typename T_fake_type>
void invokeFakeCast(T_IN* input_ptr, const size_t size, cudaStream_t stream)
{
    dim3 block(256);
    dim3 grid((size + 255) / 256);
    fakeCast<T_IN, T_fake_type><<<grid, block, 0, stream>>>(input_ptr, size);
}
|
||||
|
||||
#ifdef ENABLE_FP8
|
||||
__global__ void cudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size)
|
||||
{
|
||||
for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
|
||||
{
|
||||
dst[tid] = (float) (src[tid]);
|
||||
}
|
||||
}
|
||||
|
||||
void invokeCudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream)
|
||||
{
|
||||
cudaD2Dcpyfp82Float<<<256, 256, 0, stream>>>(dst, src, size);
|
||||
}
|
||||
|
||||
__global__ void cudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size)
|
||||
{
|
||||
for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
|
||||
{
|
||||
dst[tid] = (half) ((float) (src[tid]));
|
||||
}
|
||||
}
|
||||
|
||||
void invokeCudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream)
|
||||
{
|
||||
cudaD2Dcpyfp82Half<<<256, 256, 0, stream>>>(dst, src, size);
|
||||
}
|
||||
|
||||
__global__ void cudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size)
|
||||
{
|
||||
for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
|
||||
{
|
||||
dst[tid] = (__nv_fp8_e4m3) src[tid];
|
||||
}
|
||||
}
|
||||
|
||||
void invokeCudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size, cudaStream_t stream)
|
||||
{
|
||||
cudaD2DcpyFloat2fp8<<<256, 256, 0, stream>>>(dst, src, size);
|
||||
}
|
||||
|
||||
__global__ void cudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size)
|
||||
{
|
||||
for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
|
||||
{
|
||||
dst[tid] = (__nv_fp8_e4m3) src[tid];
|
||||
}
|
||||
}
|
||||
|
||||
void invokeCudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size, cudaStream_t stream)
|
||||
{
|
||||
cudaD2DcpyHalf2fp8<<<256, 256, 0, stream>>>(dst, src, size);
|
||||
}
|
||||
|
||||
__global__ void cudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size)
|
||||
{
|
||||
for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
|
||||
{
|
||||
dst[tid] = (__nv_fp8_e4m3) src[tid];
|
||||
}
|
||||
}
|
||||
|
||||
void invokeCudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size, cudaStream_t stream)
|
||||
{
|
||||
cudaD2DcpyBfloat2fp8<<<256, 256, 0, stream>>>(dst, src, size);
|
||||
}
|
||||
|
||||
#endif // ENABLE_FP8
|
||||
|
||||
// 2-D transpose: element (row, col) of src lands at (col, row) of dst.
// src is row-major [dim0, dim1]; dst is row-major [dim1, dim0].
template <typename T_OUT, typename T_IN>
__global__ void transpose(T_OUT* dst, T_IN* src, const size_t dim0, const size_t dim1)
{
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1; tid += blockDim.x * gridDim.x)
    {
        const size_t src_col_id = tid % dim1;
        const size_t src_row_id = tid / dim1;
        dst[src_col_id * dim0 + src_row_id] = (T_OUT) (src[tid]);
    }
}

// In-place transpose of a [dim0, dim1] buffer using `workspace` as a staging
// area (caller must provide at least dim0*dim1 elements).
// NOTE(review): both the copy and the kernel run on the default stream — no
// stream parameter is exposed; confirm callers expect default-stream ordering.
template <typename T>
void invokeInPlaceTranspose(T* data, T* workspace, const size_t dim0, const size_t dim1)
{
    // copy data to workspace, and then transpose from workspace to data
    cudaD2Dcpy(workspace, data, dim0 * dim1);
    transpose<<<256, 256>>>(data, workspace, dim0, dim1);
}

#ifdef ENABLE_FP8
template void invokeInPlaceTranspose(
    __nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const size_t dim0, const size_t dim1);
#endif // ENABLE_FP8
#ifdef ENABLE_BF16
template void invokeInPlaceTranspose(
    __nv_bfloat16* data, __nv_bfloat16* workspace, const size_t dim0, const size_t dim1);
#endif // ENABLE_BF16
template void invokeInPlaceTranspose(float* data, float* workspace, const size_t dim0, const size_t dim1);
|
||||
|
||||
// 4-D permutation [0, 1, 2, 3] -> [0, 2, 1, 3]: decomposes the flat index
// into the four source coordinates, then writes to the permuted offset.
template <typename T_OUT, typename T_IN>
__global__ void transpose0213(
    T_OUT* dst, T_IN* src, const size_t dim0, const size_t dim1, const size_t dim2, const size_t dim3)
{
    // src permutation: [0, 1, 2, 3]
    // dst permutation: [0, 2, 1, 3]
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2 * dim3;
         tid += blockDim.x * gridDim.x)
    {
        size_t tmp_idx = tid;
        const size_t dim_3_idx = tmp_idx % dim3;
        tmp_idx = (tmp_idx - dim_3_idx) / dim3;
        const size_t dim_2_idx = tmp_idx % dim2;
        tmp_idx = (tmp_idx - dim_2_idx) / dim2;
        const size_t dim_1_idx = tmp_idx % dim1;
        tmp_idx = (tmp_idx - dim_1_idx) / dim1;
        const size_t dim_0_idx = tmp_idx % dim0;
        dst[dim_0_idx * dim1 * dim2 * dim3 + dim_2_idx * dim1 * dim3 + dim_1_idx * dim3 + dim_3_idx] = src[tid];
    }
}

// In-place [0,1,2,3] -> [0,2,1,3] permutation via a staging workspace of at
// least dim0*dim1*dim2*dim3 elements. Runs on the default stream.
template <typename T>
void invokeInPlaceTranspose0213(
    T* data, T* workspace, const size_t dim0, const size_t dim1, const size_t dim2, const size_t dim3)
{
    // copy data to workspace, and then transpose from workspace to data
    // Note that this kernel is used for pre-processing and not very efficient.
    cudaD2Dcpy(workspace, data, dim0 * dim1 * dim2 * dim3);
    transpose0213<<<256, 256>>>(data, workspace, dim0, dim1, dim2, dim3);
}

#ifdef ENABLE_FP8
template void invokeInPlaceTranspose0213(__nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const size_t dim0,
    const size_t dim1, const size_t dim2, const size_t dim3);
#endif // ENABLE_FP8
#ifdef ENABLE_BF16
template void invokeInPlaceTranspose0213(__nv_bfloat16* data, __nv_bfloat16* workspace, const size_t dim0,
    const size_t dim1, const size_t dim2, const size_t dim3);
#endif // ENABLE_BF16
template void invokeInPlaceTranspose0213(
    float* data, float* workspace, const size_t dim0, const size_t dim1, const size_t dim2, const size_t dim3);
|
||||
|
||||
// 3-D permutation [0, 1, 2] -> [1, 0, 2]: swaps the two leading axes while
// keeping the innermost (contiguous) axis in place.
template <typename T_OUT, typename T_IN>
__global__ void transpose102(T_OUT* dst, T_IN* src, const size_t dim0, const size_t dim1, const size_t dim2)
{
    // src permutation: [0, 1, 2]
    // dst permutation: [1, 0, 2]
    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2; tid += blockDim.x * gridDim.x)
    {
        size_t tmp_idx = tid;
        const size_t dim_2_idx = tmp_idx % dim2;
        tmp_idx = (tmp_idx - dim_2_idx) / dim2;
        const size_t dim_1_idx = tmp_idx % dim1;
        tmp_idx = (tmp_idx - dim_1_idx) / dim1;
        const size_t dim_0_idx = tmp_idx % dim0;
        dst[dim_1_idx * dim0 * dim2 + dim_0_idx * dim2 + dim_2_idx] = src[tid];
    }
}

// In-place [0,1,2] -> [1,0,2] permutation via a staging workspace of at least
// dim0*dim1*dim2 elements. Runs on the default stream.
template <typename T>
void invokeInPlaceTranspose102(T* data, T* workspace, const size_t dim0, const size_t dim1, const size_t dim2)
{
    // copy data to workspace, and then transpose from workspace to data
    // Note that this kernel is used for pre-processing and not very efficient.
    cudaD2Dcpy(workspace, data, dim0 * dim1 * dim2);
    transpose102<<<256, 256>>>(data, workspace, dim0, dim1, dim2);
}

#ifdef ENABLE_FP8
template void invokeInPlaceTranspose102(
    __nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const size_t dim0, const size_t dim1, const size_t dim2);
#endif // ENABLE_FP8
#ifdef ENABLE_BF16
template void invokeInPlaceTranspose102(
    __nv_bfloat16* data, __nv_bfloat16* workspace, const size_t dim0, const size_t dim1, const size_t dim2);
#endif // ENABLE_BF16
template void invokeInPlaceTranspose102(
    float* data, float* workspace, const size_t dim0, const size_t dim1, const size_t dim2);
|
||||
|
||||
// In-place element-wise multiply by a host-side scalar (via float).
template <typename T>
void __global__ multiplyScale(T* tensor, float scale, const size_t size)
{
    for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x)
    {
        tensor[index] = (T) (((float) tensor[index]) * scale);
    }
}

// Launch wrapper: one thread per element in 256-thread blocks.
// NOTE(review): `grid` is an int computed from a size_t — could overflow for
// extremely large buffers (> ~2^39 elements); confirm sizes stay well below.
template <typename T>
void invokeMultiplyScale(T* tensor, float scale, const size_t size, cudaStream_t stream)
{
    int block = 256;
    int grid = (size + 255) / 256;
    multiplyScale<<<grid, block, 0, stream>>>(tensor, scale, size);
}

template void invokeMultiplyScale(float* tensor, float scale, const size_t size, cudaStream_t stream);
template void invokeMultiplyScale(half* tensor, float scale, const size_t size, cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeMultiplyScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream);
#endif
#ifdef ENABLE_FP8
template void invokeMultiplyScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream);
#endif
|
||||
|
||||
// In-place element-wise divide by a host-side scalar (via float). No check
// for scale == 0: IEEE division-by-zero semantics (inf/nan) apply.
template <typename T>
void __global__ divideScale(T* tensor, float scale, const size_t size)
{
    for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x)
    {
        tensor[index] = (T) (((float) tensor[index]) / scale);
    }
}

// Launch wrapper: one thread per element in 256-thread blocks.
template <typename T>
void invokeDivideScale(T* tensor, float scale, const size_t size, cudaStream_t stream)
{
    int block = 256;
    int grid = (size + 255) / 256;
    divideScale<<<grid, block, 0, stream>>>(tensor, scale, size);
}

template void invokeDivideScale(float* tensor, float scale, const size_t size, cudaStream_t stream);
template void invokeDivideScale(half* tensor, float scale, const size_t size, cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeDivideScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream);
#endif
#ifdef ENABLE_FP8
template void invokeDivideScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream);
#endif
|
||||
#ifdef ENABLE_BF16
// Explicit instantiations: emulate bf16 precision on float/bf16/half buffers.
template void invokeFakeCast<float, __nv_bfloat16>(float* input_ptr, const size_t size, cudaStream_t stream);
template void invokeFakeCast<__nv_bfloat16, __nv_bfloat16>(
    __nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream);
template void invokeFakeCast<half, __nv_bfloat16>(half* input_ptr, const size_t size, cudaStream_t stream);
#endif
template void invokeFakeCast<float, half>(float* input_ptr, const size_t size, cudaStream_t stream);
template void invokeFakeCast<float, float>(float* input_ptr, const size_t size, cudaStream_t stream);
#ifdef ENABLE_FP8
// Emulate fp8 (e4m3) precision on float/half/bf16 buffers.
template void invokeFakeCast<float, __nv_fp8_e4m3>(float* input_ptr, const size_t size, cudaStream_t stream);
template void invokeFakeCast<half, __nv_fp8_e4m3>(half* input_ptr, const size_t size, cudaStream_t stream);
template void invokeFakeCast<__nv_bfloat16, __nv_fp8_e4m3>(
    __nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream);
#endif
|
||||
|
||||
// Returns the element size in bytes for the given framework dtype tag.
// Only FP32/FP16 (and BF16 when enabled) are mapped; any other value makes
// unordered_map::at throw std::out_of_range.
size_t cuda_datatype_size(TRTLLMCudaDataType dt)
{
    static const std::unordered_map<TRTLLMCudaDataType, size_t> sizes{
        {TRTLLMCudaDataType::FP32, sizeof(float)}, {TRTLLMCudaDataType::FP16, sizeof(half)}
#ifdef ENABLE_BF16
        ,
        {TRTLLMCudaDataType::BF16, sizeof(__nv_bfloat16)}
#endif
    };

    return sizes.at(dt);
}
|
||||
|
||||
// Sets *d_within_range to false if any element of `buffer` lies outside
// [min, max]. The flag is only ever written false (never reset to true), so
// concurrent unsynchronized writes from multiple threads are benign: the
// caller must pre-initialize the flag to true before launching.
template <typename T>
__global__ void check_range(T const* buffer, size_t size, T min, T max, bool* d_within_range)
{
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x)
    {
        const T val = buffer[i];
        if (val < min || val > max)
        {
            *d_within_range = false;
        }
    }
}
|
||||
|
||||
// Host-side range check: resets the device flag to true, launches check_range
// on `stream`, then copies the flag back and returns it.
// NOTE(review): the final cudaD2Hcpy presumably issues a blocking copy on the
// legacy default stream; if `stream` was created with cudaStreamNonBlocking
// this would race with the kernel — confirm against the cudaD2Hcpy wrapper.
template <typename T>
bool invokeCheckRange(T const* buffer, const size_t size, T min, T max, bool* d_within_range, cudaStream_t stream)
{
    // memset to true (each byte set to 1) before the kernel clears it on violation
    cudaMemsetAsync(d_within_range, true, sizeof(bool), stream);

    dim3 block(256);
    dim3 grid((size + 255) / 256);
    check_range<T><<<grid, block, 0, stream>>>(buffer, size, min, max, d_within_range);

    bool result;
    cudaD2Hcpy(&result, d_within_range, 1);
    return result;
}

template bool invokeCheckRange<int>(
    int const* buffer, const size_t size, int min, int max, bool* d_within_range, cudaStream_t stream);
|
||||
|
||||
/*
 * Determine the total workspace size based on a vector containing multiple variable sizes.
 */
// Each size is rounded up to a multiple of ALIGN_BYTES; ALIGN_BYTES - 1 extra
// bytes of slack are reserved so that calcAlignedPointers() can round an
// unaligned base address up without overrunning the buffer.
size_t calcAlignedSize(std::vector<size_t> const& sizes, const size_t ALIGN_BYTES)
{
    // The mask arithmetic below requires a power-of-two alignment.
    assert((ALIGN_BYTES & (ALIGN_BYTES - 1)) == 0);
    const size_t alignMask = ~(ALIGN_BYTES - 1);

    size_t alignedTotal = 0;
    for (size_t const rawSize : sizes)
    {
        // Round this variable's size up to the next alignment boundary.
        alignedTotal += (rawSize + ALIGN_BYTES - 1) & alignMask;
    }

    // Slack so that a misaligned start address passed to calcAlignedPointers()
    // can still be rounded up safely.
    return alignedTotal + ALIGN_BYTES - 1;
}
|
||||
|
||||
/*
 * Given the address of the workspace and the vector containing multiple variable sizes, calculate the start addresses
 * of each variable.
 */
// Partitions the workspace at `p` into one aligned sub-buffer per entry of
// `sizes`, appending each start address to `outPtrs`. The base pointer is
// rounded up first, so `p` itself need not be aligned (covered by the slack
// that calcAlignedSize() reserves).
void calcAlignedPointers(
    std::vector<void*>& outPtrs, void const* p, std::vector<size_t> const& sizes, size_t ALIGN_BYTES)
{
    // The mask arithmetic below requires a power-of-two alignment.
    assert((ALIGN_BYTES & (ALIGN_BYTES - 1)) == 0);
    const size_t alignMask = ~(ALIGN_BYTES - 1);

    // Round the start address up to the next aligned boundary.
    char* cursor = reinterpret_cast<char*>((reinterpret_cast<size_t>(p) + ALIGN_BYTES - 1) & alignMask);

    outPtrs.reserve(sizes.size());
    for (size_t const rawSize : sizes)
    {
        outPtrs.push_back(cursor);
        // Advance by this variable's size rounded up to the alignment.
        cursor += (rawSize + ALIGN_BYTES - 1) & alignMask;
    }
}
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
@@ -1,292 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/cudaFp8Utils.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
|
||||
#include <cassert>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
namespace common
|
||||
{
|
||||
|
||||
// Allocates `size` elements of T on the device; when `is_random_initialize`
// is set the buffer is additionally initialized (randomly, per the flag's
// name — see the .cu implementation for the distribution).
template <typename T>
void deviceMalloc(T** ptr, size_t size, bool is_random_initialize = true);

// Zero-fills `size` device elements.
template <typename T>
void deviceMemSetZero(T* ptr, size_t size);

// Frees a device allocation and resets the caller's pointer.
template <typename T>

void deviceFree(T*& ptr);

// Fills `size` device elements with `value` on `stream`.
template <typename T>
void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream = 0);

// Element-count (not byte-count) copy helpers: D2H, H2D, D2D, and a variant
// that infers the direction automatically.
template <typename T>
void cudaD2Hcpy(T* tgt, T const* src, size_t const size);

template <typename T>
void cudaH2Dcpy(T* tgt, T const* src, size_t const size);

template <typename T>
void cudaD2Dcpy(T* tgt, T const* src, size_t const size, cudaStream_t stream = NULL);

template <typename T>
void cudaAutoCpy(T* tgt, T const* src, size_t const size, cudaStream_t stream = NULL);

// Fills a device buffer with random values.
template <typename T>
void cudaRandomUniform(T* buffer, size_t const size);

// Loads a tensor of the given shape from a binary file, converting from
// `model_file_type` on disk to T on the device.
template <typename T>
int loadWeightFromBin(T* ptr, std::vector<size_t> shape, std::string filename,
    TRTLLMCudaDataType model_file_type = TRTLLMCudaDataType::FP32);

// template<typename T>
// int loadWeightFromBinAndQuantizeForWeightOnly(int8_t* quantized_weight_ptr,
//                                               T* scale_ptr,
//                                               std::vector<size_t> shape,
//                                               std::string filename,
//                                               TRTLLMCudaDataType model_file_type = TRTLLMCudaDataType::FP32);

// D2D dtype-converting copy wrappers (implemented in the .cu file).
void invokeCudaD2DcpyHalf2Float(float* dst, half* src, size_t const size, cudaStream_t stream);
void invokeCudaD2DcpyFloat2Half(half* dst, float* src, size_t const size, cudaStream_t stream);
#ifdef ENABLE_FP8
void invokeCudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, size_t const size, cudaStream_t stream);
void invokeCudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, size_t const size, cudaStream_t stream);
void invokeCudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, size_t const size, cudaStream_t stream);
void invokeCudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, size_t const size, cudaStream_t stream);
void invokeCudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, size_t const size, cudaStream_t stream);
#endif // ENABLE_FP8
#ifdef ENABLE_BF16
void invokeCudaD2DcpyBfloat2Float(float* dst, __nv_bfloat16* src, size_t const size, cudaStream_t stream);
#endif // ENABLE_BF16

// Generic element-wise device cast from T_IN to T_OUT.
template <typename T_OUT, typename T_IN>
void invokeCudaCast(T_OUT* dst, T_IN const* const src, size_t const size, cudaStream_t stream);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////

// The following functions implement conversion of multi-dimensional indices to an index in a flat array.
// The shape of the Tensor dimensions is passed as one array (`dims`), the indices are given as individual arguments.
// For examples on how to use these functions, see their tests `test_memory_utils.cu`.
// All of these functions can be evaluated at compile time by recursive template expansion.

// Recursion step (final index): folds one more index into the running
// row-major offset `acc`: acc * dims[0] + index.
template <typename TDim, typename T, typename TIndex>
__inline__ __host__ __device__ std::enable_if_t<std::is_pointer<TDim>::value, T> constexpr flat_index(
    T const& acc, TDim dims, TIndex const& index)
{
    assert(index < dims[0]);
    return acc * dims[0] + index;
}

// Recursion step (more indices remain): folds the current index into `acc`
// and recurses with `dims` advanced to the next dimension.
template <typename TDim, typename T, typename TIndex, typename... TIndices>
__inline__ __host__ __device__ std::enable_if_t<std::is_pointer<TDim>::value, T> constexpr flat_index(
    T const& acc, TDim dims, TIndex const& index, TIndices... indices)
{
    assert(index < dims[0]);
    return flat_index(acc * dims[0] + index, dims + 1, indices...);
}

// Entry point for a single index: the flat offset is just the index itself.
// (`dims` is only used for the bounds assert.)
template <typename TDim, typename T>
__inline__ __host__ __device__ std::enable_if_t<std::is_pointer<TDim>::value, T> constexpr flat_index(
    [[maybe_unused]] TDim dims, T const& index)
{
    assert(index < dims[0]);
    return index;
}

// Entry point for multiple indices with a pointer-to-dims: seeds the
// accumulator with the leading index and recurses.
template <typename TDim, typename TIndex, typename... TIndices>
__inline__ __host__ __device__
    std::enable_if_t<std::is_pointer<TDim>::value, typename std::remove_pointer<TDim>::type> constexpr flat_index(
        TDim dims, TIndex const& index, TIndices... indices)
{
    assert(index < dims[0]);
    return flat_index(static_cast<typename std::remove_pointer<TDim>::type>(index), dims + 1, indices...);
}

// std::array overload; `skip` drops leading dimensions before indexing.
template <unsigned skip = 0, typename T, std::size_t N, typename TIndex, typename... TIndices>
__inline__ __host__ __device__ T constexpr flat_index(
    std::array<T, N> const& dims, TIndex const& index, TIndices... indices)
{
    static_assert(skip < N);
    static_assert(sizeof...(TIndices) < N - skip, "Number of indices exceeds number of dimensions");
    return flat_index(&dims[skip], index, indices...);
}

// std::array overload with an initial accumulator.
template <unsigned skip = 0, typename T, typename TIndex, std::size_t N, typename... TIndices>
__inline__ __host__ __device__ T constexpr flat_index(
    T const& acc, std::array<T, N> const& dims, TIndex const& index, TIndices... indices)
{
    static_assert(skip < N);
    static_assert(sizeof...(TIndices) < N - skip, "Number of indices exceeds number of dimensions");
    return flat_index(acc, &dims[skip], index, indices...);
}

// C-array overload; `skip` drops leading dimensions before indexing.
template <unsigned skip = 0, typename T, typename TIndex, std::size_t N, typename... TIndices>
__inline__ __host__ __device__ T constexpr flat_index(T const (&dims)[N], TIndex const& index, TIndices... indices)
{
    static_assert(skip < N);
    static_assert(sizeof...(TIndices) < N - skip, "Number of indices exceeds number of dimensions");
    return flat_index(static_cast<T const*>(dims) + skip, index, indices...);
}

// C-array overload with an initial accumulator.
template <unsigned skip = 0, typename T, typename TIndex, std::size_t N, typename... TIndices>
__inline__ __host__ __device__ T constexpr flat_index(
    T const& acc, T const (&dims)[N], TIndex const& index, TIndices... indices)
{
    static_assert(skip < N);
    static_assert(sizeof...(TIndices) < N - skip, "Number of indices exceeds number of dimensions");
    return flat_index(acc, static_cast<T const*>(dims) + skip, index, indices...);
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////

// These are simpler functions for multi-dimensional index conversion. Indices and dimensions are passed as individual
// arguments. These functions are more suitable for usage inside kernels than the corresponding flat_index functions
// which require arrays as arguments. Usage examples can be found in `test_memory_utils.cu`. The functions can be
// evaluated at compile time.

// Row-major offset for a 2-D tensor: index_0 * dim_1 + index_1.
template <typename T, typename TIndex>
__inline__ __host__ __device__ T constexpr flat_index2(TIndex const& index_0, TIndex const& index_1, T const& dim_1)
{
    assert(index_1 < dim_1);
    return index_0 * dim_1 + index_1;
}

// Row-major offset for a 3-D tensor (leading dimension size not needed).
template <typename T, typename TIndex>
__inline__ __host__ __device__ T constexpr flat_index3(
    TIndex const& index_0, TIndex const& index_1, TIndex const& index_2, T const& dim_1, T const& dim_2)
{
    assert(index_2 < dim_2);
    return flat_index2(index_0, index_1, dim_1) * dim_2 + index_2;
}

// Row-major offset for a 4-D tensor.
template <typename T, typename TIndex>
__inline__ __host__ __device__ T constexpr flat_index4(TIndex const& index_0, TIndex const& index_1,
    TIndex const& index_2, TIndex const& index_3, T const& dim_1, T const& dim_2, T const& dim_3)
{
    assert(index_3 < dim_3);
    return flat_index3(index_0, index_1, index_2, dim_1, dim_2) * dim_3 + index_3;
}

// Row-major offset for a 5-D tensor.
template <typename T, typename TIndex>
__inline__ __host__ __device__ T constexpr flat_index5(TIndex const& index_0, TIndex const& index_1,
    TIndex const& index_2, TIndex const& index_3, TIndex const& index_4, T const& dim_1, T const& dim_2, T const& dim_3,
    T const& dim_4)
{
    assert(index_4 < dim_4);
    return flat_index4(index_0, index_1, index_2, index_3, dim_1, dim_2, dim_3) * dim_4 + index_4;
}

// 3-D offset from explicit strides (in elements) instead of dimension sizes.
template <typename T, typename TIndex>
__inline__ __host__ __device__ T constexpr flat_index_strided3(
    TIndex const& index_0, TIndex const& index_1, TIndex const& index_2, T const& stride_1, T const& stride_2)
{
    assert(index_1 < stride_1 / stride_2);
    assert(index_2 < stride_2);
    return index_0 * stride_1 + index_1 * stride_2 + index_2;
}

// 4-D offset from explicit strides (in elements) instead of dimension sizes.
template <typename T, typename TIndex>
__inline__ __host__ __device__ T constexpr flat_index_strided4(TIndex const& index_0, TIndex const& index_1,
    TIndex const& index_2, TIndex const& index_3, T const& stride_1, T const& stride_2, T const& stride_3)
{
    assert(index_1 < stride_1 / stride_2);
    assert(index_2 < stride_2 / stride_3);
    assert(index_3 < stride_3);
    return index_0 * stride_1 + index_1 * stride_2 + index_2 * stride_3 + index_3;
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////

// In-place transpose / permutation helpers (workspace must be at least the
// size of `data`); implemented in the .cu file.
template <typename T>
void invokeInPlaceTranspose(T* data, T* workspace, size_t const dim0, size_t const dim1);

template <typename T>
void invokeInPlaceTranspose0213(
    T* data, T* workspace, size_t const dim0, size_t const dim1, size_t const dim2, size_t const dim3);

template <typename T>
void invokeInPlaceTranspose102(T* data, T* workspace, size_t const dim0, size_t const dim1, size_t const dim2);

// In-place scalar multiply / divide (computed in float).
template <typename T>
void invokeMultiplyScale(T* tensor, float scale, size_t const size, cudaStream_t stream);

template <typename T>
void invokeDivideScale(T* tensor, float scale, size_t const size, cudaStream_t stream);

// D2D copy with dtype conversion; the scaled variant also multiplies (or
// divides, when invert_scale is set) by a scale read from device memory.
template <typename T_IN, typename T_OUT>
void invokeCudaD2DcpyConvert(T_OUT* tgt, const T_IN* src, size_t const size, cudaStream_t stream = 0);

template <typename T_IN, typename T_OUT>
void invokeCudaD2DScaleCpyConvert(
    T_OUT* tgt, const T_IN* src, float const* scale, bool invert_scale, size_t const size, cudaStream_t stream = 0);
|
||||
|
||||
// Returns true iff `file_path` can be opened for reading.
inline bool checkIfFileExist(std::string const& file_path)
{
    // The stream's destructor closes the file; openability is the whole test.
    std::ifstream probe(file_path, std::ios::in | std::ios::binary);
    return probe.is_open();
}
|
||||
|
||||
// Dumps a device buffer to a raw binary file (float32 on disk for the
// generic overload; see the .cu implementation).
template <typename T>
void saveToBinary(T const* ptr, size_t const size, std::string filename);

// In-place precision-loss emulation: round-trips each element through
// T_fake_type.
template <typename T_IN, typename T_fake_type>
void invokeFakeCast(T_IN* input_ptr, size_t const size, cudaStream_t stream);

// Element size in bytes for a framework dtype tag.
size_t cuda_datatype_size(TRTLLMCudaDataType dt);

// Returns true iff all elements of the device buffer lie in [min, max];
// d_within_range is a caller-provided device-side scratch flag.
template <typename T>
bool invokeCheckRange(T const* buffer, size_t const size, T min, T max, bool* d_within_range, cudaStream_t stream);

constexpr size_t DEFAULT_ALIGN_BYTES = 256;

// Workspace partitioning helpers: total size with per-variable alignment, and
// the corresponding aligned start addresses inside a workspace.
size_t calcAlignedSize(std::vector<size_t> const& sizes, size_t ALIGN_BYTES = DEFAULT_ALIGN_BYTES);
void calcAlignedPointers(std::vector<void*>& outPtrs, void const* p, std::vector<size_t> const& sizes,
    size_t ALIGN_BYTES = DEFAULT_ALIGN_BYTES);
|
||||
|
||||
// Unpacks a list of type-erased workspace pointers into strongly typed output
// pointers, in order: unpacker(a, b, c) assigns the first three entries of
// `alignedPointers` to a, b, and c respectively.
struct AlignedPointersUnpacker
{
    template <typename... T>
    void operator()(T*&... outPtrs)
    {
        // Exactly one stored pointer is consumed per output argument.
        assert(sizeof...(T) == alignedPointers.size());
        size_t next = 0;
        ((outPtrs = static_cast<T*>(alignedPointers[next++])), ...);
    }

    std::vector<void*> alignedPointers;
};
|
||||
|
||||
// Convenience overload: computes the aligned sub-buffer addresses and returns
// them wrapped in an unpacker, e.g. calcAlignedPointers(p, sizes)(a, b, c).
AlignedPointersUnpacker inline calcAlignedPointers(
    void const* p, std::vector<size_t> const& sizes, size_t ALIGN_BYTES = DEFAULT_ALIGN_BYTES)
{
    AlignedPointersUnpacker unpacker{};
    calcAlignedPointers(unpacker.alignedPointers, p, sizes, ALIGN_BYTES);
    return unpacker;
}
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
588
sgl-kernel/3rdparty/tensorrt_llm/common/mpiUtils.cpp
vendored
588
sgl-kernel/3rdparty/tensorrt_llm/common/mpiUtils.cpp
vendored
@@ -1,588 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <numeric>
|
||||
#include <unordered_set>
|
||||
|
||||
#include "tensorrt_llm/common/mpiUtils.h"
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include "tensorrt_llm/runtime/common.h"
|
||||
#include "tensorrt_llm/runtime/iBuffer.h"
|
||||
|
||||
#include <csignal>
|
||||
#include <cstdlib>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <type_traits>
|
||||
#ifndef _WIN32
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
// We rely on SizeType32 being int32_t in some places with weak type checking,
|
||||
// i.e. we're passing void ptr to some function. To prevent mysterious errors
|
||||
// in the future, we trigger a compilation error here if SizeType32 isn't int32_t.
|
||||
static_assert(std::is_same<tensorrt_llm::runtime::SizeType32, std::int32_t>::value);
|
||||
|
||||
namespace tensorrt_llm::mpi
|
||||
{
|
||||
|
||||
// Maps the framework's MpiType tag to the corresponding MPI_Datatype.
// Types without a native MPI tag (half/bf16/fp8) are mapped to same-width
// unsigned integer types. Throws std::out_of_range for unmapped values, and
// TLLM_THROW when built without multi-device support.
MPI_Datatype getMpiDtype(MpiType dtype)
{
#if ENABLE_MULTI_DEVICE
    static std::unordered_map<MpiType, MPI_Datatype> const dtype_map{
        {MpiType::kBYTE, MPI_BYTE},
        {MpiType::kHALF, MPI_UINT16_T},
        {MpiType::kFLOAT, MPI_FLOAT},
        {MpiType::kDOUBLE, MPI_DOUBLE},
        {MpiType::kBOOL, MPI_C_BOOL},
        {MpiType::kINT8, MPI_INT8_T},
        {MpiType::kUINT8, MPI_UINT8_T},
        {MpiType::kINT32, MPI_INT32_T},
        {MpiType::kUINT32, MPI_UINT32_T},
        {MpiType::kINT64, MPI_INT64_T},
        {MpiType::kUINT64, MPI_UINT64_T},
        {MpiType::kFP8, MPI_UINT8_T},
        {MpiType::kBF16, MPI_UINT16_T},
        {MpiType::kCHAR, MPI_CHAR},
    };
    return dtype_map.at(dtype);
#else
    TLLM_THROW("Multi device support is disabled.");
#endif
}
|
||||
|
||||
MPI_Op getMpiOp(MpiOp op)
|
||||
{
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
static std::unordered_map<MpiOp, MPI_Op> const op_map{
|
||||
{MpiOp::NULLOP, MPI_OP_NULL},
|
||||
{MpiOp::MAX, MPI_MAX},
|
||||
{MpiOp::MIN, MPI_MIN},
|
||||
{MpiOp::SUM, MPI_SUM},
|
||||
{MpiOp::PROD, MPI_PROD},
|
||||
{MpiOp::LAND, MPI_LAND},
|
||||
{MpiOp::BAND, MPI_BAND},
|
||||
{MpiOp::LOR, MPI_LOR},
|
||||
{MpiOp::BOR, MPI_BOR},
|
||||
{MpiOp::LXOR, MPI_LXOR},
|
||||
{MpiOp::BXOR, MPI_BXOR},
|
||||
{MpiOp::MINLOC, MPI_MINLOC},
|
||||
{MpiOp::MAXLOC, MPI_MAXLOC},
|
||||
{MpiOp::REPLACE, MPI_REPLACE},
|
||||
};
|
||||
return op_map.at(op);
|
||||
#else
|
||||
TLLM_THROW("Multi device support is disabled.");
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
bool mpiInitialized = false;
|
||||
std::recursive_mutex mpiMutex;
|
||||
|
||||
MpiComm initLocalSession()
|
||||
{
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
MPI_Comm localComm = nullptr;
|
||||
MPI_Comm_split_type(COMM_SESSION, OMPI_COMM_TYPE_HOST, COMM_SESSION.getRank(), MPI_INFO_NULL, &localComm);
|
||||
MpiComm localSession{localComm, false};
|
||||
#else
|
||||
MpiComm localSession{COMM_SESSION, false};
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
return localSession;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
std::vector<int> getWorldRanks(MpiComm const& comm)
|
||||
{
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
MPI_Group group = nullptr;
|
||||
MPI_Group worldGroup = nullptr;
|
||||
|
||||
MPICHECK(MPI_Comm_group(MPI_COMM_WORLD, &worldGroup));
|
||||
MPICHECK(MPI_Comm_group(comm, &group));
|
||||
|
||||
int groupSize = 0;
|
||||
MPICHECK(MPI_Group_size(group, &groupSize));
|
||||
std::vector<int> ranks(groupSize);
|
||||
std::vector<int> worldRanks(groupSize);
|
||||
std::iota(ranks.begin(), ranks.end(), 0);
|
||||
|
||||
MPICHECK(MPI_Group_translate_ranks(group, groupSize, ranks.data(), worldGroup, worldRanks.data()));
|
||||
MPICHECK(MPI_Group_free(&group));
|
||||
MPICHECK(MPI_Group_free(&worldGroup));
|
||||
#else
|
||||
std::vector<int> worldRanks{0};
|
||||
#endif
|
||||
return worldRanks;
|
||||
}
|
||||
|
||||
// One-time MPI setup; safe to call repeatedly and from multiple threads
// (double-checked locking on mpiInitialized). If MPI is not yet initialized,
// initializes it with the requested thread support level, registers
// MPI_Finalize at exit, and installs fatal-signal handlers that abort the MPI
// job. With forwardAbortToParent set, a fatal signal in this worker also
// SIGKILLs the parent process (non-Windows only).
void initialize(MpiThreadSupport threadMode, bool forwardAbortToParent)
{
    // Fast path: skip the lock once initialization has been observed.
    if (mpiInitialized)
    {
        return;
    }
    std::lock_guard<std::recursive_mutex> guard(mpiMutex);
    if (mpiInitialized)
    {
        return;
    }
#if ENABLE_MULTI_DEVICE
    int alreadyInitialized = 0;
    TLLM_MPI_CHECK(MPI_Initialized(&alreadyInitialized));
    if (!alreadyInitialized)
    {
        TLLM_LOG_INFO("Initializing MPI with thread mode %d", threadMode);
        int provided = 0;
        auto const required = static_cast<int>(threadMode);
        MPICHECK(MPI_Init_thread(nullptr, nullptr, required, &provided));
        TLLM_CHECK_WITH_INFO(provided >= required, "MPI_Init_thread failed");
        std::atexit([]() { MPI_Finalize(); });

        /*
         * We only catch SIGABRT and SIGSEGV because most, of not all errors in the worker will cause one of these 2
         * signals. Signals like SIGINT and SIGTERM should be issued to the parent and should terminate MPI workers
         * correctly.
         */
        for (int sig : {SIGABRT, SIGSEGV})
        {
            __sighandler_t installedHandler = nullptr;
            if (forwardAbortToParent)
            {
                installedHandler = std::signal(sig,
                    [](int signal)
                    {
#ifndef _WIN32
                        // Take the parent down first so it does not hang
                        // waiting on a dead worker.
                        pid_t parentProcessId = getppid();
                        kill(parentProcessId, SIGKILL);
#endif
                        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
                    });
            }
            else
            {
                installedHandler = std::signal(sig, [](int signal) { MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); });
            }
            TLLM_CHECK_WITH_INFO(installedHandler != SIG_ERR, "Signal handler setup failed");
        }

        // ensure local MPI communicator is initialized
        MpiComm::localSession();
        TLLM_LOG_INFO("Initialized MPI");
    }
#endif // ENABLE_MULTI_DEVICE
    mpiInitialized = true;
}
|
||||
|
||||
void MpiComm::barrier() const
|
||||
{
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
MPICHECK(MPI_Barrier(mComm));
|
||||
#else
|
||||
TLLM_THROW("Multi device support is disabled.");
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
}
|
||||
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
template <typename TMpiFunc, typename TBase, typename... TArgs,
|
||||
typename = std::enable_if_t<std::is_same_v<void, std::remove_const_t<TBase>>>>
|
||||
size_t invokeChunked(TMpiFunc func, TBase* buffer, size_t size, MPI_Datatype dtype, TArgs... args)
|
||||
{
|
||||
constexpr auto maxP1 = static_cast<size_t>(std::numeric_limits<int>::max()) + 1;
|
||||
if (TLLM_LIKELY(size < maxP1))
|
||||
{
|
||||
MPICHECK(func(buffer, size, dtype, args...));
|
||||
return 1;
|
||||
}
|
||||
|
||||
constexpr size_t alignment = 256;
|
||||
int elementSize = 1;
|
||||
MPICHECK(MPI_Type_size(dtype, &elementSize));
|
||||
elementSize = std::min<int>(elementSize, alignment);
|
||||
|
||||
// We cap at max alignment-bytes chunks that can be sent at once.
|
||||
auto const step = maxP1 - (alignment / elementSize);
|
||||
|
||||
using TCast = std::conditional_t<std::is_const_v<TBase>, uint8_t const, uint8_t>;
|
||||
size_t count = 0;
|
||||
while (size != 0)
|
||||
{
|
||||
auto currentStep = static_cast<int>(std::min(size, step));
|
||||
MPICHECK(func(buffer, currentStep, dtype, args...));
|
||||
size -= currentStep;
|
||||
size_t diff = static_cast<size_t>(currentStep) * elementSize;
|
||||
buffer = static_cast<TCast*>(buffer) + diff;
|
||||
++count;
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
|
||||
std::shared_ptr<MpiRequest> MpiComm::bcastAsync(void* buffer, size_t size, MpiType dtype, int root) const
|
||||
{
|
||||
std::shared_ptr<MpiRequest> r = std::make_shared<MpiRequest>();
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
invokeChunked(MPI_Ibcast, buffer, size, getMpiDtype(dtype), root, mComm, &r->mRequest);
|
||||
#else
|
||||
TLLM_THROW("Multi device support is disabled.");
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
return r;
|
||||
}
|
||||
|
||||
std::shared_ptr<MpiRequest> MpiComm::bcastAsync(runtime::IBuffer& buf, int root) const
|
||||
{
|
||||
TLLM_CHECK(buf.getMemoryType() != runtime::MemoryType::kGPU);
|
||||
return bcastAsync(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, root);
|
||||
}
|
||||
|
||||
void MpiComm::bcast(void* buffer, size_t size, MpiType dtype, int root) const
|
||||
{
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
invokeChunked(MPI_Bcast, buffer, size, getMpiDtype(dtype), root, mComm);
|
||||
#else
|
||||
TLLM_THROW("Multi device support is disabled.");
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
}
|
||||
|
||||
void MpiComm::bcast(runtime::IBuffer& buf, int root) const
|
||||
{
|
||||
bcast(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, root);
|
||||
}
|
||||
|
||||
std::shared_ptr<MpiRequest> MpiComm::sendAsync(void const* buffer, size_t size, MpiType dtype, int dest, int tag) const
|
||||
{
|
||||
TLLM_LOG_DEBUG("start MPI_Isend with size %d", size);
|
||||
std::shared_ptr<MpiRequest> r = std::make_shared<MpiRequest>();
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
invokeChunked(MPI_Isend, buffer, size, getMpiDtype(dtype), dest, tag, mComm, &r->mRequest);
|
||||
#else
|
||||
TLLM_THROW("Multi device support is disabled.");
|
||||
#endif
|
||||
TLLM_LOG_DEBUG("end MPI_Isend with size %d", size);
|
||||
return r;
|
||||
}
|
||||
|
||||
std::shared_ptr<MpiRequest> MpiComm::sendAsync(runtime::IBuffer const& buf, int dest, int tag) const
|
||||
{
|
||||
return sendAsync(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, dest, tag);
|
||||
}
|
||||
|
||||
void MpiComm::send(void const* buffer, size_t size, MpiType dtype, int dest, int tag) const
|
||||
{
|
||||
TLLM_LOG_DEBUG("start MPI_Send with size %d", size);
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
invokeChunked(MPI_Send, buffer, size, getMpiDtype(dtype), dest, tag, mComm);
|
||||
#else
|
||||
TLLM_THROW("Multi device support is disabled.");
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
TLLM_LOG_DEBUG("end MPI_Send with size %d", size);
|
||||
}
|
||||
|
||||
void MpiComm::send(runtime::IBuffer const& buf, int dest, int tag) const
|
||||
{
|
||||
send(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, dest, tag);
|
||||
}
|
||||
|
||||
MPI_Status MpiComm::recv(void* buffer, size_t size, MpiType dtype, int source, int tag) const
|
||||
{
|
||||
TLLM_LOG_DEBUG("start MPI_Recv with size %d", size);
|
||||
MPI_Status status{};
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
invokeChunked(MPI_Recv, buffer, size, getMpiDtype(dtype), source, tag, mComm, &status);
|
||||
#else
|
||||
TLLM_THROW("Multi device support is disabled.");
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
TLLM_LOG_DEBUG("end MPI_Recv with size %d", size);
|
||||
return status;
|
||||
}
|
||||
|
||||
MPI_Status MpiComm::recv(runtime::IBuffer& buf, int source, int tag) const
|
||||
{
|
||||
return recv(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, source, tag);
|
||||
}
|
||||
|
||||
MpiComm MpiComm::split(int color, int key) const
|
||||
{
|
||||
MPI_Comm splitComm = nullptr;
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
MPICHECK(MPI_Comm_split(mComm, color, key, &splitComm));
|
||||
#else
|
||||
TLLM_THROW("Multi device support is disabled.");
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
return MpiComm{splitComm, true};
|
||||
}
|
||||
|
||||
void MpiComm::allreduce(void const* sendbuf, void* recvbuf, int count, MpiType dtype, MpiOp op) const
|
||||
{
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
MPICHECK(MPI_Allreduce(sendbuf, recvbuf, count, getMpiDtype(dtype), getMpiOp(op), mComm));
|
||||
#else
|
||||
TLLM_THROW("Multi device support is disabled.");
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
}
|
||||
|
||||
void MpiComm::allgather(void const* sendbuf, void* recvbuf, int count, MpiType dtype) const
|
||||
{
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
MPICHECK(MPI_Allgather(sendbuf, count, getMpiDtype(dtype), recvbuf, count, getMpiDtype(dtype), mComm));
|
||||
#else
|
||||
TLLM_THROW("Multi device support is disabled.");
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
}
|
||||
|
||||
void MpiComm::allgatherv(void const* sendbuf, int sendcount, MpiType sendtype, void* recvbuf,
|
||||
std::vector<int> const& recvcounts, std::vector<int> const& displs, MpiType recvtype) const
|
||||
{
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
MPICHECK(MPI_Allgatherv(sendbuf, sendcount, getMpiDtype(sendtype), recvbuf, recvcounts.data(), displs.data(),
|
||||
getMpiDtype(recvtype), mComm));
|
||||
|
||||
#else
|
||||
TLLM_THROW("Multi device support is disabled.");
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
}
|
||||
|
||||
void MpiComm::mprobe(int source, int tag, MPI_Message* msg, MPI_Status* status) const
|
||||
{
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
MPICHECK(MPI_Mprobe(source, tag, mComm, msg, status));
|
||||
#else
|
||||
TLLM_THROW("Multi device support is disabled.");
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
}
|
||||
|
||||
bool MpiComm::improbe(int source, int tag, MPI_Message* msg, MPI_Status* status) const
|
||||
{
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
int flag{0};
|
||||
MPICHECK(MPI_Improbe(source, tag, mComm, &flag, msg, status));
|
||||
return flag != 0;
|
||||
#else
|
||||
TLLM_THROW("Multi device support is disabled.");
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
bool MpiComm::iprobe(int source, int tag, MPI_Status* status) const
|
||||
{
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
int flag{0};
|
||||
MPICHECK(MPI_Iprobe(source, tag, mComm, &flag, status));
|
||||
return flag != 0;
|
||||
#else
|
||||
TLLM_THROW("Multi device support is disabled.");
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
void MpiComm::recvPoll(int source, int tag, int periodMs) const
|
||||
{
|
||||
MPI_Status status;
|
||||
while (!iprobe(source, tag, &status))
|
||||
{
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(periodMs));
|
||||
}
|
||||
}
|
||||
|
||||
int MpiComm::getRank() const
|
||||
{
|
||||
int rank = 0;
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
MPICHECK(MPI_Comm_rank(mComm, &rank));
|
||||
#endif
|
||||
return rank;
|
||||
}
|
||||
|
||||
int MpiComm::getSize() const
|
||||
{
|
||||
int world_size = 1;
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
MPICHECK(MPI_Comm_size(mComm, &world_size));
|
||||
#endif
|
||||
return world_size;
|
||||
}
|
||||
|
||||
MpiComm const& MpiComm::world()
|
||||
{
|
||||
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
|
||||
static MpiComm commWorld{MPI_COMM_WORLD, false};
|
||||
initialize();
|
||||
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
|
||||
return commWorld;
|
||||
}
|
||||
|
||||
MpiComm& MpiComm::mutableSession()
|
||||
{
|
||||
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
|
||||
static MpiComm commSession{MPI_COMM_WORLD, false};
|
||||
initialize();
|
||||
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
|
||||
return commSession;
|
||||
}
|
||||
|
||||
MpiComm& MpiComm::mutableLocalSession()
|
||||
{
|
||||
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
|
||||
static MpiComm localSession = initLocalSession();
|
||||
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
|
||||
return localSession;
|
||||
}
|
||||
|
||||
// Rebuild the node-local session communicator as the intersection of the
// current session ranks and the current local-session ranks, preserving
// session rank order. No-op when multi-device support is compiled out.
void MpiComm::refreshLocalSession()
{
#if ENABLE_MULTI_DEVICE
    static std::mutex mutex;
    std::unique_lock lock(mutex);
    auto sessionRanks = getWorldRanks(MpiComm::session());
    auto localRanks = getWorldRanks(MpiComm::localSession());

    // Add to intersectionRanks in order of sessionRanks.
    std::vector<int> intersectionRanks;
    std::unordered_set<int> localRankSet(localRanks.begin(), localRanks.end());
    for (auto rank : sessionRanks)
    {
        if (localRankSet.count(rank) != 0)
        {
            intersectionRanks.push_back(rank);
        }
    }

    MPI_Group worldGroup = nullptr;
    MPICHECK(MPI_Comm_group(MPI_COMM_WORLD, &worldGroup));
    MPI_Group localGroup = nullptr;
    MPICHECK(MPI_Group_incl(worldGroup, intersectionRanks.size(), intersectionRanks.data(), &localGroup));
    MPI_Comm localComm = nullptr;
    // NOTE(review): intersectionRanks.front() assumes the calling rank is in
    // both sessions, so the intersection is never empty — confirm.
    MPICHECK(MPI_Comm_create_group(MPI_COMM_WORLD, localGroup, intersectionRanks.front(), &localComm));
    // Hand ownership of the old communicator to the destructor, then adopt
    // the freshly created one without ownership.
    MpiComm::mutableLocalSession().mFreeComm = true;
    MpiComm::mutableLocalSession() = MpiComm{localComm, false};
    TLLM_LOG_INFO("Refreshed the MPI local session");
#endif // ENABLE_MULTI_DEVICE
}
|
||||
|
||||
MpiComm::MpiComm(MPI_Comm g, bool freeComm)
|
||||
: mComm{g}
|
||||
, mFreeComm{freeComm}
|
||||
{
|
||||
TLLM_CHECK(mComm != MPI_COMM_NULL);
|
||||
}
|
||||
|
||||
MpiComm::~MpiComm() noexcept
|
||||
{
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
if (mFreeComm && mComm)
|
||||
{
|
||||
if (MPI_Comm_free(&mComm) != MPI_SUCCESS)
|
||||
{
|
||||
TLLM_LOG_ERROR("MPI_Comm_free failed");
|
||||
}
|
||||
}
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
}
|
||||
|
||||
MpiComm::MpiComm(MpiComm&& comm) noexcept
|
||||
: mComm{comm.mComm}
|
||||
, mFreeComm{comm.mFreeComm}
|
||||
{
|
||||
comm.mFreeComm = false;
|
||||
}
|
||||
|
||||
MpiComm& MpiComm::operator=(MpiComm&& comm) noexcept
|
||||
{
|
||||
this->~MpiComm();
|
||||
mComm = comm.mComm;
|
||||
mFreeComm = comm.mFreeComm;
|
||||
comm.mFreeComm = false;
|
||||
return *this;
|
||||
}
|
||||
|
||||
MpiWaitThread::MpiWaitThread(std::string name, std::function<void()> funcWait, std::function<void()> funcSetup)
|
||||
: mName{name.c_str()}
|
||||
, mFuncWait{funcWait}
|
||||
, mFuncSetup{funcSetup}
|
||||
{
|
||||
TLLM_LOG_TRACE("%s: %s start", mName.c_str(), __PRETTY_FUNCTION__);
|
||||
mThread = std::make_unique<std::thread>(&MpiWaitThread::sideThread, this);
|
||||
TLLM_LOG_TRACE("%s: %s stop", mName.c_str(), __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
MpiWaitThread::~MpiWaitThread()
|
||||
{
|
||||
TLLM_LOG_TRACE("%s: %s start", mName.c_str(), __PRETTY_FUNCTION__);
|
||||
waitStop();
|
||||
mShouldExit.store(true);
|
||||
notifyStart();
|
||||
mThread->join();
|
||||
mThread.reset(nullptr);
|
||||
TLLM_LOG_TRACE("%s: %s stop", mName.c_str(), __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
void MpiWaitThread::sideThread()
|
||||
{
|
||||
if (mFuncSetup)
|
||||
{
|
||||
mFuncSetup();
|
||||
}
|
||||
while (!mShouldExit.load())
|
||||
{
|
||||
notifyStop();
|
||||
waitStart();
|
||||
mFuncWait();
|
||||
}
|
||||
}
|
||||
|
||||
void MpiWaitThread::waitStart()
|
||||
{
|
||||
TLLM_LOG_TRACE("%s: %s start", mName.c_str(), __PRETTY_FUNCTION__);
|
||||
std::unique_lock<std::mutex> lock(mMutex);
|
||||
mCondVar.wait(lock, [this] { return mRunning; });
|
||||
TLLM_LOG_TRACE("%s: %s stop", mName.c_str(), __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
void MpiWaitThread::waitStop()
|
||||
{
|
||||
TLLM_LOG_TRACE("%s: %s start", mName.c_str(), __PRETTY_FUNCTION__);
|
||||
std::unique_lock<std::mutex> lock(mMutex);
|
||||
mCondVar.wait(lock, [this] { return !mRunning; });
|
||||
TLLM_LOG_TRACE("%s: %s stop", mName.c_str(), __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
void MpiWaitThread::notifyStart()
|
||||
{
|
||||
TLLM_LOG_TRACE("%s: %s start", mName.c_str(), __PRETTY_FUNCTION__);
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
mRunning = true;
|
||||
mCondVar.notify_one();
|
||||
TLLM_LOG_TRACE("%s: %s stop", mName.c_str(), __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
void MpiWaitThread::notifyStop()
|
||||
{
|
||||
TLLM_LOG_TRACE("%s: %s start", mName.c_str(), __PRETTY_FUNCTION__);
|
||||
std::lock_guard<std::mutex> lock(mMutex);
|
||||
mRunning = false;
|
||||
mCondVar.notify_one();
|
||||
TLLM_LOG_TRACE("%s: %s stop", mName.c_str(), __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::mpi
|
||||
@@ -1,46 +0,0 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <nvtx3/nvtx3.hpp>
|
||||
|
||||
#include <array>
|
||||
|
||||
namespace tensorrt_llm::common::nvtx
|
||||
{
|
||||
inline nvtx3::color nextColor()
|
||||
{
|
||||
#ifndef NVTX_DISABLE
|
||||
constexpr std::array kColors{nvtx3::color{0xff00ff00}, nvtx3::color{0xff0000ff}, nvtx3::color{0xffffff00},
|
||||
nvtx3::color{0xffff00ff}, nvtx3::color{0xff00ffff}, nvtx3::color{0xffff0000}, nvtx3::color{0xffffffff}};
|
||||
constexpr auto numColors = kColors.size();
|
||||
|
||||
static thread_local std::size_t colorId = 0;
|
||||
auto const color = kColors[colorId];
|
||||
colorId = colorId + 1 >= numColors ? 0 : colorId + 1;
|
||||
return color;
|
||||
#else
|
||||
return nvtx3::color{0};
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common::nvtx
|
||||
|
||||
#define NVTX3_SCOPED_RANGE_WITH_NAME(range, name) \
|
||||
::nvtx3::scoped_range range(::tensorrt_llm::common::nvtx::nextColor(), name)
|
||||
#define NVTX3_SCOPED_RANGE(range) NVTX3_SCOPED_RANGE_WITH_NAME(range##_range, #range)
|
||||
323
sgl-kernel/3rdparty/tensorrt_llm/common/opUtils.cpp
vendored
323
sgl-kernel/3rdparty/tensorrt_llm/common/opUtils.cpp
vendored
@@ -1,323 +0,0 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "tensorrt_llm/common/opUtils.h"
|
||||
#include "tensorrt_llm/common/mpiUtils.h"
|
||||
|
||||
#include "cuda.h"
|
||||
#include <cstdint>
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_fp8.h>
|
||||
#include <functional>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define FN_NAME __FUNCTION__
|
||||
#else
|
||||
#define FN_NAME __func__
|
||||
#endif
|
||||
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
|
||||
std::unordered_map<nvinfer1::DataType, ncclDataType_t>* getDtypeMap()
|
||||
{
|
||||
static std::unordered_map<nvinfer1::DataType, ncclDataType_t> dtypeMap = {{nvinfer1::DataType::kFLOAT, ncclFloat32},
|
||||
{nvinfer1::DataType::kHALF, ncclFloat16}, {nvinfer1::DataType::kBF16, ncclBfloat16}};
|
||||
return &dtypeMap;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
// Get NCCL unique ID for a group of ranks.
|
||||
ncclUniqueId getUniqueId(std::set<int> const& group) noexcept
|
||||
{
|
||||
auto const rank = COMM_SESSION.getRank();
|
||||
TLLM_LOG_TRACE("%s start for rank %d", __PRETTY_FUNCTION__, rank);
|
||||
ncclUniqueId id;
|
||||
if (rank == *group.begin())
|
||||
{
|
||||
NCCLCHECK(ncclGetUniqueId(&id));
|
||||
for (auto it = std::next(std::begin(group), 1); it != group.end(); ++it)
|
||||
{
|
||||
COMM_SESSION.sendValue(id, *it, 0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
COMM_SESSION.recvValue(id, *group.begin(), 0);
|
||||
}
|
||||
TLLM_LOG_TRACE("%s stop for rank %d", __PRETTY_FUNCTION__, rank);
|
||||
return id;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
std::shared_ptr<ncclComm_t> getComm(std::set<int> const& group)
|
||||
{
|
||||
auto const rank = COMM_SESSION.getRank();
|
||||
TLLM_LOG_TRACE("%s start for rank %d", __PRETTY_FUNCTION__, rank);
|
||||
static std::map<std::set<int>, std::shared_ptr<ncclComm_t>> commMap;
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
std::ostringstream oss;
|
||||
int index = 0;
|
||||
for (auto const& rank : group)
|
||||
{
|
||||
if (index != 0)
|
||||
{
|
||||
oss << ",";
|
||||
}
|
||||
oss << rank;
|
||||
index++;
|
||||
}
|
||||
auto groupStr = oss.str();
|
||||
auto it = commMap.find(group);
|
||||
if (it != commMap.end())
|
||||
{
|
||||
auto ncclComm = it->second;
|
||||
TLLM_LOG_TRACE("NCCL comm for group(%s) is cached for rank %d", groupStr.c_str(), rank);
|
||||
return ncclComm;
|
||||
}
|
||||
|
||||
TLLM_LOG_TRACE("Init NCCL comm for group(%s) for rank %d", groupStr.c_str(), rank);
|
||||
ncclUniqueId id = getUniqueId(group);
|
||||
int groupRank = 0;
|
||||
for (auto const& currentRank : group)
|
||||
{
|
||||
if (rank == currentRank)
|
||||
break;
|
||||
++groupRank;
|
||||
}
|
||||
TLLM_CHECK(groupRank < group.size());
|
||||
std::shared_ptr<ncclComm_t> ncclComm(new ncclComm_t,
|
||||
[](ncclComm_t* comm)
|
||||
{
|
||||
ncclCommDestroy(*comm);
|
||||
delete comm;
|
||||
});
|
||||
NCCLCHECK(ncclCommInitRank(ncclComm.get(), group.size(), id, groupRank));
|
||||
commMap[group] = ncclComm;
|
||||
TLLM_LOG_TRACE("%s stop for rank %d", __PRETTY_FUNCTION__, rank);
|
||||
return ncclComm;
|
||||
}
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
|
||||
// Opaque pointer to the session communicator for callers that cannot include
// the MPI headers; nullptr when multi-device support is compiled out.
void const* tensorrt_llm::common::getCommSessionHandle()
{
#if ENABLE_MULTI_DEVICE
    return &COMM_SESSION;
#else
    return nullptr;
#endif // ENABLE_MULTI_DEVICE
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
// Get current cuda context, a default context will be created if there is no context.
|
||||
inline CUcontext getCurrentCudaCtx()
|
||||
{
|
||||
CUcontext ctx{};
|
||||
CUresult err = cuCtxGetCurrent(&ctx);
|
||||
if (err == CUDA_ERROR_NOT_INITIALIZED || ctx == nullptr)
|
||||
{
|
||||
TLLM_CUDA_CHECK(cudaFree(nullptr));
|
||||
err = cuCtxGetCurrent(&ctx);
|
||||
}
|
||||
TLLM_CHECK(err == CUDA_SUCCESS);
|
||||
return ctx;
|
||||
}
|
||||
|
||||
// Helper to create per-cuda-context singleton managed by std::shared_ptr.
|
||||
// Unlike conventional singletons, singleton created with this will be released
|
||||
// when not needed, instead of on process exit.
|
||||
// Objects of this class shall always be declared static / global, and shall never own CUDA
|
||||
// resources.
|
||||
template <typename T>
|
||||
class PerCudaCtxSingletonCreator
|
||||
{
|
||||
public:
|
||||
using CreatorFunc = std::function<std::unique_ptr<T>()>;
|
||||
using DeleterFunc = std::function<void(T*)>;
|
||||
|
||||
// creator returning std::unique_ptr is by design.
|
||||
// It forces separation of memory for T and memory for control blocks.
|
||||
// So when T is released, but we still have observer weak_ptr in mObservers, the T mem block can be released.
|
||||
// creator itself must not own CUDA resources. Only the object it creates can.
|
||||
PerCudaCtxSingletonCreator(CreatorFunc creator, DeleterFunc deleter)
|
||||
: mCreator{std::move(creator)}
|
||||
, mDeleter{std::move(deleter)}
|
||||
{
|
||||
}
|
||||
|
||||
std::shared_ptr<T> operator()()
|
||||
{
|
||||
std::lock_guard<std::mutex> lk{mMutex};
|
||||
CUcontext ctx{getCurrentCudaCtx()};
|
||||
std::shared_ptr<T> result = mObservers[ctx].lock();
|
||||
if (result == nullptr)
|
||||
{
|
||||
// Create the resource and register with an observer.
|
||||
result = std::shared_ptr<T>{mCreator().release(),
|
||||
[this, ctx](T* obj)
|
||||
{
|
||||
if (obj == nullptr)
|
||||
{
|
||||
return;
|
||||
}
|
||||
mDeleter(obj);
|
||||
|
||||
// Clears observer to avoid growth of mObservers, in case users creates/destroys cuda contexts
|
||||
// frequently.
|
||||
std::shared_ptr<T> observedObjHolder; // Delay destroy to avoid dead lock.
|
||||
std::lock_guard<std::mutex> lk{mMutex};
|
||||
// Must check observer again because another thread may created new instance for this ctx just
|
||||
// before we lock mMutex. We can't infer that the observer is stale from the fact that obj is
|
||||
// destroyed, because shared_ptr ref-count checking and observer removing are not in one atomic
|
||||
// operation, and the observer may be changed to observe another instance.
|
||||
observedObjHolder = mObservers.at(ctx).lock();
|
||||
if (observedObjHolder == nullptr)
|
||||
{
|
||||
mObservers.erase(ctx);
|
||||
}
|
||||
}};
|
||||
mObservers.at(ctx) = result;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private:
|
||||
CreatorFunc mCreator;
|
||||
DeleterFunc mDeleter;
|
||||
mutable std::mutex mMutex;
|
||||
// CUDA resources are per-context.
|
||||
std::unordered_map<CUcontext, std::weak_ptr<T>> mObservers;
|
||||
};
|
||||
|
||||
// Same pattern as PerCudaCtxSingletonCreator, but keyed by the calling thread
// instead of the current CUDA context.
template <typename T>
class PerThreadSingletonCreator
{
public:
    using CreatorFunc = std::function<std::unique_ptr<T>()>;
    using DeleterFunc = std::function<void(T*)>;

    // creator returning std::unique_ptr is by design.
    // It forces separation of memory for T and memory for control blocks.
    // So when T is released, but we still have observer weak_ptr in mObservers, the T mem block can be released.
    // creator itself must not own CUDA resources. Only the object it creates can.
    PerThreadSingletonCreator(CreatorFunc creator, DeleterFunc deleter)
        : mCreator{std::move(creator)}
        , mDeleter{std::move(deleter)}
    {
    }

    // Return the singleton for the calling thread, creating it if no live
    // instance is currently observed for this thread id.
    std::shared_ptr<T> operator()()
    {
        std::lock_guard<std::mutex> guard{mMutex};

        std::thread::id const tid = std::this_thread::get_id();
        std::shared_ptr<T> instance = mObservers[tid].lock();

        if (instance == nullptr)
        {
            // Create the resource and register with an observer.
            instance = std::shared_ptr<T>{mCreator().release(),
                [this, tid](T* obj)
                {
                    if (obj == nullptr)
                    {
                        return;
                    }
                    mDeleter(obj);

                    // Clears observer to avoid growth of mObservers, in case users creates/destroys cuda contexts
                    // frequently.
                    std::shared_ptr<T> observedObjHolder; // Delay destroy to avoid dead lock.
                    std::lock_guard<std::mutex> guard{mMutex};
                    // Must check observer again because another thread may created new instance for this ctx just
                    // before we lock mMutex. We can't infer that the observer is stale from the fact that obj is
                    // destroyed, because shared_ptr ref-count checking and observer removing are not in one atomic
                    // operation, and the observer may be changed to observe another instance.
                    observedObjHolder = mObservers.at(tid).lock();
                    if (observedObjHolder == nullptr)
                    {
                        mObservers.erase(tid);
                    }
                }};
            mObservers.at(tid) = instance;
        }
        return instance;
    }

private:
    CreatorFunc mCreator;
    DeleterFunc mDeleter;
    mutable std::mutex mMutex;
    // CUDA resources are per-thread.
    std::unordered_map<std::thread::id, std::weak_ptr<T>> mObservers;
};
|
||||
|
||||
} // namespace
|
||||
|
||||
std::shared_ptr<cublasHandle_t> getCublasHandle()
|
||||
{
|
||||
static PerThreadSingletonCreator<cublasHandle_t> creator(
|
||||
[]() -> auto
|
||||
{
|
||||
auto handle = std::unique_ptr<cublasHandle_t>(new cublasHandle_t);
|
||||
TLLM_CUDA_CHECK(cublasCreate(handle.get()));
|
||||
return handle;
|
||||
},
|
||||
[](cublasHandle_t* handle)
|
||||
{
|
||||
TLLM_CUDA_CHECK(cublasDestroy(*handle));
|
||||
delete handle;
|
||||
});
|
||||
return creator();
|
||||
}
|
||||
|
||||
std::shared_ptr<cublasLtHandle_t> getCublasLtHandle()
|
||||
{
|
||||
static PerThreadSingletonCreator<cublasLtHandle_t> creator(
|
||||
[]() -> auto
|
||||
{
|
||||
auto handle = std::unique_ptr<cublasLtHandle_t>(new cublasLtHandle_t);
|
||||
TLLM_CUDA_CHECK(cublasLtCreate(handle.get()));
|
||||
return handle;
|
||||
},
|
||||
[](cublasLtHandle_t* handle)
|
||||
{
|
||||
TLLM_CUDA_CHECK(cublasLtDestroy(*handle));
|
||||
delete handle;
|
||||
});
|
||||
return creator();
|
||||
}
|
||||
|
||||
std::shared_ptr<tensorrt_llm::common::CublasMMWrapper> getCublasMMWrapper(std::shared_ptr<cublasHandle_t> cublasHandle,
|
||||
std::shared_ptr<cublasLtHandle_t> cublasltHandle, cudaStream_t stream, void* workspace)
|
||||
{
|
||||
static PerThreadSingletonCreator<tensorrt_llm::common::CublasMMWrapper> creator(
|
||||
[cublasHandle, cublasltHandle, stream, workspace]() -> auto
|
||||
{
|
||||
auto wrapper = std::unique_ptr<tensorrt_llm::common::CublasMMWrapper>(
|
||||
new tensorrt_llm::common::CublasMMWrapper(cublasHandle, cublasltHandle, stream, workspace));
|
||||
return wrapper;
|
||||
},
|
||||
[](tensorrt_llm::common::CublasMMWrapper* wrapper) { delete wrapper; });
|
||||
return creator();
|
||||
}
|
||||
215
sgl-kernel/3rdparty/tensorrt_llm/common/opUtils.h
vendored
215
sgl-kernel/3rdparty/tensorrt_llm/common/opUtils.h
vendored
@@ -1,215 +0,0 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/cublasMMWrapper.h"
|
||||
#include "tensorrt_llm/common/workspace.h"
|
||||
|
||||
#include <NvInferRuntime.h>
|
||||
#include <cublasLt.h>
|
||||
#include <cublas_v2.h>
|
||||
#include <cuda_runtime.h>
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
#include <nccl.h>
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
|
||||
#include <cstring>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <nvml.h>
|
||||
#include <optional>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
{
|
||||
|
||||
// Write values into buffer
|
||||
template <typename T>
|
||||
void write(char*& buffer, T const& val)
|
||||
{
|
||||
std::memcpy(buffer, &val, sizeof(T));
|
||||
buffer += sizeof(T);
|
||||
}
|
||||
|
||||
// Read values from buffer
|
||||
template <typename T>
|
||||
void read(char const*& buffer, T& val)
|
||||
{
|
||||
std::memcpy(&val, buffer, sizeof(T));
|
||||
buffer += sizeof(T);
|
||||
}
|
||||
|
||||
// Like std::unique_ptr, but does not prevent generation of default copy constructor when used as class members.
|
||||
// The copy constructor produces nullptr. So the plugin default copy constructor will not really copy this, and
|
||||
// your clone() implementation is responsible for initializing such data members.
|
||||
// With this we can simplify clone() implementation when there are many data members including at least one unique_ptr.
|
||||
template <typename T, typename Del = std::default_delete<T>>
|
||||
class UniqPtrWNullCopy : public std::unique_ptr<T, Del>
|
||||
{
|
||||
public:
|
||||
using std::unique_ptr<T, Del>::unique_ptr;
|
||||
|
||||
// for compatibility with std::make_unique
|
||||
explicit UniqPtrWNullCopy(std::unique_ptr<T, Del>&& src)
|
||||
: std::unique_ptr<T, Del>::unique_ptr{std::move(src)}
|
||||
{
|
||||
}
|
||||
|
||||
// copy constructor produces nullptr
|
||||
UniqPtrWNullCopy(UniqPtrWNullCopy const&)
|
||||
: std::unique_ptr<T, Del>::unique_ptr{}
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
// for testing only
|
||||
void const* getCommSessionHandle();
|
||||
} // namespace tensorrt_llm::common
|
||||
|
||||
inline bool isBuilding()
|
||||
{
|
||||
auto constexpr key = "IS_BUILDING";
|
||||
auto const val = getenv(key);
|
||||
return val != nullptr && std::string(val) == "1";
|
||||
}
|
||||
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
#define NCCLCHECK(cmd) \
|
||||
do \
|
||||
{ \
|
||||
ncclResult_t r = cmd; \
|
||||
if (r != ncclSuccess) \
|
||||
{ \
|
||||
printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, ncclGetErrorString(r)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
std::unordered_map<nvinfer1::DataType, ncclDataType_t>* getDtypeMap();
|
||||
|
||||
std::shared_ptr<ncclComm_t> getComm(std::set<int> const& group);
|
||||
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
|
||||
//! To save GPU memory, all the plugins share the same cublas and cublasLt handle globally.
|
||||
//! Get cublas and cublasLt handle for current cuda context
|
||||
std::shared_ptr<cublasHandle_t> getCublasHandle();
|
||||
std::shared_ptr<cublasLtHandle_t> getCublasLtHandle();
|
||||
std::shared_ptr<tensorrt_llm::common::CublasMMWrapper> getCublasMMWrapper(std::shared_ptr<cublasHandle_t> cublasHandle,
|
||||
std::shared_ptr<cublasLtHandle_t> cublasltHandle, cudaStream_t stream, void* workspace);
|
||||
|
||||
#ifndef DEBUG
|
||||
|
||||
#define PLUGIN_CHECK(status) \
|
||||
do \
|
||||
{ \
|
||||
if (status != 0) \
|
||||
abort(); \
|
||||
} while (0)
|
||||
|
||||
#define ASSERT_PARAM(exp) \
|
||||
do \
|
||||
{ \
|
||||
if (!(exp)) \
|
||||
return STATUS_BAD_PARAM; \
|
||||
} while (0)
|
||||
|
||||
#define ASSERT_FAILURE(exp) \
|
||||
do \
|
||||
{ \
|
||||
if (!(exp)) \
|
||||
return STATUS_FAILURE; \
|
||||
} while (0)
|
||||
|
||||
#define CSC(call, err) \
|
||||
do \
|
||||
{ \
|
||||
cudaError_t cudaStatus = call; \
|
||||
if (cudaStatus != cudaSuccess) \
|
||||
{ \
|
||||
return err; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define DEBUG_PRINTF(...) \
|
||||
do \
|
||||
{ \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
|
||||
#define ASSERT_PARAM(exp) \
|
||||
do \
|
||||
{ \
|
||||
if (!(exp)) \
|
||||
{ \
|
||||
fprintf(stderr, "Bad param - " #exp ", %s:%d\n", __FILE__, __LINE__); \
|
||||
return STATUS_BAD_PARAM; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define ASSERT_FAILURE(exp) \
|
||||
do \
|
||||
{ \
|
||||
if (!(exp)) \
|
||||
{ \
|
||||
fprintf(stderr, "Failure - " #exp ", %s:%d\n", __FILE__, __LINE__); \
|
||||
return STATUS_FAILURE; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define CSC(call, err) \
|
||||
do \
|
||||
{ \
|
||||
cudaError_t cudaStatus = call; \
|
||||
if (cudaStatus != cudaSuccess) \
|
||||
{ \
|
||||
printf("%s %d CUDA FAIL %s\n", __FILE__, __LINE__, cudaGetErrorString(cudaStatus)); \
|
||||
return err; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define PLUGIN_CHECK(status) \
|
||||
{ \
|
||||
if (status != 0) \
|
||||
{ \
|
||||
DEBUG_PRINTF("%s %d CUDA FAIL %s\n", __FILE__, __LINE__, cudaGetErrorString(status)); \
|
||||
abort(); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define DEBUG_PRINTF(...) \
|
||||
do \
|
||||
{ \
|
||||
printf(__VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
#endif // DEBUG
|
||||
|
||||
#define NVML_CHECK(cmd) \
|
||||
do \
|
||||
{ \
|
||||
nvmlReturn_t r = cmd; \
|
||||
if (r != NVML_SUCCESS) \
|
||||
{ \
|
||||
printf("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__, nvmlErrorString(r)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while (0)
|
||||
358
sgl-kernel/3rdparty/tensorrt_llm/common/quantization.h
vendored
Normal file
358
sgl-kernel/3rdparty/tensorrt_llm/common/quantization.h
vendored
Normal file
@@ -0,0 +1,358 @@
|
||||
/*
|
||||
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
namespace common
|
||||
{
|
||||
|
||||
class QuantMode
|
||||
{
|
||||
// [WARNING] KEEP BELOW DEFINITION IN SYNC WITH tensorrt_llm/quantization/mode.py
|
||||
public:
|
||||
using BaseType = std::uint32_t;
|
||||
|
||||
explicit constexpr QuantMode(BaseType value) noexcept
|
||||
: mValue{value}
|
||||
{
|
||||
}
|
||||
|
||||
QuantMode() noexcept = default;
|
||||
|
||||
constexpr QuantMode(QuantMode const&) noexcept = default;
|
||||
|
||||
constexpr QuantMode& operator=(QuantMode const& other) noexcept = default;
|
||||
|
||||
static constexpr QuantMode none() noexcept
|
||||
{
|
||||
return QuantMode(BaseType(0));
|
||||
}
|
||||
|
||||
static constexpr QuantMode int4Weights() noexcept
|
||||
{
|
||||
return QuantMode(BaseType(1u) << 0);
|
||||
}
|
||||
|
||||
static constexpr QuantMode int8Weights() noexcept
|
||||
{
|
||||
return QuantMode(BaseType(1u) << 1);
|
||||
}
|
||||
|
||||
static constexpr QuantMode activations() noexcept
|
||||
{
|
||||
return QuantMode(BaseType(1u) << 2);
|
||||
}
|
||||
|
||||
static constexpr QuantMode perChannelScaling() noexcept
|
||||
{
|
||||
return QuantMode(BaseType(1u) << 3);
|
||||
}
|
||||
|
||||
static constexpr QuantMode perTokenScaling() noexcept
|
||||
{
|
||||
return QuantMode(BaseType(1u) << 4);
|
||||
}
|
||||
|
||||
static constexpr QuantMode perGroupScaling() noexcept
|
||||
{
|
||||
return QuantMode(BaseType(1u) << 5);
|
||||
}
|
||||
|
||||
static constexpr QuantMode int8KvCache() noexcept
|
||||
{
|
||||
return QuantMode(BaseType(1u) << 6);
|
||||
}
|
||||
|
||||
static constexpr QuantMode fp8KvCache() noexcept
|
||||
{
|
||||
return QuantMode(BaseType(1u) << 7);
|
||||
}
|
||||
|
||||
static constexpr QuantMode fp8Qdq() noexcept
|
||||
{
|
||||
return QuantMode(BaseType(1u) << 8);
|
||||
}
|
||||
|
||||
static constexpr QuantMode fp8RowWise() noexcept
|
||||
{
|
||||
return QuantMode(BaseType(1u) << 3 | BaseType(1u) << 4 | BaseType(1u) << 9);
|
||||
}
|
||||
|
||||
static constexpr QuantMode w4a8QServe() noexcept
|
||||
{
|
||||
return QuantMode(BaseType(1u) << 10);
|
||||
}
|
||||
|
||||
constexpr BaseType value() const noexcept
|
||||
{
|
||||
return mValue;
|
||||
}
|
||||
|
||||
constexpr bool isSet(QuantMode const& mode) const noexcept
|
||||
{
|
||||
return (mValue & mode.value()) == mode.value();
|
||||
}
|
||||
|
||||
constexpr bool hasInt4Weights() const noexcept
|
||||
{
|
||||
return isSet(int4Weights());
|
||||
}
|
||||
|
||||
constexpr bool hasInt8Weights() const noexcept
|
||||
{
|
||||
return isSet(int8Weights());
|
||||
}
|
||||
|
||||
constexpr bool hasActivations() const noexcept
|
||||
{
|
||||
return isSet(activations());
|
||||
}
|
||||
|
||||
constexpr bool hasPerChannelScaling() const noexcept
|
||||
{
|
||||
return isSet(perChannelScaling());
|
||||
}
|
||||
|
||||
constexpr bool hasPerTokenScaling() const noexcept
|
||||
{
|
||||
return isSet(perTokenScaling());
|
||||
}
|
||||
|
||||
constexpr bool hasPerGroupScaling() const noexcept
|
||||
{
|
||||
return isSet(perGroupScaling());
|
||||
}
|
||||
|
||||
constexpr bool hasStaticActivationScaling() const noexcept
|
||||
{
|
||||
return !hasPerTokenScaling();
|
||||
}
|
||||
|
||||
constexpr bool hasInt8KvCache() const noexcept
|
||||
{
|
||||
return isSet(int8KvCache());
|
||||
}
|
||||
|
||||
constexpr bool hasFp8KvCache() const noexcept
|
||||
{
|
||||
return isSet(fp8KvCache());
|
||||
}
|
||||
|
||||
constexpr bool hasFp8Qdq() const noexcept
|
||||
{
|
||||
return isSet(fp8Qdq());
|
||||
}
|
||||
|
||||
constexpr bool hasFp8RowWise() const noexcept
|
||||
{
|
||||
return isSet(fp8RowWise());
|
||||
}
|
||||
|
||||
constexpr bool hasKvCacheQuant() const noexcept
|
||||
{
|
||||
return hasInt8KvCache() || hasFp8KvCache();
|
||||
}
|
||||
|
||||
static constexpr QuantMode fromDescription(bool quantizeWeights = false, bool quantizeActivations = false,
|
||||
bool perToken = false, bool perChannel = false, bool perGroup = false, bool useInt4Weights = false,
|
||||
bool useInt8KvCache = false, bool useFp8KvCache = false, bool useFp8Qdq = false, bool useFp8RowWise = false,
|
||||
bool useW4a8QServe = false)
|
||||
{
|
||||
QuantMode quantMode{};
|
||||
if (quantizeWeights)
|
||||
{
|
||||
if (useInt4Weights)
|
||||
quantMode += int4Weights();
|
||||
else
|
||||
quantMode += int8Weights();
|
||||
}
|
||||
|
||||
if (quantizeActivations)
|
||||
{
|
||||
quantMode += activations();
|
||||
}
|
||||
|
||||
if (perChannel)
|
||||
{
|
||||
quantMode += QuantMode::perChannelScaling();
|
||||
}
|
||||
if (perToken)
|
||||
{
|
||||
quantMode += QuantMode::perTokenScaling();
|
||||
}
|
||||
if (perGroup)
|
||||
{
|
||||
quantMode += QuantMode::perGroupScaling();
|
||||
}
|
||||
|
||||
if (useInt8KvCache)
|
||||
{
|
||||
quantMode += int8KvCache();
|
||||
}
|
||||
|
||||
if (useFp8KvCache)
|
||||
{
|
||||
quantMode += fp8KvCache();
|
||||
}
|
||||
|
||||
if (useFp8Qdq)
|
||||
{
|
||||
quantMode += fp8Qdq();
|
||||
}
|
||||
|
||||
if (useFp8RowWise)
|
||||
{
|
||||
quantMode += fp8RowWise();
|
||||
}
|
||||
|
||||
if (useW4a8QServe)
|
||||
{
|
||||
quantMode += w4a8QServe();
|
||||
}
|
||||
|
||||
return quantMode;
|
||||
}
|
||||
|
||||
static constexpr QuantMode useSmoothQuant(bool perToken = false, bool perChannel = false)
|
||||
{
|
||||
return fromDescription(true, true, perToken, perChannel);
|
||||
}
|
||||
|
||||
static constexpr QuantMode useQServe(bool perGroup)
|
||||
{
|
||||
return fromDescription(true, true, false, false, perGroup, true, false, false, false, false, true);
|
||||
}
|
||||
|
||||
static constexpr QuantMode useWeightOnly(bool useInt4Weights = false, bool perGroup = false)
|
||||
{
|
||||
return fromDescription(true, false, false, false, perGroup, useInt4Weights);
|
||||
}
|
||||
|
||||
static QuantMode const fromQuantAlgo(
|
||||
std::optional<std::string> quantAlgo = std::nullopt, std::optional<std::string> kvCacheQuantAlgo = std::nullopt)
|
||||
{
|
||||
QuantMode quantMode{};
|
||||
if (quantAlgo == "W8A16")
|
||||
{
|
||||
quantMode = useWeightOnly(false, false);
|
||||
}
|
||||
else if (quantAlgo == "W4A16")
|
||||
{
|
||||
quantMode = useWeightOnly(true, false);
|
||||
}
|
||||
else if (quantAlgo == "W4A16_AWQ")
|
||||
{
|
||||
quantMode = useWeightOnly(true, true);
|
||||
}
|
||||
else if (quantAlgo == "W4A8_AWQ")
|
||||
{
|
||||
quantMode = useWeightOnly(true, true);
|
||||
}
|
||||
else if (quantAlgo == "W4A8_QSERVE_PER_GROUP")
|
||||
{
|
||||
quantMode = useQServe(false);
|
||||
}
|
||||
else if (quantAlgo == "W4A8_QSERVE_PER_CHANNEL")
|
||||
{
|
||||
quantMode = useQServe(true);
|
||||
}
|
||||
else if (quantAlgo == "W4A16_GPTQ")
|
||||
{
|
||||
quantMode = useWeightOnly(true, true);
|
||||
}
|
||||
else if (quantAlgo == "W8A8_SQ_PER_CHANNEL")
|
||||
{
|
||||
quantMode = useSmoothQuant(false, true);
|
||||
}
|
||||
else if (quantAlgo == "W8A8_SQ_PER_TENSOR_PLUGIN")
|
||||
{
|
||||
quantMode = useSmoothQuant(false, false);
|
||||
}
|
||||
else if (quantAlgo == "W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN")
|
||||
{
|
||||
quantMode = useSmoothQuant(true, true);
|
||||
}
|
||||
else if (quantAlgo == "W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN")
|
||||
{
|
||||
quantMode = useSmoothQuant(false, true);
|
||||
}
|
||||
else if (quantAlgo == "W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN")
|
||||
{
|
||||
quantMode = useSmoothQuant(true, false);
|
||||
}
|
||||
else if (quantAlgo == "FP8")
|
||||
{
|
||||
quantMode = fromDescription(false, false, false, false, false, false, false, false, true);
|
||||
}
|
||||
else if (quantAlgo == "FP8_ROWWISE")
|
||||
{
|
||||
quantMode = fromDescription(false, false, true, true, false, false, false, false, false, true);
|
||||
}
|
||||
|
||||
if (kvCacheQuantAlgo == "INT8")
|
||||
{
|
||||
quantMode += int8KvCache();
|
||||
}
|
||||
else if (kvCacheQuantAlgo == "FP8")
|
||||
{
|
||||
quantMode += fp8KvCache();
|
||||
}
|
||||
|
||||
return quantMode;
|
||||
}
|
||||
|
||||
constexpr QuantMode operator+(QuantMode const& other) const noexcept
|
||||
{
|
||||
return QuantMode(mValue | other.mValue);
|
||||
}
|
||||
|
||||
constexpr QuantMode& operator+=(QuantMode const& other) noexcept
|
||||
{
|
||||
return *this = *this + other;
|
||||
}
|
||||
|
||||
constexpr QuantMode operator-(QuantMode const& other) const noexcept
|
||||
{
|
||||
return QuantMode(mValue & ~other.mValue);
|
||||
}
|
||||
|
||||
constexpr QuantMode& operator-=(QuantMode const& other) noexcept
|
||||
{
|
||||
return *this = *this - other;
|
||||
}
|
||||
|
||||
constexpr bool operator==(QuantMode const& other) const noexcept
|
||||
{
|
||||
return mValue == other.mValue;
|
||||
}
|
||||
|
||||
constexpr bool operator!=(QuantMode const& other) const noexcept
|
||||
{
|
||||
return !(*this == other);
|
||||
}
|
||||
|
||||
private:
|
||||
BaseType mValue{0};
|
||||
};
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
123
sgl-kernel/3rdparty/tensorrt_llm/common/stlUtils.h
vendored
123
sgl-kernel/3rdparty/tensorrt_llm/common/stlUtils.h
vendored
@@ -1,123 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
#include <optional>
|
||||
#include <sstream>
|
||||
|
||||
namespace tensorrt_llm::common::stl_utils
|
||||
{
|
||||
|
||||
template <typename TInputIt, typename TOutputIt, typename TBinOp>
|
||||
constexpr TOutputIt basicInclusiveScan(TInputIt first, TInputIt last, TOutputIt dFirst, TBinOp op)
|
||||
{
|
||||
if (first != last)
|
||||
{
|
||||
auto val = *first;
|
||||
while (true)
|
||||
{
|
||||
*dFirst = val;
|
||||
++dFirst;
|
||||
++first;
|
||||
if (first == last)
|
||||
{
|
||||
break;
|
||||
}
|
||||
val = op(std::move(val), *first);
|
||||
}
|
||||
}
|
||||
return dFirst;
|
||||
}
|
||||
|
||||
template <typename TInputIt, typename TOutputIt>
|
||||
constexpr TOutputIt inclusiveScan(TInputIt first, TInputIt last, TOutputIt dFirst)
|
||||
{
|
||||
#if defined(__GNUC__) && __GNUC__ <= 8
|
||||
return basicInclusiveScan(first, last, dFirst, std::plus<>{});
|
||||
#else
|
||||
return std::inclusive_scan(first, last, dFirst);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename TInputIt, typename TOutputIt, typename T, typename TBinOp>
|
||||
constexpr TOutputIt basicExclusiveScan(TInputIt first, TInputIt last, TOutputIt dFirst, T init, TBinOp op)
|
||||
{
|
||||
if (first != last)
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
T tmp{op(init, *first)};
|
||||
*dFirst = init;
|
||||
++dFirst;
|
||||
++first;
|
||||
if (first == last)
|
||||
{
|
||||
break;
|
||||
}
|
||||
init = std::move(tmp);
|
||||
}
|
||||
}
|
||||
return dFirst;
|
||||
}
|
||||
|
||||
template <typename TInputIt, typename TOutputIt, typename T>
|
||||
constexpr TOutputIt exclusiveScan(TInputIt first, TInputIt last, TOutputIt dFirst, T init)
|
||||
{
|
||||
#if defined(__GNUC__) && __GNUC__ <= 8
|
||||
return basicExclusiveScan(first, last, dFirst, std::move(init), std::plus<>{});
|
||||
#else
|
||||
return std::exclusive_scan(first, last, dFirst, std::move(init));
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T, typename = void>
|
||||
struct HasOperatorOutput : std::false_type
|
||||
{
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct HasOperatorOutput<T, std::void_t<decltype((std::declval<std::ostream&>() << std::declval<T>()))>>
|
||||
: std::true_type
|
||||
{
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
std::string toString(T const& t, typename std::enable_if_t<HasOperatorOutput<T>::value, int> = 0)
|
||||
{
|
||||
std::ostringstream oss;
|
||||
oss << t;
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::string toString(std::optional<T> const& t, typename std::enable_if_t<HasOperatorOutput<T>::value, int> = 0)
|
||||
{
|
||||
std::ostringstream oss;
|
||||
if (t)
|
||||
{
|
||||
oss << t.value();
|
||||
}
|
||||
else
|
||||
{
|
||||
oss << "None";
|
||||
}
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common::stl_utils
|
||||
113
sgl-kernel/3rdparty/tensorrt_llm/common/stringUtils.h
vendored
Normal file
113
sgl-kernel/3rdparty/tensorrt_llm/common/stringUtils.h
vendored
Normal file
@@ -0,0 +1,113 @@
|
||||
/*
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#if ENABLE_BF16
|
||||
#include <cuda_bf16.h>
|
||||
#endif // ENABLE_BF16
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include <memory> // std::make_unique
|
||||
#include <sstream> // std::stringstream
|
||||
#include <string>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
{
|
||||
#if ENABLE_BF16
|
||||
static inline std::basic_ostream<char>& operator<<(std::basic_ostream<char>& stream, __nv_bfloat16 const& val)
|
||||
{
|
||||
stream << __bfloat162float(val);
|
||||
return stream;
|
||||
}
|
||||
#endif // ENABLE_BF16
|
||||
|
||||
static inline std::basic_ostream<char>& operator<<(std::basic_ostream<char>& stream, __half const& val)
|
||||
{
|
||||
stream << __half2float(val);
|
||||
return stream;
|
||||
}
|
||||
|
||||
inline std::string fmtstr(std::string const& s)
|
||||
{
|
||||
return s;
|
||||
}
|
||||
|
||||
inline std::string fmtstr(std::string&& s)
|
||||
{
|
||||
return s;
|
||||
}
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
std::string fmtstr(char const* format, ...);
|
||||
#else
|
||||
std::string fmtstr(char const* format, ...) __attribute__((format(printf, 1, 2)));
|
||||
#endif
|
||||
|
||||
// __PRETTY_FUNCTION__ is used for neat debugging printing but is not supported on Windows
|
||||
// The alternative is __FUNCSIG__, which is similar but not identical
|
||||
#if defined(_WIN32)
|
||||
#define __PRETTY_FUNCTION__ __FUNCSIG__
|
||||
#endif
|
||||
|
||||
auto constexpr kDefaultDelimiter = ", ";
|
||||
|
||||
template <typename U, typename TStream, typename T>
|
||||
inline TStream& arr2outCasted(TStream& out, T* arr, size_t size, char const* delim = kDefaultDelimiter)
|
||||
{
|
||||
out << "(";
|
||||
if (size > 0)
|
||||
{
|
||||
for (size_t i = 0; i < size - 1; ++i)
|
||||
{
|
||||
out << static_cast<U>(arr[i]) << delim;
|
||||
}
|
||||
out << static_cast<U>(arr[size - 1]);
|
||||
}
|
||||
out << ")";
|
||||
return out;
|
||||
}
|
||||
|
||||
template <typename TStream, typename T>
|
||||
inline TStream& arr2out(TStream& out, T* arr, size_t size, char const* delim = kDefaultDelimiter)
|
||||
{
|
||||
return arr2outCasted<T>(out, arr, size, delim);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline std::string arr2str(T* arr, size_t size, char const* delim = kDefaultDelimiter)
|
||||
{
|
||||
std::stringstream ss;
|
||||
return arr2out(ss, arr, size, delim).str();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline std::string vec2str(std::vector<T> const& vec, char const* delim = kDefaultDelimiter)
|
||||
{
|
||||
return arr2str(vec.data(), vec.size(), delim);
|
||||
}
|
||||
|
||||
inline bool strStartsWith(std::string const& str, std::string const& prefix)
|
||||
{
|
||||
return str.rfind(prefix, 0) == 0;
|
||||
}
|
||||
|
||||
/// @brief Split a string into a set of strings using a delimiter
|
||||
std::unordered_set<std::string> str2set(std::string const& input, char delimiter);
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
@@ -1,42 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <chrono>
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
|
||||
#include "tensorrt_llm/common/timestampUtils.h"
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
{
|
||||
|
||||
std::string getCurrentTimestamp()
|
||||
{
|
||||
auto now = std::chrono::system_clock::now();
|
||||
auto now_t = std::chrono::system_clock::to_time_t(now);
|
||||
auto tm = *std::localtime(&now_t);
|
||||
|
||||
auto epoch_to_now = now.time_since_epoch();
|
||||
auto seconds = std::chrono::duration_cast<std::chrono::seconds>(epoch_to_now);
|
||||
auto us = std::chrono::duration_cast<std::chrono::microseconds>(epoch_to_now - seconds);
|
||||
|
||||
std::ostringstream stream;
|
||||
stream << std::put_time(&tm, "%m-%d-%Y %H:%M:%S");
|
||||
stream << "." << std::setfill('0') << std::setw(6) << us.count();
|
||||
return stream.str();
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
@@ -14,12 +14,35 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <cstddef>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
#define NEW_TLLM_EXCEPTION(...) \
|
||||
tensorrt_llm::common::TllmException(__FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__))
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
{
|
||||
|
||||
/// @brief Get the current timestamp in the format "MM-DD-YYYY HH:MM:SS:uuuuuu"
|
||||
std::string getCurrentTimestamp();
|
||||
class TllmException : public std::runtime_error
|
||||
{
|
||||
public:
|
||||
static auto constexpr MAX_FRAMES = 128;
|
||||
|
||||
explicit TllmException(char const* file, std::size_t line, std::string const& msg);
|
||||
|
||||
~TllmException() noexcept override;
|
||||
|
||||
[[nodiscard]] std::string getTrace() const;
|
||||
|
||||
static std::string demangle(char const* name);
|
||||
|
||||
private:
|
||||
std::array<void*, MAX_FRAMES> mCallstack{};
|
||||
int mNbFrames;
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
Reference in New Issue
Block a user