From 9602c2aac76d2655d4d9aa657e60accde1cfb51f Mon Sep 17 00:00:00 2001
From: Yineng Zhang <me@zhyncs.com>
Date: Fri, 31 Jan 2025 00:39:47 +0800
Subject: [PATCH] keep the parts needed for moe_kernels (#3218)

---
 .../tensorrt_llm/common/CMakeLists.txt        |  22 -
 .../3rdparty/tensorrt_llm/common/assert.cpp   |   0
 .../3rdparty/tensorrt_llm/common/assert.h     |  92 ++
 .../tensorrt_llm/common/cudaDriverWrapper.cpp | 187 ++++
 .../tensorrt_llm/common/cudaDriverWrapper.h   | 138 +++
 .../tensorrt_llm/common/cudaFp8Utils.h        | 239 +++++
 .../tensorrt_llm/common/cudaProfilerUtils.cpp |  84 --
 .../3rdparty/tensorrt_llm/common/cudaUtils.h  | 641 +++++++++++++
 .../common/customAllReduceUtils.h             |  36 -
 .../3rdparty/tensorrt_llm/common/envUtils.cpp | 214 -----
 .../3rdparty/tensorrt_llm/common/envUtils.h   |  60 --
 .../3rdparty/tensorrt_llm/common/logger.h     | 190 ++++
 .../3rdparty/tensorrt_llm/common/mathUtils.h  |  37 -
 .../tensorrt_llm/common/memoryUtils.cu        | 906 ------------------
 .../tensorrt_llm/common/memoryUtils.h         | 292 ------
 .../3rdparty/tensorrt_llm/common/mpiUtils.cpp | 588 ------------
 .../3rdparty/tensorrt_llm/common/nvtxUtils.h  |  46 -
 .../3rdparty/tensorrt_llm/common/opUtils.cpp  | 323 -------
 .../3rdparty/tensorrt_llm/common/opUtils.h    | 215 -----
 .../tensorrt_llm/common/quantization.h        | 358 +++++++
 .../3rdparty/tensorrt_llm/common/stlUtils.h   | 123 ---
 .../tensorrt_llm/common/stringUtils.h         | 113 +++
 .../tensorrt_llm/common/timestampUtils.cpp    |  42 -
 .../{timestampUtils.h => tllmException.h}     |  27 +-
 24 files changed, 1983 insertions(+), 2990 deletions(-)
 delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/CMakeLists.txt
 mode change 100755 => 100644 sgl-kernel/3rdparty/tensorrt_llm/common/assert.cpp
 create mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/assert.h
 create mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.cpp
 create mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.h
 create mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/cudaFp8Utils.h
 delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/cudaProfilerUtils.cpp
 create mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/cudaUtils.h
 delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/customAllReduceUtils.h
 delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/envUtils.cpp
 delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/envUtils.h
 create mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/logger.h
 delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/mathUtils.h
 delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/memoryUtils.cu
 delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/memoryUtils.h
 delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/mpiUtils.cpp
 delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/nvtxUtils.h
 delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/opUtils.cpp
 delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/opUtils.h
 create mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/quantization.h
 delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/stlUtils.h
 create mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/stringUtils.h
 delete mode 100644 sgl-kernel/3rdparty/tensorrt_llm/common/timestampUtils.cpp
 rename sgl-kernel/3rdparty/tensorrt_llm/common/{timestampUtils.h => tllmException.h} (50%)

diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/CMakeLists.txt b/sgl-kernel/3rdparty/tensorrt_llm/common/CMakeLists.txt
deleted file mode 100644
index e479b298d..000000000
--- a/sgl-kernel/3rdparty/tensorrt_llm/common/CMakeLists.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
-# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not
-# use this file except in compliance with the License. You may obtain a copy of
-# the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations under
-# the License.
-#
-file(GLOB SRCS *.cpp)
-file(GLOB CU_SRCS *.cu)
-
-add_library(common_src OBJECT ${SRCS} ${CU_SRCS})
-set_property(TARGET common_src PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET common_src PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/assert.cpp b/sgl-kernel/3rdparty/tensorrt_llm/common/assert.cpp
old mode 100755
new mode 100644
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/assert.h b/sgl-kernel/3rdparty/tensorrt_llm/common/assert.h
new file mode 100644
index 000000000..7f51dbf1b
--- /dev/null
+++ b/sgl-kernel/3rdparty/tensorrt_llm/common/assert.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "tensorrt_llm/common/stringUtils.h"
+#include "tensorrt_llm/common/tllmException.h"
+
+#include <string>
+
+namespace tensorrt_llm::common
+{
+[[noreturn]] inline void throwRuntimeError(char const* const file, int const line, std::string const& info = "")
+{
+    throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info.c_str()));
+}
+
+} // namespace tensorrt_llm::common
+
+class DebugConfig
+{
+public:
+    static bool isCheckDebugEnabled();
+};
+
+#if defined(_WIN32)
+#define TLLM_LIKELY(x) (__assume((x) == 1), (x))
+#define TLLM_UNLIKELY(x) (__assume((x) == 0), (x))
+#else
+#define TLLM_LIKELY(x) __builtin_expect((x), 1)
+#define TLLM_UNLIKELY(x) __builtin_expect((x), 0)
+#endif
+
+#define TLLM_CHECK(val)                                                                                                \
+    do                                                                                                                 \
+    {                                                                                                                  \
+        TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0)                                                               \
+                                            : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val);       \
+    } while (0)
+
+#define TLLM_CHECK_WITH_INFO(val, info, ...)                                                                           \
+    do                                                                                                                 \
+    {                                                                                                                  \
+        TLLM_LIKELY(static_cast<bool>(val))                                                                            \
+        ? ((void) 0)                                                                                                   \
+        : tensorrt_llm::common::throwRuntimeError(                                                                     \
+            __FILE__, __LINE__, tensorrt_llm::common::fmtstr(info, ##__VA_ARGS__));                                    \
+    } while (0)
+
+#define TLLM_CHECK_DEBUG(val)                                                                                          \
+    do                                                                                                                 \
+    {                                                                                                                  \
+        if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled()))                                                         \
+        {                                                                                                              \
+            TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0)                                                           \
+                                                : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val);   \
+        }                                                                                                              \
+    } while (0)
+
+#define TLLM_CHECK_DEBUG_WITH_INFO(val, info, ...)                                                                     \
+    do                                                                                                                 \
+    {                                                                                                                  \
+        if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled()))                                                         \
+        {                                                                                                              \
+            TLLM_LIKELY(static_cast<bool>(val))                                                                        \
+            ? ((void) 0)                                                                                               \
+            : tensorrt_llm::common::throwRuntimeError(                                                                 \
+                __FILE__, __LINE__, tensorrt_llm::common::fmtstr(info, ##__VA_ARGS__));                                \
+        }                                                                                                              \
+    } while (0)
+
+#define TLLM_THROW(...)                                                                                                \
+    do                                                                                                                 \
+    {                                                                                                                  \
+        throw NEW_TLLM_EXCEPTION(__VA_ARGS__);                                                                         \
+    } while (0)
+
+#define TLLM_WRAP(ex)                                                                                                  \
+    NEW_TLLM_EXCEPTION("%s: %s", tensorrt_llm::common::TllmException::demangle(typeid(ex).name()).c_str(), ex.what())
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.cpp b/sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.cpp
new file mode 100644
index 000000000..7eca46a1c
--- /dev/null
+++ b/sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.cpp
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define CUDA_LIB_NAME "cuda"
+
+#if defined(_WIN32)
+#include <windows.h>
+#define dllOpen(name) LoadLibrary("nv" name ".dll")
+#define dllClose(handle) FreeLibrary(static_cast<HMODULE>(handle))
+#define dllGetSym(handle, name) static_cast<void*>(GetProcAddress(static_cast<HMODULE>(handle), name))
+#else // For non-Windows platforms
+#include <dlfcn.h>
+#define dllOpen(name) dlopen("lib" name ".so.1", RTLD_LAZY)
+#define dllClose(handle) dlclose(handle)
+#define dllGetSym(handle, name) dlsym(handle, name)
+#endif // defined(_WIN32)
+
+#include "cudaDriverWrapper.h"
+#include "tensorrt_llm/common/assert.h"
+#include <cstdio>
+#include <cuda.h>
+
+namespace tensorrt_llm::common
+{
+
+std::shared_ptr<CUDADriverWrapper> CUDADriverWrapper::getInstance()
+{
+    static std::mutex mutex;
+    static std::weak_ptr<CUDADriverWrapper> instance;
+    std::shared_ptr<CUDADriverWrapper> result = instance.lock();
+    if (result)
+    {
+        return result;
+    }
+
+    std::lock_guard<std::mutex> lock(mutex);
+    result = instance.lock();
+    if (!result)
+    {
+        result = std::shared_ptr<CUDADriverWrapper>(new CUDADriverWrapper());
+        instance = result;
+    }
+    return result;
+}
+
+CUDADriverWrapper::CUDADriverWrapper()
+    : handle(dllOpen(CUDA_LIB_NAME))
+{
+
+    TLLM_CHECK_WITH_INFO(handle != nullptr, "CUDA driver library is not open correctly.");
+
+    auto load_sym = [](void* handle, char const* name)
+    {
+        void* ret = dllGetSym(handle, name);
+        return ret;
+    };
+
+    *reinterpret_cast<void**>(&_cuGetErrorName) = load_sym(handle, "cuGetErrorName");
+    *reinterpret_cast<void**>(&_cuGetErrorMessage) = load_sym(handle, "cuGetErrorMessage");
+    *reinterpret_cast<void**>(&_cuFuncSetAttribute) = load_sym(handle, "cuFuncSetAttribute");
+    *reinterpret_cast<void**>(&_cuLinkComplete) = load_sym(handle, "cuLinkComplete");
+    *reinterpret_cast<void**>(&_cuModuleUnload) = load_sym(handle, "cuModuleUnload");
+    *reinterpret_cast<void**>(&_cuLinkDestroy) = load_sym(handle, "cuLinkDestroy");
+    *reinterpret_cast<void**>(&_cuModuleLoadData) = load_sym(handle, "cuModuleLoadData");
+    *reinterpret_cast<void**>(&_cuLinkCreate) = load_sym(handle, "cuLinkCreate_v2");
+    *reinterpret_cast<void**>(&_cuModuleGetFunction) = load_sym(handle, "cuModuleGetFunction");
+    *reinterpret_cast<void**>(&_cuModuleGetGlobal) = load_sym(handle, "cuModuleGetGlobal_v2");
+    *reinterpret_cast<void**>(&_cuLinkAddFile) = load_sym(handle, "cuLinkAddFile_v2");
+    *reinterpret_cast<void**>(&_cuLinkAddData) = load_sym(handle, "cuLinkAddData_v2");
+    *reinterpret_cast<void**>(&_cuLaunchCooperativeKernel) = load_sym(handle, "cuLaunchCooperativeKernel");
+    *reinterpret_cast<void**>(&_cuLaunchKernel) = load_sym(handle, "cuLaunchKernel");
+    *reinterpret_cast<void**>(&_cuTensorMapEncodeTiled) = load_sym(handle, "cuTensorMapEncodeTiled");
+    *reinterpret_cast<void**>(&_cuMemcpyDtoH) = load_sym(handle, "cuMemcpyDtoH_v2");
+}
+
+CUDADriverWrapper::~CUDADriverWrapper()
+{
+    dllClose(handle);
+}
+
+CUresult CUDADriverWrapper::cuGetErrorName(CUresult error, char const** pStr) const
+{
+    return (*_cuGetErrorName)(error, pStr);
+}
+
+CUresult CUDADriverWrapper::cuGetErrorMessage(CUresult error, char const** pStr) const
+{
+    return (*_cuGetErrorMessage)(error, pStr);
+}
+
+CUresult CUDADriverWrapper::cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) const
+{
+    return (*_cuFuncSetAttribute)(hfunc, attrib, value);
+}
+
+CUresult CUDADriverWrapper::cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) const
+{
+    return (*_cuLinkComplete)(state, cubinOut, sizeOut);
+}
+
+CUresult CUDADriverWrapper::cuModuleUnload(CUmodule hmod) const
+{
+    return (*_cuModuleUnload)(hmod);
+}
+
+CUresult CUDADriverWrapper::cuLinkDestroy(CUlinkState state) const
+{
+    return (*_cuLinkDestroy)(state);
+}
+
+CUresult CUDADriverWrapper::cuModuleLoadData(CUmodule* module, void const* image) const
+{
+    return (*_cuModuleLoadData)(module, image);
+}
+
+CUresult CUDADriverWrapper::cuLinkCreate(
+    unsigned int numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) const
+{
+    return (*_cuLinkCreate)(numOptions, options, optionValues, stateOut);
+}
+
+CUresult CUDADriverWrapper::cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, char const* name) const
+{
+    return (*_cuModuleGetFunction)(hfunc, hmod, name);
+}
+
+CUresult CUDADriverWrapper::cuModuleGetGlobal(CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, char const* name) const
+{
+    return (*_cuModuleGetGlobal)(dptr, bytes, hmod, name);
+}
+
+CUresult CUDADriverWrapper::cuLinkAddFile(CUlinkState state, CUjitInputType type, char const* path,
+    unsigned int numOptions, CUjit_option* options, void** optionValues) const
+{
+    return (*_cuLinkAddFile)(state, type, path, numOptions, options, optionValues);
+}
+
+CUresult CUDADriverWrapper::cuLinkAddData(CUlinkState state, CUjitInputType type, void* data, size_t size,
+    char const* name, unsigned int numOptions, CUjit_option* options, void** optionValues) const
+{
+    return (*_cuLinkAddData)(state, type, data, size, name, numOptions, options, optionValues);
+}
+
+CUresult CUDADriverWrapper::cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
+    unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
+    unsigned int sharedMemBytes, CUstream hStream, void** kernelParams) const
+{
+    return (*_cuLaunchCooperativeKernel)(
+        f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams);
+}
+
+CUresult CUDADriverWrapper::cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
+    unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
+    unsigned int sharedMemBytes, CUstream hStream, void** kernelParams, void** extra) const
+{
+    return (*_cuLaunchKernel)(
+        f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra);
+}
+
+CUresult CUDADriverWrapper::cuTensorMapEncodeTiled(CUtensorMap* tensorMap, CUtensorMapDataType tensorDataType,
+    cuuint32_t tensorRank, void* globalAddress, cuuint64_t const* globalDim, cuuint64_t const* globalStrides,
+    cuuint32_t const* boxDim, cuuint32_t const* elementStrides, CUtensorMapInterleave interleave,
+    CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill) const
+{
+    return (*_cuTensorMapEncodeTiled)(tensorMap, tensorDataType, tensorRank, globalAddress, globalDim, globalStrides,
+        boxDim, elementStrides, interleave, swizzle, l2Promotion, oobFill);
+}
+
+CUresult CUDADriverWrapper::cuMemcpyDtoH(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount) const
+{
+    return (*_cuMemcpyDtoH)(dstHost, srcDevice, ByteCount);
+}
+
+} // namespace tensorrt_llm::common
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.h b/sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.h
new file mode 100644
index 000000000..c4d470a85
--- /dev/null
+++ b/sgl-kernel/3rdparty/tensorrt_llm/common/cudaDriverWrapper.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CUDA_DRIVER_WRAPPER_H
+#define CUDA_DRIVER_WRAPPER_H
+
+#include "tensorrt_llm/common/assert.h"
+#include <cstdio>
+#include <cuda.h>
+#include <memory>
+#include <mutex>
+
+namespace tensorrt_llm::common
+{
+
+class CUDADriverWrapper
+{
+public:
+    static std::shared_ptr<CUDADriverWrapper> getInstance();
+
+    ~CUDADriverWrapper();
+    CUDADriverWrapper(CUDADriverWrapper const&) = delete;
+    CUDADriverWrapper operator=(CUDADriverWrapper const&) = delete;
+    CUDADriverWrapper(CUDADriverWrapper&&) = delete;
+    CUDADriverWrapper operator=(CUDADriverWrapper&&) = delete;
+
+    CUresult cuGetErrorName(CUresult error, char const** pStr) const;
+
+    CUresult cuGetErrorMessage(CUresult error, char const** pStr) const;
+
+    CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) const;
+
+    CUresult cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) const;
+
+    CUresult cuModuleUnload(CUmodule hmod) const;
+
+    CUresult cuLinkDestroy(CUlinkState state) const;
+
+    CUresult cuModuleLoadData(CUmodule* module, void const* image) const;
+
+    CUresult cuLinkCreate(
+        unsigned int numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) const;
+
+    CUresult cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, char const* name) const;
+
+    CUresult cuModuleGetGlobal(CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, char const* name) const;
+
+    CUresult cuLinkAddFile(CUlinkState state, CUjitInputType type, char const* path, unsigned int numOptions,
+        CUjit_option* options, void** optionValues) const;
+
+    CUresult cuLinkAddData(CUlinkState state, CUjitInputType type, void* data, size_t size, char const* name,
+        unsigned int numOptions, CUjit_option* options, void** optionValues) const;
+
+    CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
+        unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
+        unsigned int sharedMemBytes, CUstream hStream, void** kernelParams) const;
+
+    CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
+        unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes,
+        CUstream hStream, void** kernelParams, void** extra) const;
+
+    CUresult cuTensorMapEncodeTiled(CUtensorMap* tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank,
+        void* globalAddress, cuuint64_t const* globalDim, cuuint64_t const* globalStrides, cuuint32_t const* boxDim,
+        cuuint32_t const* elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle,
+        CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill) const;
+
+    CUresult cuMemcpyDtoH(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount) const;
+
+private:
+    void* handle;
+    CUDADriverWrapper();
+
+    CUresult (*_cuGetErrorName)(CUresult, char const**);
+    CUresult (*_cuGetErrorMessage)(CUresult, char const**);
+    CUresult (*_cuFuncSetAttribute)(CUfunction, CUfunction_attribute, int);
+    CUresult (*_cuLinkComplete)(CUlinkState, void**, size_t*);
+    CUresult (*_cuModuleUnload)(CUmodule);
+    CUresult (*_cuLinkDestroy)(CUlinkState);
+    CUresult (*_cuLinkCreate)(unsigned int, CUjit_option*, void**, CUlinkState*);
+    CUresult (*_cuModuleLoadData)(CUmodule*, void const*);
+    CUresult (*_cuModuleGetFunction)(CUfunction*, CUmodule, char const*);
+    CUresult (*_cuModuleGetGlobal)(CUdeviceptr*, size_t*, CUmodule, char const*);
+    CUresult (*_cuLinkAddFile)(CUlinkState, CUjitInputType, char const*, unsigned int, CUjit_option*, void**);
+    CUresult (*_cuLinkAddData)(
+        CUlinkState, CUjitInputType, void*, size_t, char const*, unsigned int, CUjit_option*, void**);
+    CUresult (*_cuLaunchCooperativeKernel)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int,
+        unsigned int, unsigned int, unsigned int, CUstream, void**);
+    CUresult (*_cuLaunchKernel)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
+        unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes,
+        CUstream hStream, void** kernelParams, void** extra);
+    CUresult (*_cuTensorMapEncodeTiled)(CUtensorMap* tensorMap, CUtensorMapDataType tensorDataType,
+        cuuint32_t tensorRank, void* globalAddress, cuuint64_t const* globalDim, cuuint64_t const* globalStrides,
+        cuuint32_t const* boxDim, cuuint32_t const* elementStrides, CUtensorMapInterleave interleave,
+        CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
+    CUresult (*_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount);
+};
+
+template <typename T>
+void checkDriver(
+    T result, CUDADriverWrapper const& wrap, char const* const func, char const* const file, int const line)
+{
+    if (result)
+    {
+        char const* errorName = nullptr;
+        char const* errorMsg = nullptr;
+        wrap.cuGetErrorName(result, &errorName);
+        wrap.cuGetErrorMessage(result, &errorMsg);
+        throw TllmException(
+            file, line, fmtstr("[TensorRT-LLM][ERROR] CUDA driver error in %s: %s: %s", func, errorName, errorMsg));
+    }
+}
+
+} // namespace tensorrt_llm::common
+
+/*
+ * Macros compliant with TensorRT coding conventions
+ */
+#define TLLM_CU_CHECK(stat)                                                                                            \
+    do                                                                                                                 \
+    {                                                                                                                  \
+        tensorrt_llm::common::checkDriver(                                                                             \
+            (stat), *tensorrt_llm::common::CUDADriverWrapper::getInstance(), #stat, __FILE__, __LINE__);               \
+    } while (0)
+
+#endif // CUDA_DRIVER_WRAPPER_H
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/cudaFp8Utils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/cudaFp8Utils.h
new file mode 100644
index 000000000..aa93b55a5
--- /dev/null
+++ b/sgl-kernel/3rdparty/tensorrt_llm/common/cudaFp8Utils.h
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef ENABLE_FP8
+#include <cuda_fp8.h>
+#include <cuda_runtime.h>
+#include <stdint.h>
+
+#define FP8_MHA
+#define FUSE_GEMM_ACT
+#define FP8_GEMM_OUTPUT_QUANT_DISABLE
+
+#ifdef FUSE_GEMM_ACT
+#define USE_QGMMA
+#endif
+
+namespace tensorrt_llm
+{
+namespace common
+{
+
+constexpr float FP8_E4M3_MAX = 448.0f;
+
+enum QuantizeMode
+{
+    PER_CHANNEL,
+    PER_TENSOR,
+    PER_CHANNEL_WEIGHT_PER_TENSOR_ACT,
+    PER_TOKEN,
+};
+
+// Packed Data Type
+typedef struct __CUDA_ALIGN__(32)
+{
+    float array[8];
+} float8;
+
+typedef struct __CUDA_ALIGN__(16)
+{
+    half array[8];
+} half8;
+
+typedef struct __CUDA_ALIGN__(8)
+{
+    half2 array[2];
+} half2_2;
+
+typedef struct __CUDA_ALIGN__(8)
+{
+    half array[4];
+} half_4;
+
+#ifdef ENABLE_BF16
+typedef struct __CUDA_ALIGN__(4)
+{
+    __nv_bfloat16 array[2];
+} __nv_bfloat16_2;
+
+typedef struct __CUDA_ALIGN__(8)
+{
+    __nv_bfloat162 x, y;
+} __nv_bfloat162_2_xy;
+
+typedef struct __CUDA_ALIGN__(8)
+{
+    __nv_bfloat16 array[4];
+} __nv_bfloat164;
+
+typedef struct __CUDA_ALIGN__(8)
+{
+    __nv_bfloat162 array[2];
+} __nv_bfloat162_2;
+
+typedef struct __CUDA_ALIGN__(16)
+{
+    __nv_bfloat16 array[8];
+} __nv_bfloat168;
+
+typedef struct __CUDA_ALIGN__(16)
+{
+    __nv_bfloat162 array[4];
+} __nv_bfloat162_4;
+
+typedef struct __CUDA_ALIGN__(32)
+{
+    __nv_bfloat16 array[16];
+} __nv_bfloat1616;
+#endif
+
+#ifdef ENABLE_FP8
+typedef struct __CUDA_ALIGN__(2)
+{
+    __nv_fp8_e4m3 array[2];
+} __nv_fp8_2_e4m3;
+
+typedef struct __CUDA_ALIGN__(4)
+{
+    __nv_fp8_e4m3 array[4];
+} __nv_fp8_4_e4m3;
+
+typedef struct __CUDA_ALIGN__(4)
+{
+    __nv_fp8x2_e4m3 array[2];
+} __nv_fp8x2_x2_e4m3;
+
+typedef struct __CUDA_ALIGN__(8)
+{
+    __nv_fp8_e4m3 array[8];
+} __nv_fp8_8_e4m3;
+
+typedef struct __CUDA_ALIGN__(8)
+{
+    __nv_fp8x2_e4m3 array[4];
+} __nv_fp8x2_x4_e4m3;
+
+typedef struct __CUDA_ALIGN__(16)
+{
+    __nv_fp8_e4m3 array[16];
+} __nv_fp8x16_e4m3;
+#endif
+
+// only BF16 and FP8
+template <typename T, int PACK_SIZE>
+struct PackType
+{
+    using type = float;
+};
+
+#ifdef ENABLE_BF16
+template <>
+struct PackType<__nv_bfloat16, 2>
+{
+    using type = __nv_bfloat16_2;
+};
+
+template <>
+struct PackType<__nv_bfloat16, 4>
+{
+    using type = __nv_bfloat164;
+};
+
+template <>
+struct PackType<__nv_bfloat16, 8>
+{
+    using type = __nv_bfloat168;
+};
+#endif
+
+#ifdef ENABLE_FP8
+template <>
+struct PackType<__nv_fp8_e4m3, 2>
+{
+    using type = __nv_fp8_2_e4m3;
+};
+
+template <>
+struct PackType<__nv_fp8_e4m3, 4>
+{
+    using type = __nv_fp8_4_e4m3;
+};
+
+template <>
+struct PackType<__nv_fp8_e4m3, 8>
+{
+    using type = __nv_fp8_8_e4m3;
+};
+#endif
+
+__inline__ __device__ void fp8x4_e4m3_to_bfloat2(__nv_bfloat162* out1, __nv_bfloat162* out2, __nv_fp8x4_e4m3 const* in)
+{
+    const char4 tmp_val = reinterpret_cast<char4 const*>(in)[0];
+    *out1 = __nv_bfloat162((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.x)[0],
+        (float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.y)[0]);
+    *out2 = __nv_bfloat162((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.z)[0],
+        (float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.w)[0]);
+}
+
+__inline__ __device__ __nv_bfloat162 fp8x2_e4m3_to_bfloat2(__nv_fp8x2_e4m3 const* in)
+{
+    const char2 tmp_val = reinterpret_cast<char2 const*>(in)[0];
+    __nv_bfloat162 out = __nv_bfloat162((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.x)[0],
+        (float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.y)[0]);
+    return out;
+}
+
+__inline__ __device__ void fp8x4_e4m3_to_half2(half2* out1, half2* out2, __nv_fp8x4_e4m3 const* in)
+{
+    const char4 tmp_val = reinterpret_cast<char4 const*>(in)[0];
+    *out1 = half2((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.x)[0],
+        (float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.y)[0]);
+    *out2 = half2((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.z)[0],
+        (float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.w)[0]);
+}
+
+__inline__ __device__ half2 fp8x2_e4m3_to_half2(__nv_fp8x2_e4m3 const* in)
+{
+    const char2 tmp_val = reinterpret_cast<char2 const*>(in)[0];
+    half2 out = half2((float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.x)[0],
+        (float) reinterpret_cast<__nv_fp8_e4m3 const*>(&tmp_val.y)[0]);
+    return out;
+}
+
+template <typename T_OUT, typename T_S, typename T_IN>
+void invokeQuantizeMatrix(T_OUT* output, T_S const* input_qua_amax_ptr, T_IN const* input, int64_t numel, int64_t lda,
+    QuantizeMode quantize_mode, cudaStream_t stream);
+
+template <typename T_OUT, typename T_S, typename T_IN>
+void invokeDequantizeMatrix(T_OUT* output, T_S const* input_qua_amax_ptr, T_IN const* input, int64_t numel, int64_t lda,
+    QuantizeMode quantize_mode, cudaStream_t stream);
+
+template <typename T_FAKE, typename T_OUT, typename T_IN>
+void invokeFakeQuantize(T_OUT* dst, const T_IN* src, const int64_t numel, cudaStream_t stream);
+
+template <typename T_S, typename T_W>
+void invokeComputeFP8QuantizeScale(T_S* quant_ptr, const T_W* weights, const int64_t k, const int64_t lda,
+    QuantizeMode quantize_mode, cudaStream_t stream);
+
+template <typename T_OUT, typename T_S, typename T_IN>
+void invokeComputeScalesAndQuantizeMatrix(T_OUT* output, T_S* quant_ptr, const T_IN* weights, const int64_t numel,
+    const int64_t lda, QuantizeMode quantize_mode, cudaStream_t stream);
+
+} // namespace common
+} // namespace tensorrt_llm
+#endif // ENABLE_FP8
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/cudaProfilerUtils.cpp b/sgl-kernel/3rdparty/tensorrt_llm/common/cudaProfilerUtils.cpp
deleted file mode 100644
index 5576fe782..000000000
--- a/sgl-kernel/3rdparty/tensorrt_llm/common/cudaProfilerUtils.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "tensorrt_llm/common/cudaProfilerUtils.h"
-#include "tensorrt_llm/common/logger.h"
-#include "tensorrt_llm/common/stringUtils.h"
-#include <cstdint>
-#include <optional>
-
-namespace
-{
-
-std::tuple<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIterationIndexesImpl(
-    std::string const& envVarName)
-{
-    auto envVarVal = std::getenv(envVarName.c_str());
-    auto envVarValStr = std::string{envVarVal != nullptr ? envVarVal : ""};
-    auto values = tensorrt_llm::common::str2set(envVarValStr, ',');
-    std::unordered_set<int32_t> startSet;
-    std::unordered_set<int32_t> endSet;
-    for (std::string const& value : values)
-    {
-        size_t dashIdx = value.find("-");
-        if (dashIdx != std::string::npos)
-        {
-            int32_t start = std::stoi(value.substr(0, dashIdx));
-            startSet.insert(start);
-            int32_t end = std::stoi(value.substr(dashIdx + 1));
-            endSet.insert(end);
-        }
-        else
-        {
-            int32_t start_end = std::stoi(value);
-            startSet.insert(start_end);
-            endSet.insert(start_end);
-        }
-    }
-
-    return std::make_pair(startSet, endSet);
-}
-
-} // namespace
-
-namespace tensorrt_llm::common
-{
-
-std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIterationIndexes(
-    std::string const& envVarName, std::optional<std::string> const& legacyEnvVarName)
-{
-    auto [profileIterIdxs, stopIterIdxs] = populateIterationIndexesImpl(envVarName);
-
-    // If empty, try to use legacy env var name
-    if (legacyEnvVarName && profileIterIdxs.empty() && stopIterIdxs.empty())
-    {
-        std::tie(profileIterIdxs, stopIterIdxs) = populateIterationIndexesImpl(legacyEnvVarName.value());
-
-        if (!profileIterIdxs.empty() || !stopIterIdxs.empty())
-        {
-            TLLM_LOG_WARNING(
-                "Using deprecated environment variable %s to specify cudaProfiler start and stop iterations. "
-                "Please "
-                "use %s "
-                "instead.",
-                legacyEnvVarName.value().c_str(), envVarName.c_str());
-        }
-    }
-
-    return std::make_pair(profileIterIdxs, stopIterIdxs);
-}
-
-} // namespace tensorrt_llm::common
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/cudaUtils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/cudaUtils.h
new file mode 100644
index 000000000..13ee3367e
--- /dev/null
+++ b/sgl-kernel/3rdparty/tensorrt_llm/common/cudaUtils.h
@@ -0,0 +1,641 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "tensorrt_llm/common/cudaBf16Wrapper.h"
+#include "tensorrt_llm/common/cudaDriverWrapper.h"
+#include "tensorrt_llm/common/cudaFp8Utils.h"
+#include "tensorrt_llm/common/logger.h"
+#include "tensorrt_llm/common/tllmException.h"
+#include <algorithm>
+#include <cinttypes>
+#include <cublasLt.h>
+#include <cublas_v2.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <driver_types.h>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <optional>
+#include <string>
+#ifndef _WIN32 // Linux
+#include <sys/sysinfo.h>
+#endif         // not WIN32
+#include <vector>
+#ifdef _WIN32  // Windows
+#include <windows.h>
+#undef ERROR   // A Windows header file defines ERROR as 0, but it's used in our logger.h enum. Logging breaks without
+               // this undef.
+#endif         // WIN32
+
+namespace tensorrt_llm::common
+{
+
+// workspace for cublas gemm : 32MB
+#define CUBLAS_WORKSPACE_SIZE 33554432
+
+typedef struct __align__(4)
+{
+    half x, y, z, w;
+}
+
+half4;
+
+/* **************************** type definition ***************************** */
+
+enum CublasDataType
+{
+    FLOAT_DATATYPE = 0,
+    HALF_DATATYPE = 1,
+    BFLOAT16_DATATYPE = 2,
+    INT8_DATATYPE = 3,
+    FP8_DATATYPE = 4
+};
+
+enum TRTLLMCudaDataType
+{
+    FP32 = 0,
+    FP16 = 1,
+    BF16 = 2,
+    INT8 = 3,
+    FP8 = 4
+};
+
+enum class OperationType
+{
+    FP32,
+    FP16,
+    BF16,
+    INT8,
+    FP8
+};
+
+/* **************************** debug tools ********************************* */
+static char const* _cudaGetErrorEnum(cudaError_t error)
+{
+    return cudaGetErrorString(error);
+}
+
+static char const* _cudaGetErrorEnum(cublasStatus_t error)
+{
+    switch (error)
+    {
+    case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
+
+    case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
+
+    case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
+
+    case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
+
+    case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
+
+    case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
+
+    case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
+
+    case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
+
+    case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
+
+    case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR";
+    }
+    return "<unknown>";
+}
+
+template <typename T>
+void check(T result, char const* const func, char const* const file, int const line)
+{
+    if (result)
+    {
+        throw TllmException(
+            file, line, fmtstr("[TensorRT-LLM][ERROR] CUDA runtime error in %s: %s", func, _cudaGetErrorEnum(result)));
+    }
+}
+
+template <typename T>
+void checkEx(T result, std::initializer_list<T> const& validReturns, char const* const func, char const* const file,
+    int const line)
+{
+    if (std::all_of(std::begin(validReturns), std::end(validReturns), [&result](T const& t) { return t != result; }))
+    {
+        throw TllmException(
+            file, line, fmtstr("[TensorRT-LLM][ERROR] CUDA runtime error in %s: %s", func, _cudaGetErrorEnum(result)));
+    }
+}
+
+#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)
+#define check_cuda_error_2(val, file, line) check((val), #val, file, line)
+
+inline std::optional<bool> isCudaLaunchBlocking()
+{
+    static bool firstCall = true;
+    static std::optional<bool> result = std::nullopt;
+
+    if (firstCall)
+    {
+        char const* env = std::getenv("CUDA_LAUNCH_BLOCKING");
+        if (env != nullptr && std::string(env) == "1")
+        {
+            result = true;
+        }
+        else if (env != nullptr && std::string(env) == "0")
+        {
+            result = false;
+        }
+        firstCall = false;
+    }
+
+    return result;
+}
+
+inline bool doCheckError()
+{
+    auto const cudaLaunchBlocking = isCudaLaunchBlocking();
+#ifndef NDEBUG
+    bool const checkError = cudaLaunchBlocking.value_or(true);
+#else
+    bool const checkError = cudaLaunchBlocking.value_or(false);
+#endif
+
+    return checkError;
+}
+
+inline void syncAndCheck(char const* const file, int const line)
+{
+    if (doCheckError())
+    {
+        cudaDeviceSynchronize();
+        check(cudaGetLastError(), "cudaGetLastError", file, line);
+    }
+}
+
+#define sync_check_cuda_error() tensorrt_llm::common::syncAndCheck(__FILE__, __LINE__)
+
+#define PRINT_FUNC_NAME_()                                                                                             \
+    do                                                                                                                 \
+    {                                                                                                                  \
+        std::cout << "[TensorRT-LLM][CALL] " << __FUNCTION__ << " " << std::endl;                                      \
+    } while (0)
+
+// clang-format off
+template<typename T> struct packed_type;
+template <>          struct packed_type<float>         { using type = float; }; // we don't need to pack float by default
+template <>          struct packed_type<half>          { using type = half2; };
+
+#ifdef ENABLE_BF16
+template<>
+struct packed_type<__nv_bfloat16> {
+    using type = __nv_bfloat162;
+};
+#endif
+
+#ifdef ENABLE_FP8
+template<>
+struct packed_type<__nv_fp8_e4m3> {
+    using type = __nv_fp8x2_e4m3;
+};
+#endif
+
+template<typename T> struct num_elems;
+template <>          struct num_elems<float>           { static constexpr int value = 1; };
+template <>          struct num_elems<float2>          { static constexpr int value = 2; };
+template <>          struct num_elems<float4>          { static constexpr int value = 4; };
+template <>          struct num_elems<half>            { static constexpr int value = 1; };
+template <>          struct num_elems<half2>           { static constexpr int value = 2; };
+#ifdef ENABLE_BF16
+template <>          struct num_elems<__nv_bfloat16>   { static constexpr int value = 1; };
+template <>          struct num_elems<__nv_bfloat162>  { static constexpr int value = 2; };
+#endif
+#ifdef ENABLE_FP8
+template <>          struct num_elems<__nv_fp8_e4m3>   { static constexpr int value = 1; };
+template <>          struct num_elems<__nv_fp8x2_e4m3>  { static constexpr int value = 2; };
+#endif
+
+template<typename T, int num> struct packed_as;
+template<typename T>          struct packed_as<T, 1>              { using type = T; };
+template<>                    struct packed_as<half,  2>          { using type = half2; };
+template<>                    struct packed_as<float,  2>         { using type = float2; };
+template<>                    struct packed_as<int8_t, 2>         { using type = int16_t; };
+template<>                    struct packed_as<int32_t, 2>        { using type = int2; };
+template<>                    struct packed_as<half2, 1>          { using type = half; };
+template<>                    struct packed_as<float2, 1>         { using type = float; };
+#ifdef ENABLE_BF16
+template<> struct packed_as<__nv_bfloat16,  2> { using type = __nv_bfloat162; };
+template<> struct packed_as<__nv_bfloat162, 1> { using type = __nv_bfloat16;  };
+#endif
+#ifdef ENABLE_FP8
+template<> struct packed_as<__nv_fp8_e4m3,  2> { using type = __nv_fp8x2_e4m3; };
+template<> struct packed_as<__nv_fp8x2_e4m3, 1> { using type = __nv_fp8_e4m3;  };
+template<> struct packed_as<__nv_fp8_e5m2,  2> { using type = __nv_fp8x2_e5m2; };
+template<> struct packed_as<__nv_fp8x2_e5m2, 1> { using type = __nv_fp8_e5m2;  };
+#endif
+
+inline __device__ float2 operator*(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
+inline __device__ float2 operator+(float2 a, float2 b) { return make_float2(a.x + b.x, a.y + b.y); }
+inline __device__ float2 operator-(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }
+
+inline __device__ float2 operator*(float2 a, float  b) { return make_float2(a.x * b, a.y * b); }
+inline __device__ float2 operator+(float2 a, float  b) { return make_float2(a.x + b, a.y + b); }
+inline __device__ float2 operator-(float2 a, float  b) { return make_float2(a.x - b, a.y - b); }
+
+// clang-format on
+
+template <typename T>
+struct CudaDataType
+{
+};
+
+template <>
+struct CudaDataType<float>
+{
+    static constexpr cudaDataType_t value = cudaDataType::CUDA_R_32F;
+};
+
+template <>
+struct CudaDataType<half>
+{
+    static constexpr cudaDataType_t value = cudaDataType::CUDA_R_16F;
+};
+
+#ifdef ENABLE_BF16
+template <>
+struct CudaDataType<__nv_bfloat16>
+{
+    static constexpr cudaDataType_t value = cudaDataType::CUDA_R_16BF;
+};
+#endif
+
+inline int getSMVersion()
+{
+    int device{-1};
+    check_cuda_error(cudaGetDevice(&device));
+    int sm_major = 0;
+    int sm_minor = 0;
+    check_cuda_error(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device));
+    check_cuda_error(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device));
+    return sm_major * 10 + sm_minor;
+}
+
+inline int getDevice()
+{
+    int current_dev_id = 0;
+    check_cuda_error(cudaGetDevice(&current_dev_id));
+    return current_dev_id;
+}
+
+inline int getDeviceCount()
+{
+    int count = 0;
+    check_cuda_error(cudaGetDeviceCount(&count));
+    return count;
+}
+
+/// @brief Identifies the memory type of the given pointer.
+template <typename T>
+cudaMemoryType getPtrCudaMemoryType(T* ptr)
+{
+    cudaPointerAttributes attributes{};
+    check_cuda_error(cudaPointerGetAttributes(&attributes, ptr));
+    return attributes.type;
+}
+
+/// Get the memory info
+/// \return The free and total amount of memory in bytes
+inline std::tuple<size_t, size_t> getDeviceMemoryInfo(bool const useUvm)
+{
+    if (useUvm)
+    {
+        size_t freeSysMem = 0;
+        size_t totalSysMem = 0;
+#ifndef _WIN32 // Linux
+        struct sysinfo info
+        {
+        };
+
+        sysinfo(&info);
+        totalSysMem = info.totalram * info.mem_unit;
+        freeSysMem = info.freeram * info.mem_unit;
+#else  // Windows
+        MEMORYSTATUSEX memInfo;
+        memInfo.dwLength = sizeof(memInfo);
+        GlobalMemoryStatusEx(&memInfo);
+        totalSysMem = memInfo.ullTotalPhys;
+        freeSysMem = memInfo.ullAvailPhys;
+#endif // WIN32
+
+        TLLM_LOG_INFO("Using UVM based system memory for KV cache, total memory %0.2f GB, available memory %0.2f GB",
+            ((double) totalSysMem / 1e9), ((double) freeSysMem / 1e9));
+        return {freeSysMem, totalSysMem};
+    }
+
+    size_t free = 0;
+    size_t total = 0;
+    check_cuda_error(cudaMemGetInfo(&free, &total));
+    TLLM_LOG_DEBUG("Using GPU memory for KV cache, total memory %0.2f GB, available memory %0.2f GB",
+        ((double) total / 1e9), ((double) free / 1e9));
+    return {free, total};
+}
+
+/// @brief Gets the memory allocation granularity for the current device.
+///
+/// @return size_t The size of the smallest difference in memory size supported by the current device.
+inline size_t getAllocationGranularity()
+{
+    auto const currentDevice = getDevice();
+    ::CUmemAllocationProp prop = {};
+
+    prop.type = ::CU_MEM_ALLOCATION_TYPE_PINNED;
+    prop.location.type = ::CU_MEM_LOCATION_TYPE_DEVICE;
+    prop.location.id = currentDevice;
+    prop.requestedHandleTypes = ::CU_MEM_HANDLE_TYPE_NONE;
+
+    // Get the minimum granularity supported for allocation with cuMemCreate()
+    size_t granularity = 0;
+    TLLM_CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+    return granularity;
+}
+
+inline int getMultiProcessorCount()
+{
+    int device_id = 0;
+    int multi_processor_count = 0;
+    check_cuda_error(cudaGetDevice(&device_id));
+    check_cuda_error(cudaDeviceGetAttribute(&multi_processor_count, cudaDevAttrMultiProcessorCount, device_id));
+    return multi_processor_count;
+}
+
+inline int getMaxSharedMemoryPerBlockOptin()
+{
+    int device_id = 0;
+    int max_shared_memory_per_block = 0;
+    check_cuda_error(cudaGetDevice(&device_id));
+    check_cuda_error(
+        cudaDeviceGetAttribute(&max_shared_memory_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id));
+    return max_shared_memory_per_block;
+}
+
+template <typename T1, typename T2>
+inline size_t divUp(const T1& a, const T2& n)
+{
+    auto const tmp_a = static_cast<size_t>(a);
+    auto const tmp_n = static_cast<size_t>(n);
+    return (tmp_a + tmp_n - 1) / tmp_n;
+}
+
+inline int roundUp(int a, int n)
+{
+    return divUp(a, n) * n;
+}
+
+template <typename T, typename U, typename = std::enable_if_t<std::is_integral<T>::value>,
+    typename = std::enable_if_t<std::is_integral<U>::value>>
+auto constexpr ceilDiv(T numerator, U denominator)
+{
+    return (numerator + denominator - 1) / denominator;
+}
+
+template <typename T>
+void printAbsMean(T const* buf, uint64_t size, cudaStream_t stream, std::string name = "")
+{
+    if (buf == nullptr)
+    {
+        TLLM_LOG_WARNING("%s is an nullptr, skip!", name.c_str());
+        return;
+    }
+    cudaDeviceSynchronize();
+    check_cuda_error(cudaGetLastError());
+    T* h_tmp = new T[size];
+    cudaMemcpyAsync(h_tmp, buf, sizeof(T) * size, cudaMemcpyDeviceToHost, stream);
+    cudaDeviceSynchronize();
+    check_cuda_error(cudaGetLastError());
+    double sum = 0.0f;
+    uint64_t zero_count = 0;
+    float max_val = -1e10;
+    bool find_inf = false;
+    for (uint64_t i = 0; i < size; i++)
+    {
+        if (std::isinf((float) (h_tmp[i])))
+        {
+            find_inf = true;
+            continue;
+        }
+        sum += abs((double) h_tmp[i]);
+        if ((float) h_tmp[i] == 0.0f)
+        {
+            zero_count++;
+        }
+        max_val = max_val > abs(float(h_tmp[i])) ? max_val : abs(float(h_tmp[i]));
+    }
+    TLLM_LOG_INFO("%20s size: %u, abs mean: %f, abs sum: %f, abs max: %f, find inf: %s", name.c_str(), size, sum / size,
+        sum, max_val, find_inf ? "true" : "false");
+    delete[] h_tmp;
+    cudaDeviceSynchronize();
+    check_cuda_error(cudaGetLastError());
+}
+
+template <typename T>
+void printToStream(T const* result, int const size, FILE* strm)
+{
+    bool const split_rows = (strm == stdout);
+    if (result == nullptr)
+    {
+        TLLM_LOG_WARNING("It is an nullptr, skip! \n");
+        return;
+    }
+    T* tmp = reinterpret_cast<T*>(malloc(sizeof(T) * size));
+    check_cuda_error(cudaMemcpy(tmp, result, sizeof(T) * size, cudaMemcpyDeviceToHost));
+    for (int i = 0; i < size; ++i)
+    {
+        fprintf(strm, "%f, ", static_cast<float>(tmp[i]));
+        if (split_rows && ((i + 1) % 10) == 0)
+            fprintf(strm, "\n");
+    }
+    if (!split_rows || (size % 10) != 0)
+    {
+        fprintf(strm, "\n");
+    }
+    free(tmp);
+}
+
+template <typename T>
+void printToScreen(T const* result, int const size)
+{
+    printToStream(result, size, stdout);
+}
+
+template <typename T>
+void print2dToStream(T const* result, int const r, int const c, int const stride, FILE* strm)
+{
+    if (result == nullptr)
+    {
+        TLLM_LOG_WARNING("It is an nullptr, skip! \n");
+        return;
+    }
+    for (int ri = 0; ri < r; ++ri)
+    {
+        T const* ptr = result + ri * stride;
+        printToStream(ptr, c, strm);
+    }
+    fprintf(strm, "\n");
+}
+
+template <typename T>
+void print2dToScreen(T const* result, int const r, int const c, int const stride)
+{
+    print2dToStream(result, r, c, stride, stdout);
+}
+
+template <typename T>
+void print2dToFile(std::string fname, T const* result, int const r, int const c, int const stride)
+{
+    FILE* fp = fopen(fname.c_str(), "wt");
+    if (fp != nullptr)
+    {
+        print2dToStream(result, r, c, stride, fp);
+        fclose(fp);
+    }
+}
+
+inline void print_float_(float x)
+{
+    printf("%7.3f ", x);
+}
+
+inline void print_element_(float x)
+{
+    print_float_(x);
+}
+
+inline void print_element_(half x)
+{
+    print_float_((float) x);
+}
+
+#ifdef ENABLE_BF16
+inline void print_element_(__nv_bfloat16 x)
+{
+    print_float_((float) x);
+}
+#endif
+
+#ifdef ENABLE_FP8
+inline void print_element_(__nv_fp8_e4m3 x)
+{
+    print_float_((float) x);
+}
+#endif
+
+inline void print_element_(uint32_t ul)
+{
+    printf("%7" PRIu32, ul);
+}
+
+inline void print_element_(uint64_t ull)
+{
+    printf("%7" PRIu64, ull);
+}
+
+inline void print_element_(int32_t il)
+{
+    printf("%7" PRId32, il);
+}
+
+inline void print_element_(int64_t ill)
+{
+    printf("%7" PRId64, ill);
+}
+
+template <typename T>
+inline void printMatrix(T const* ptr, int m, int k, int stride, bool is_device_ptr)
+{
+    T* tmp;
+    if (is_device_ptr)
+    {
+        // k < stride ; stride = col-dimension.
+        tmp = reinterpret_cast<T*>(malloc(m * stride * sizeof(T)));
+        check_cuda_error(cudaMemcpy(tmp, ptr, sizeof(T) * m * stride, cudaMemcpyDeviceToHost));
+        cudaDeviceSynchronize();
+    }
+    else
+    {
+        tmp = const_cast<T*>(ptr);
+    }
+
+    for (int ii = -1; ii < m; ++ii)
+    {
+        if (ii >= 0)
+        {
+            printf("%07d ", ii);
+        }
+        else
+        {
+            printf("   ");
+        }
+
+        for (int jj = 0; jj < k; jj += 1)
+        {
+            if (ii >= 0)
+            {
+                print_element_(tmp[ii * stride + jj]);
+            }
+            else
+            {
+                printf("%7d ", jj);
+            }
+        }
+        printf("\n");
+    }
+    if (is_device_ptr)
+    {
+        free(tmp);
+    }
+}
+
+template void printMatrix(float const* ptr, int m, int k, int stride, bool is_device_ptr);
+template void printMatrix(half const* ptr, int m, int k, int stride, bool is_device_ptr);
+#ifdef ENABLE_BF16
+template void printMatrix(__nv_bfloat16 const* ptr, int m, int k, int stride, bool is_device_ptr);
+#endif
+#ifdef ENABLE_FP8
+template void printMatrix(__nv_fp8_e4m3 const* ptr, int m, int k, int stride, bool is_device_ptr);
+#endif
+template void printMatrix(uint32_t const* ptr, int m, int k, int stride, bool is_device_ptr);
+template void printMatrix(uint64_t const* ptr, int m, int k, int stride, bool is_device_ptr);
+template void printMatrix(int const* ptr, int m, int k, int stride, bool is_device_ptr);
+
+} // namespace tensorrt_llm::common
+
+/*
+ * Macros compliant with TensorRT coding conventions
+ */
+#define TLLM_CUDA_CHECK(stat)                                                                                          \
+    do                                                                                                                 \
+    {                                                                                                                  \
+        tensorrt_llm::common::check((stat), #stat, __FILE__, __LINE__);                                                \
+    } while (0)
+
+// We use singleton memory pool and the order of destructors depends on the compiler implementation. We find that the
+// cudaFree/cudaFreeHost is called after cudaruntime destruction on Windows. There will be an cudaErrorCudartUnloading
+// error.  However, it is safe to ignore this error because the cuda runtime is already exited, we are no more worried
+// about the memory leaks.
+#define TLLM_CUDA_CHECK_FREE_RESOURCE(stat)                                                                            \
+    do                                                                                                                 \
+    {                                                                                                                  \
+        tensorrt_llm::common::checkEx((stat), {cudaSuccess, cudaErrorCudartUnloading}, #stat, __FILE__, __LINE__);     \
+    } while (0)
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/customAllReduceUtils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/customAllReduceUtils.h
deleted file mode 100644
index d7bf43b40..000000000
--- a/sgl-kernel/3rdparty/tensorrt_llm/common/customAllReduceUtils.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cstddef>
-
-namespace tensorrt_llm::utils::customAllReduceUtils
-{
-
-constexpr size_t NUM_POINTERS_PER_RANK = 7;
-
-// WARNING: MUST BE KEPT IN SYNC with tensorrt_llm/plugin/plugin.py
-inline size_t getMaxRequiredWorkspaceSize(int worldSize) noexcept
-{
-    if (worldSize <= 2)
-    {
-        return 16 * 1000 * 1000;
-    }
-    return 8 * 1000 * 1000;
-}
-
-} // namespace tensorrt_llm::utils::customAllReduceUtils
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/envUtils.cpp b/sgl-kernel/3rdparty/tensorrt_llm/common/envUtils.cpp
deleted file mode 100644
index 64d3d44ac..000000000
--- a/sgl-kernel/3rdparty/tensorrt_llm/common/envUtils.cpp
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "envUtils.h"
-#include "tensorrt_llm/common/cudaUtils.h"
-#include "tensorrt_llm/common/logger.h"
-#include <cstdlib>
-
-namespace tensorrt_llm::common
-{
-
-std::optional<int32_t> getIntEnv(char const* name)
-{
-    char const* const env = std::getenv(name);
-    if (env == nullptr)
-    {
-        return std::nullopt;
-    }
-    int32_t const val = std::stoi(env);
-    if (val <= 0)
-    {
-        return std::nullopt;
-    }
-    return {val};
-};
-
-// Returns true if the env variable exists and is set to "1"
-static bool getBoolEnv(char const* name)
-{
-    char const* env = std::getenv(name);
-    return env && env[0] == '1' && env[1] == '\0';
-}
-
-// XQA kernels (optimized kernels for generation phase).
-bool forceXQAKernels()
-{
-    static bool const forceXQA = (getIntEnv("TRTLLM_FORCE_XQA").value_or(0) != 0);
-    return forceXQA;
-}
-
-std::optional<bool> getEnvEnableXQAJIT()
-{
-    static bool init = false;
-    static bool exists = false;
-    static bool enableXQAJIT = false;
-    if (!init)
-    {
-        init = true;
-        char const* enable_xqa_jit_var = std::getenv("TRTLLM_ENABLE_XQA_JIT");
-        if (enable_xqa_jit_var)
-        {
-            exists = true;
-            if (enable_xqa_jit_var[0] == '1' && enable_xqa_jit_var[1] == '\0')
-            {
-                enableXQAJIT = true;
-            }
-        }
-    }
-    if (exists)
-    {
-        return enableXQAJIT;
-    }
-    else
-    {
-        return std::nullopt;
-    }
-}
-
-// Tune the number of blocks per sequence for accuracy/performance purpose.
-bool getEnvMmhaMultiblockDebug()
-{
-    static bool init = false;
-    static bool forceMmhaMaxSeqLenTile = false;
-    if (!init)
-    {
-        init = true;
-        char const* enable_mmha_debug_var = std::getenv("TRTLLM_ENABLE_MMHA_MULTI_BLOCK_DEBUG");
-        if (enable_mmha_debug_var)
-        {
-            if (enable_mmha_debug_var[0] == '1' && enable_mmha_debug_var[1] == '\0')
-            {
-                forceMmhaMaxSeqLenTile = true;
-            }
-        }
-    }
-    return forceMmhaMaxSeqLenTile;
-}
-
-int getEnvMmhaBlocksPerSequence()
-{
-    static bool init = false;
-    static int mmhaBlocksPerSequence = 0;
-    if (!init)
-    {
-        init = true;
-        char const* mmhaBlocksPerSequenceEnv = std::getenv("TRTLLM_MMHA_BLOCKS_PER_SEQUENCE");
-        if (mmhaBlocksPerSequenceEnv)
-        {
-            mmhaBlocksPerSequence = std::atoi(mmhaBlocksPerSequenceEnv);
-            if (mmhaBlocksPerSequence <= 0)
-            {
-                TLLM_LOG_WARNING("Invalid value for TRTLLM_MMHA_BLOCKS_PER_SEQUENCE. Will use default values instead!");
-            }
-        }
-    }
-    return mmhaBlocksPerSequence;
-}
-
-int getEnvMmhaKernelBlockSize()
-{
-    static bool init = false;
-    static int mmhaKernelBlockSize = 0;
-    if (!init)
-    {
-        init = true;
-        char const* mmhaKernelBlockSizeEnv = std::getenv("TRTLLM_MMHA_KERNEL_BLOCK_SIZE");
-        if (mmhaKernelBlockSizeEnv)
-        {
-            mmhaKernelBlockSize = std::atoi(mmhaKernelBlockSizeEnv);
-            if (mmhaKernelBlockSize <= 0)
-            {
-                TLLM_LOG_WARNING("Invalid value for TRTLLM_MMHA_KERNEL_BLOCK_SIZE. Will use default values instead!");
-            }
-        }
-    }
-    return mmhaKernelBlockSize;
-}
-
-bool getEnvEnablePDL()
-{
-    static bool init = false;
-    static bool enablePDL = false;
-    if (!init)
-    {
-        init = true;
-        // PDL only available when arch >= 90
-        if (getSMVersion() >= 90)
-        {
-            // PDL will be enabled by setting the env variables `TRTLLM_ENABLE_PDL` to `1`
-            enablePDL = getBoolEnv("TRTLLM_ENABLE_PDL");
-        }
-    }
-    return enablePDL;
-}
-
-bool getEnvUseUCXKvCache()
-{
-    static bool const useUCXKVCache = getBoolEnv("TRTLLM_USE_UCX_KVCACHE");
-    return useUCXKVCache;
-}
-
-std::string getEnvUCXInterface()
-{
-    static bool init = false;
-    static std::string ucxInterface;
-    if (!init)
-    {
-        init = true;
-        {
-            char const* ucx_interface = std::getenv("TRTLLM_UCX_INTERFACE");
-            if (ucx_interface)
-            {
-                ucxInterface = ucx_interface;
-            }
-        }
-    }
-    return ucxInterface;
-}
-
-bool getEnvDisaggLayerwise()
-{
-    static bool const disaggLayerwise = getBoolEnv("TRTLLM_DISAGG_LAYERWISE");
-    return disaggLayerwise;
-}
-
-bool getEnvParallelCacheSend()
-{
-    static bool const parallelCacheSend = getBoolEnv("TRTLLM_PARALLEL_CACHE_SEND");
-    return parallelCacheSend;
-}
-
-bool getEnvRequestKVCacheSerial()
-{
-    static bool const requestKVCacheSerial = getBoolEnv("TRTLLM_REQUEST_KV_CACHE_SERIAL");
-    return requestKVCacheSerial;
-}
-
-bool getEnvDisableKVCacheTransferOverlap()
-{
-    static bool const disableKVCacheTransferOverlap = getBoolEnv("TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP");
-    return disableKVCacheTransferOverlap;
-}
-
-bool getEnvDisableReceiveKVCacheParallel()
-{
-    static bool const disableReceiveParallel = getBoolEnv("TRTLLM_DISABLE_KVCACHE_RECEIVE_PARALLEL");
-    return disableReceiveParallel;
-}
-
-} // namespace tensorrt_llm::common
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/envUtils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/envUtils.h
deleted file mode 100644
index 027c7cfbb..000000000
--- a/sgl-kernel/3rdparty/tensorrt_llm/common/envUtils.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-#include <cstdint>
-#include <optional>
-#include <string>
-
-namespace tensorrt_llm::common
-{
-// Useful when you want to inject some debug code controllable with env var.
-std::optional<int32_t> getIntEnv(char const* name);
-
-// XQA kernels (optimized kernels for generation phase).
-bool forceXQAKernels();
-
-// Whether XQA JIT is enabled.
-//
-// Returns the value of TRTLLM_ENABLE_XQA_JIT env var. If such env var doesn't exist, std::nullopt is returned.
-std::optional<bool> getEnvEnableXQAJIT();
-
-// Tune the number of blocks per sequence for accuracy/performance purpose.
-bool getEnvMmhaMultiblockDebug();
-
-int getEnvMmhaBlocksPerSequence();
-
-int getEnvMmhaKernelBlockSize();
-
-// Whether PDL is enabled.
-bool getEnvEnablePDL();
-
-bool getEnvUseUCXKvCache();
-
-std::string getEnvUCXInterface();
-
-bool getEnvDisaggLayerwise();
-
-bool getEnvParallelCacheSend();
-
-bool getEnvRequestKVCacheSerial();
-
-bool getEnvDisableKVCacheTransferOverlap();
-
-bool getEnvDisableReceiveKVCacheParallel();
-
-} // namespace tensorrt_llm::common
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/logger.h b/sgl-kernel/3rdparty/tensorrt_llm/common/logger.h
new file mode 100644
index 000000000..df84e2263
--- /dev/null
+++ b/sgl-kernel/3rdparty/tensorrt_llm/common/logger.h
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdlib>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+#include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/stringUtils.h"
+
+namespace tensorrt_llm::common
+{
+
+class Logger
+{
+
+// On Windows, the file wingdi.h is included which has
+// #define ERROR 0
+// This breaks everywhere ERROR is used in the Level enum
+#ifdef _WIN32
+#undef ERROR
+#endif // _WIN32
+
+public:
+    enum Level
+    {
+        TRACE = 0,
+        DEBUG = 10,
+        INFO = 20,
+        WARNING = 30,
+        ERROR = 40
+    };
+
+    static Logger* getLogger();
+
+    Logger(Logger const&) = delete;
+    void operator=(Logger const&) = delete;
+
+#if defined(_MSC_VER)
+    template <typename... Args>
+    void log(Level level, char const* format, Args const&... args);
+
+    template <typename... Args>
+    void log(Level level, int rank, char const* format, Args const&... args);
+#else
+    template <typename... Args>
+    void log(Level level, char const* format, Args const&... args) __attribute__((format(printf, 3, 0)));
+
+    template <typename... Args>
+    void log(Level level, int rank, char const* format, Args const&... args) __attribute__((format(printf, 4, 0)));
+#endif
+
+    template <typename... Args>
+    void log(Level level, std::string const& format, Args const&... args)
+    {
+        return log(level, format.c_str(), args...);
+    }
+
+    template <typename... Args>
+    void log(Level const level, int const rank, std::string const& format, Args const&... args)
+    {
+        return log(level, rank, format.c_str(), args...);
+    }
+
+    void log(std::exception const& ex, Level level = Level::ERROR);
+
+    Level getLevel() const
+    {
+        return level_;
+    }
+
+    void setLevel(Level const level)
+    {
+        level_ = level;
+        log(INFO, "Set logger level to %s", getLevelName(level));
+    }
+
+    bool isEnabled(Level const level) const
+    {
+        return level_ <= level;
+    }
+
+private:
+    static auto constexpr kPREFIX = "[TensorRT-LLM]";
+
+#ifndef NDEBUG
+    Level const DEFAULT_LOG_LEVEL = DEBUG;
+#else
+    Level const DEFAULT_LOG_LEVEL = INFO;
+#endif
+    Level level_ = DEFAULT_LOG_LEVEL;
+
+    Logger(); // NOLINT(modernize-use-equals-delete)
+
+    static inline char const* getLevelName(Level const level)
+    {
+        switch (level)
+        {
+        case TRACE: return "TRACE";
+        case DEBUG: return "DEBUG";
+        case INFO: return "INFO";
+        case WARNING: return "WARNING";
+        case ERROR: return "ERROR";
+        }
+
+        TLLM_THROW("Unknown log level: %d", level);
+    }
+
+    static inline std::string getPrefix(Level const level)
+    {
+        return fmtstr("%s[%s] ", kPREFIX, getLevelName(level));
+    }
+
+    static inline std::string getPrefix(Level const level, int const rank)
+    {
+        return fmtstr("%s[%s][%d] ", kPREFIX, getLevelName(level), rank);
+    }
+};
+
+template <typename... Args>
+void Logger::log(Logger::Level level, char const* format, Args const&... args)
+{
+    if (isEnabled(level))
+    {
+        auto const fmt = getPrefix(level) + format;
+        auto& out = level_ < WARNING ? std::cout : std::cerr;
+        if constexpr (sizeof...(args) > 0)
+        {
+            out << fmtstr(fmt.c_str(), args...);
+        }
+        else
+        {
+            out << fmt;
+        }
+        out << std::endl;
+    }
+}
+
+template <typename... Args>
+void Logger::log(Logger::Level const level, int const rank, char const* format, Args const&... args)
+{
+    if (isEnabled(level))
+    {
+        auto const fmt = getPrefix(level, rank) + format;
+        auto& out = level_ < WARNING ? std::cout : std::cerr;
+        if constexpr (sizeof...(args) > 0)
+        {
+            out << fmtstr(fmt.c_str(), args...);
+        }
+        else
+        {
+            out << fmt;
+        }
+        out << std::endl;
+    }
+}
+
+#define TLLM_LOG(level, ...)                                                                                           \
+    do                                                                                                                 \
+    {                                                                                                                  \
+        auto* const logger = tensorrt_llm::common::Logger::getLogger();                                                \
+        if (logger->isEnabled(level))                                                                                  \
+        {                                                                                                              \
+            logger->log(level, __VA_ARGS__);                                                                           \
+        }                                                                                                              \
+    } while (0)
+
+#define TLLM_LOG_TRACE(...) TLLM_LOG(tensorrt_llm::common::Logger::TRACE, __VA_ARGS__)
+#define TLLM_LOG_DEBUG(...) TLLM_LOG(tensorrt_llm::common::Logger::DEBUG, __VA_ARGS__)
+#define TLLM_LOG_INFO(...) TLLM_LOG(tensorrt_llm::common::Logger::INFO, __VA_ARGS__)
+#define TLLM_LOG_WARNING(...) TLLM_LOG(tensorrt_llm::common::Logger::WARNING, __VA_ARGS__)
+#define TLLM_LOG_ERROR(...) TLLM_LOG(tensorrt_llm::common::Logger::ERROR, __VA_ARGS__)
+#define TLLM_LOG_EXCEPTION(ex, ...) tensorrt_llm::common::Logger::getLogger()->log(ex, ##__VA_ARGS__)
+} // namespace tensorrt_llm::common
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/mathUtils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/mathUtils.h
deleted file mode 100644
index 1bad3a2c1..000000000
--- a/sgl-kernel/3rdparty/tensorrt_llm/common/mathUtils.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cuda_runtime.h>
-
-namespace tensorrt_llm
-{
-namespace common
-{
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename T>
-inline __device__ __host__ T divUp(T m, T n)
-{
-    return (m + n - 1) / n;
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace common
-} // namespace tensorrt_llm
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/memoryUtils.cu b/sgl-kernel/3rdparty/tensorrt_llm/common/memoryUtils.cu
deleted file mode 100644
index d13217b20..000000000
--- a/sgl-kernel/3rdparty/tensorrt_llm/common/memoryUtils.cu
+++ /dev/null
@@ -1,906 +0,0 @@
-/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "tensorrt_llm/common/assert.h"
-#include "tensorrt_llm/common/cudaTypeUtils.cuh"
-#include "tensorrt_llm/common/logger.h"
-#include "tensorrt_llm/common/memoryUtils.h"
-
-#include <curand_kernel.h>
-#include <sys/stat.h>
-#include <unordered_map>
-
-namespace tensorrt_llm
-{
-namespace common
-{
-
-template <typename T>
-void deviceMalloc(T** ptr, size_t size, bool is_random_initialize)
-{
-    check_cuda_error(cudaMalloc((void**) (ptr), sizeof(T) * size));
-    if (is_random_initialize)
-    {
-        cudaRandomUniform(*ptr, size);
-    }
-}
-
-template void deviceMalloc(float** ptr, size_t size, bool is_random_initialize);
-template void deviceMalloc(half** ptr, size_t size, bool is_random_initialize);
-#ifdef ENABLE_BF16
-template void deviceMalloc(__nv_bfloat16** ptr, size_t size, bool is_random_initialize);
-#endif
-template void deviceMalloc(uint16_t** ptr, size_t size, bool is_random_initialize);
-template void deviceMalloc(int** ptr, size_t size, bool is_random_initialize);
-template void deviceMalloc(bool** ptr, size_t size, bool is_random_initialize);
-template void deviceMalloc(char** ptr, size_t size, bool is_random_initialize);
-template void deviceMalloc(int8_t** ptr, size_t size, bool is_random_initialize);
-#ifdef ENABLE_FP8
-template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, bool is_random_initialize);
-#endif
-
-template <typename T>
-void deviceMemSetZero(T* ptr, size_t size)
-{
-    check_cuda_error(cudaMemset(static_cast<void*>(ptr), 0, sizeof(T) * size));
-}
-
-template void deviceMemSetZero(float* ptr, size_t size);
-template void deviceMemSetZero(half* ptr, size_t size);
-template void deviceMemSetZero(int* ptr, size_t size);
-template void deviceMemSetZero(uint32_t* ptr, size_t size);
-template void deviceMemSetZero(bool* ptr, size_t size);
-#ifdef ENABLE_FP8
-template void deviceMemSetZero(__nv_fp8_e4m3* ptr, size_t size);
-#endif
-#ifdef ENABLE_BF16
-template void deviceMemSetZero(__nv_bfloat16* ptr, size_t size);
-#endif
-
-template <typename T>
-void deviceFree(T*& ptr)
-{
-    if (ptr != NULL)
-    {
-        check_cuda_error(cudaFree(ptr));
-        ptr = NULL;
-    }
-}
-
-template void deviceFree(float*& ptr);
-template void deviceFree(half*& ptr);
-#ifdef ENABLE_BF16
-template void deviceFree(__nv_bfloat16*& ptr);
-#endif
-template void deviceFree(unsigned short*& ptr);
-template void deviceFree(int*& ptr);
-template void deviceFree(bool*& ptr);
-template void deviceFree(char*& ptr);
-template void deviceFree(int8_t*& ptr);
-#ifdef ENABLE_FP8
-template void deviceFree(__nv_fp8_e4m3*& ptr);
-#endif
-
-template <typename T>
-void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream)
-{
-    T* arr = new T[size];
-    std::fill(arr, arr + size, value);
-    check_cuda_error(cudaMemcpyAsync(devptr, arr, sizeof(T) * size, cudaMemcpyHostToDevice, stream));
-    delete[] arr;
-}
-
-template void deviceFill(float* devptr, size_t size, float value, cudaStream_t stream);
-template void deviceFill(half* devptr, size_t size, half value, cudaStream_t stream);
-#ifdef ENABLE_BF16
-template void deviceFill(__nv_bfloat16* devptr, size_t size, __nv_bfloat16 value, cudaStream_t stream);
-#endif
-template void deviceFill(int* devptr, size_t size, int value, cudaStream_t stream);
-template void deviceFill(bool* devptr, size_t size, bool value, cudaStream_t stream);
-
-template <typename T>
-void cudaD2Hcpy(T* tgt, T const* src, const size_t size)
-{
-    check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDeviceToHost));
-}
-
-template void cudaD2Hcpy(float* tgt, float const* src, size_t size);
-template void cudaD2Hcpy(half* tgt, half const* src, size_t size);
-#ifdef ENABLE_BF16
-template void cudaD2Hcpy(__nv_bfloat16* tgt, __nv_bfloat16 const* src, size_t size);
-#endif
-template void cudaD2Hcpy(int* tgt, int const* src, size_t size);
-template void cudaD2Hcpy(bool* tgt, bool const* src, size_t size);
-#ifdef ENABLE_FP8
-template void cudaD2Hcpy(__nv_fp8_e4m3* tgt, __nv_fp8_e4m3 const* src, size_t size);
-#endif
-template void cudaD2Hcpy(unsigned long long* tgt, unsigned long long const* src, size_t size);
-template void cudaD2Hcpy(unsigned int* tgt, unsigned int const* src, size_t size);
-template void cudaD2Hcpy(int8_t* tgt, int8_t const* src, size_t size);
-
-template <typename T>
-void cudaH2Dcpy(T* tgt, T const* src, const size_t size)
-{
-    check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyHostToDevice));
-}
-
-template void cudaH2Dcpy(float* tgt, float const* src, size_t size);
-template void cudaH2Dcpy(half* tgt, half const* src, size_t size);
-#ifdef ENABLE_BF16
-template void cudaH2Dcpy(__nv_bfloat16* tgt, __nv_bfloat16 const* src, size_t size);
-#endif
-template void cudaH2Dcpy(int* tgt, int const* src, size_t size);
-template void cudaH2Dcpy(bool* tgt, bool const* src, size_t size);
-#ifdef ENABLE_FP8
-template void cudaH2Dcpy(__nv_fp8_e4m3* tgt, __nv_fp8_e4m3 const* src, size_t size);
-#endif
-template void cudaH2Dcpy(unsigned long long* tgt, unsigned long long const* src, size_t size);
-template void cudaH2Dcpy(unsigned int* tgt, unsigned int const* src, size_t size);
-template void cudaH2Dcpy(int8_t* tgt, int8_t const* src, size_t size);
-
-template <typename T>
-void cudaD2Dcpy(T* tgt, T const* src, const size_t size, cudaStream_t stream)
-{
-    check_cuda_error(cudaMemcpyAsync(tgt, src, sizeof(T) * size, cudaMemcpyDeviceToDevice, stream));
-}
-
-template void cudaD2Dcpy(float* tgt, float const* src, size_t size, cudaStream_t stream);
-template void cudaD2Dcpy(half* tgt, half const* src, size_t size, cudaStream_t stream);
-#ifdef ENABLE_BF16
-template void cudaD2Dcpy(__nv_bfloat16* tgt, __nv_bfloat16 const* src, size_t size, cudaStream_t stream);
-#endif
-template void cudaD2Dcpy(int* tgt, int const* src, size_t size, cudaStream_t stream);
-template void cudaD2Dcpy(bool* tgt, bool const* src, size_t size, cudaStream_t stream);
-template void cudaD2Dcpy(int8_t* tgt, int8_t const* src, size_t size, cudaStream_t stream);
-#ifdef ENABLE_FP8
-template void cudaD2Dcpy(__nv_fp8_e4m3* tgt, __nv_fp8_e4m3 const* src, size_t size, cudaStream_t stream);
-#endif
-template void cudaD2Dcpy(unsigned long long* tgt, unsigned long long const* src, size_t size, cudaStream_t stream);
-
-template <typename T_OUT, typename T_IN>
-__global__ void cudaCast(T_OUT* dst, T_IN* src, const size_t size)
-{
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
-    {
-        dst[tid] = (T_OUT) ((float) (src[tid]));
-    }
-}
-
-template <typename T_OUT, typename T_IN>
-void invokeCudaCast(T_OUT* dst, T_IN const* const src, const size_t size, cudaStream_t stream)
-{
-    cudaCast<<<256, 256, 0, stream>>>(dst, src, size);
-}
-
-template void invokeCudaCast(float* dst, half const* const src, const size_t size, cudaStream_t stream);
-#ifdef ENABLE_BF16
-template void invokeCudaCast(float* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream);
-template void invokeCudaCast(__nv_bfloat16* dst, float const* const src, const size_t size, cudaStream_t stream);
-template void invokeCudaCast(__nv_bfloat16* dst, half const* const src, const size_t size, cudaStream_t stream);
-template void invokeCudaCast(half* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream);
-#endif
-#ifdef ENABLE_FP8
-template void invokeCudaCast(float* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream);
-template void invokeCudaCast(
-    __nv_bfloat16* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream);
-template void invokeCudaCast(half* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream);
-template void invokeCudaCast(__nv_fp8_e4m3* dst, float const* const src, const size_t size, cudaStream_t stream);
-template void invokeCudaCast(
-    __nv_fp8_e4m3* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream);
-template void invokeCudaCast(__nv_fp8_e4m3* dst, half const* const src, const size_t size, cudaStream_t stream);
-#endif
-
-template <typename T>
-void cudaAutoCpy(T* tgt, T const* src, const size_t size, cudaStream_t stream)
-{
-    if (stream != NULL)
-    {
-        check_cuda_error(cudaMemcpyAsync(tgt, src, sizeof(T) * size, cudaMemcpyDefault, stream));
-    }
-    else
-    {
-        check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDefault));
-    }
-}
-
-template void cudaAutoCpy(float* tgt, float const* src, size_t size, cudaStream_t stream);
-template void cudaAutoCpy(half* tgt, half const* src, size_t size, cudaStream_t stream);
-#ifdef ENABLE_BF16
-template void cudaAutoCpy(__nv_bfloat16* tgt, __nv_bfloat16 const* src, size_t size, cudaStream_t stream);
-#endif
-template void cudaAutoCpy(int* tgt, int const* src, size_t size, cudaStream_t stream);
-template void cudaAutoCpy(bool* tgt, bool const* src, size_t size, cudaStream_t stream);
-template void cudaAutoCpy(int8_t* tgt, int8_t const* src, size_t size, cudaStream_t stream);
-template void cudaAutoCpy(uint8_t* tgt, uint8_t const* src, size_t size, cudaStream_t stream);
-template void cudaAutoCpy(uint32_t* tgt, uint32_t const* src, size_t size, cudaStream_t stream);
-template void cudaAutoCpy(unsigned long long* tgt, unsigned long long const* src, size_t size, cudaStream_t stream);
-template void cudaAutoCpy(unsigned long* tgt, unsigned long const* src, size_t size, cudaStream_t stream);
-template void cudaAutoCpy(char* tgt, char const* src, size_t size, cudaStream_t stream);
-
-template void cudaAutoCpy(float const** tgt, float const* const* src, size_t size, cudaStream_t stream);
-template void cudaAutoCpy(half const** tgt, half const* const* src, size_t size, cudaStream_t stream);
-#ifdef ENABLE_BF16
-template void cudaAutoCpy(__nv_bfloat16 const** tgt, __nv_bfloat16 const* const* src, size_t size, cudaStream_t stream);
-#endif
-template void cudaAutoCpy(int const** tgt, int const* const* src, size_t size, cudaStream_t stream);
-template void cudaAutoCpy(bool const** tgt, bool const* const* src, size_t size, cudaStream_t stream);
-template void cudaAutoCpy(int8_t const** tgt, int8_t const* const* src, size_t size, cudaStream_t stream);
-template void cudaAutoCpy(
-    unsigned long long const** tgt, unsigned long long const* const* src, size_t size, cudaStream_t stream);
-
-template <typename T>
-__global__ void cuda_random_uniform_kernel(T* buffer, const size_t size, int const seq_offset)
-{
-    const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-    curandState_t local_state;
-    curand_init((unsigned long long int) 1337, idx + seq_offset, 0, &local_state);
-    for (size_t index = idx; index < size; index += blockDim.x * gridDim.x)
-    {
-        buffer[index] = (T) (curand_uniform(&local_state) * 0.2f - 0.1f);
-    }
-}
-
-template <>
-__global__ void cuda_random_uniform_kernel<int>(int* buffer, const size_t size, int const seq_offset)
-{
-    const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-    curandState_t local_state;
-    curand_init((float) 1337.f, idx + seq_offset, 0, &local_state);
-    for (size_t index = idx; index < size; index += blockDim.x * gridDim.x)
-    {
-        buffer[index] = curand(&local_state);
-    }
-}
-
-template <>
-__global__ void cuda_random_uniform_kernel<bool>(bool* buffer, const size_t size, int const seq_offset)
-{
-    const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-    curandState_t local_state;
-    curand_init((float) 1337.f, idx + seq_offset, 0, &local_state);
-    for (size_t index = idx; index < size; index += blockDim.x * gridDim.x)
-    {
-        buffer[index] = (curand(&local_state) % 2 == 0);
-    }
-}
-
-template <>
-__global__ void cuda_random_uniform_kernel<char>(char* buffer, const size_t size, int const seq_offset)
-{
-    const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-    curandState_t local_state;
-    curand_init((float) 1337.f, idx + seq_offset, 0, &local_state);
-    for (size_t index = idx; index < size; index += blockDim.x * gridDim.x)
-    {
-        buffer[index] = curand(&local_state) % 0xFF;
-    }
-}
-
-template <typename T>
-void cudaRandomUniform(T* buffer, const size_t size)
-{
-    static int seq_offset = 0;
-    cuda_random_uniform_kernel<T><<<256, 256>>>(buffer, size, seq_offset);
-    seq_offset += 256 * 256;
-}
-
-template void cudaRandomUniform(float* buffer, const size_t size);
-template void cudaRandomUniform(half* buffer, const size_t size);
-#ifdef ENABLE_BF16
-template void cudaRandomUniform(__nv_bfloat16* buffer, const size_t size);
-#endif
-template void cudaRandomUniform(int* buffer, const size_t size);
-template void cudaRandomUniform(bool* buffer, const size_t size);
-template void cudaRandomUniform(char* buffer, const size_t size);
-#ifdef ENABLE_FP8
-template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size);
-#endif
-
-// loads data from binary file. If it succeeds, returns a non-empty vector. If loading fails or
-// the product of the elements in shape is 0, this function will return an empty vector.
-template <typename T>
-std::vector<T> loadWeightFromBinHelper(std::vector<size_t> shape, std::string filename)
-{
-    if (shape.size() > 2)
-    {
-        printf("[ERROR] shape should have less than two dims \n");
-        return std::vector<T>();
-    }
-    size_t dim0 = shape[0], dim1 = 1;
-    if (shape.size() == 2)
-    {
-        dim1 = shape[1];
-    }
-    size_t size = dim0 * dim1;
-    if (size == 0)
-    {
-        TLLM_LOG_WARNING("shape is zero, skip loading weight from file %s \n", filename.c_str());
-        return std::vector<T>();
-    }
-
-    std::vector<T> host_array(size);
-    std::ifstream in(filename, std::ios::in | std::ios::binary);
-    if (!in.is_open())
-    {
-        TLLM_LOG_WARNING("file %s cannot be opened, loading model fails! \n", filename.c_str());
-        return std::vector<T>();
-    }
-
-    size_t loaded_data_size = sizeof(T) * size;
-    in.seekg(0, in.end);
-    in.seekg(0, in.beg);
-
-    TLLM_LOG_DEBUG("Read " + std::to_string(loaded_data_size) + " bytes from " + filename);
-    in.read((char*) host_array.data(), loaded_data_size);
-
-    size_t in_get_size = in.gcount();
-    if (in_get_size != loaded_data_size)
-    {
-        TLLM_LOG_WARNING("file %s only has %ld, but request %ld, loading model fails! \n", filename.c_str(),
-            in_get_size, loaded_data_size);
-        return std::vector<T>();
-    }
-    in.close();
-    // If we succeed, return an array with values.
-    return host_array;
-}
-
-template <typename T, typename T_IN>
-int loadWeightFromBinFunc(T* ptr, std::vector<size_t> shape, std::string filename)
-{
-    std::vector<T_IN> host_array = loadWeightFromBinHelper<T_IN>(shape, filename);
-
-    if (host_array.empty())
-    {
-        return 0;
-    }
-
-    if (std::is_same<T, T_IN>::value == true)
-    {
-        cudaH2Dcpy(ptr, (T*) host_array.data(), host_array.size());
-    }
-    else
-    {
-        T_IN* ptr_2 = nullptr;
-        deviceMalloc(&ptr_2, host_array.size(), false);
-        cudaH2Dcpy(ptr_2, host_array.data(), host_array.size());
-        invokeCudaD2DcpyConvert(ptr, ptr_2, host_array.size());
-        deviceFree(ptr_2);
-    }
-    return 0;
-}
-
-template int loadWeightFromBinFunc<float, float>(float* ptr, std::vector<size_t> shape, std::string filename);
-template int loadWeightFromBinFunc<half, float>(half* ptr, std::vector<size_t> shape, std::string filename);
-template int loadWeightFromBinFunc<float, half>(float* ptr, std::vector<size_t> shape, std::string filename);
-template int loadWeightFromBinFunc<half, half>(half* ptr, std::vector<size_t> shape, std::string filename);
-template int loadWeightFromBinFunc<int8_t, int8_t>(int8_t* ptr, std::vector<size_t> shape, std::string filename);
-#ifdef ENABLE_BF16
-template int loadWeightFromBinFunc<__nv_bfloat16, float>(
-    __nv_bfloat16* ptr, std::vector<size_t> shape, std::string filename);
-template int loadWeightFromBinFunc<__nv_bfloat16, half>(
-    __nv_bfloat16* ptr, std::vector<size_t> shape, std::string filename);
-template int loadWeightFromBinFunc<float, __nv_bfloat16>(float* ptr, std::vector<size_t> shape, std::string filename);
-template int loadWeightFromBinFunc<half, __nv_bfloat16>(half* ptr, std::vector<size_t> shape, std::string filename);
-template int loadWeightFromBinFunc<__nv_bfloat16, __nv_bfloat16>(
-    __nv_bfloat16* ptr, std::vector<size_t> shape, std::string filename);
-#endif // ENABLE_BF16
-template int loadWeightFromBinFunc<int, int>(int* ptr, std::vector<size_t> shape, std::string filename);
-#ifdef ENABLE_FP8
-template int loadWeightFromBinFunc<__nv_fp8_e4m3, float>(
-    __nv_fp8_e4m3* ptr, std::vector<size_t> shape, std::string filename);
-#endif // ENABLE_FP8
-
-template <typename T>
-int loadWeightFromBin(T* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type)
-{
-    switch (model_file_type)
-    {
-    case TRTLLMCudaDataType::FP32: loadWeightFromBinFunc<T, float>(ptr, shape, filename); break;
-    case TRTLLMCudaDataType::FP16: loadWeightFromBinFunc<T, half>(ptr, shape, filename); break;
-    case TRTLLMCudaDataType::INT8: loadWeightFromBinFunc<T, int8_t>(ptr, shape, filename); break;
-#ifdef ENABLE_BF16
-    case TRTLLMCudaDataType::BF16: loadWeightFromBinFunc<T, __nv_bfloat16>(ptr, shape, filename); break;
-#endif
-#ifdef ENABLE_FP8
-    case TRTLLMCudaDataType::FP8: loadWeightFromBinFunc<T, float>(ptr, shape, filename); break;
-#endif
-    default: TLLM_LOG_ERROR("Does not support TRTLLMCudaDataType=%d", model_file_type); TLLM_CHECK(false);
-    }
-    return 0;
-}
-
-template <>
-int loadWeightFromBin(int* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type)
-{
-    loadWeightFromBinFunc<int, int>(ptr, shape, filename);
-    return 0;
-}
-
-template int loadWeightFromBin(
-    float* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
-template int loadWeightFromBin(
-    half* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
-template int loadWeightFromBin(
-    int8_t* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
-#ifdef ENABLE_BF16
-template int loadWeightFromBin(
-    __nv_bfloat16* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
-#endif
-#ifdef ENABLE_FP8
-template int loadWeightFromBin(
-    __nv_fp8_e4m3* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
-#endif
-template int loadWeightFromBin(
-    int* ptr, std::vector<size_t> shape, std::string filename, TRTLLMCudaDataType model_file_type);
-
-template <typename T_IN, typename T_OUT>
-__global__ void cudaD2DcpyConvert(T_OUT* dst, const T_IN* src, const size_t size)
-{
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
-    {
-        dst[tid] = cuda_cast<T_OUT>(src[tid]);
-    }
-}
-
-template <typename T_IN, typename T_OUT>
-void invokeCudaD2DcpyConvert(T_OUT* tgt, const T_IN* src, const size_t size, cudaStream_t stream)
-{
-    cudaD2DcpyConvert<<<256, 256, 0, stream>>>(tgt, src, size);
-}
-
-template void invokeCudaD2DcpyConvert(int8_t* tgt, float const* src, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DcpyConvert(float* tgt, int8_t const* src, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DcpyConvert(float* tgt, int const* src, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DcpyConvert(half* tgt, int const* src, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DcpyConvert(float* tgt, float const* src, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DcpyConvert(half* tgt, float const* src, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DcpyConvert(float* tgt, half const* src, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DcpyConvert(uint32_t* tgt, int const* src, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DcpyConvert(int* tgt, uint32_t const* src, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DcpyConvert(int* tgt, float const* src, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DcpyConvert(int* tgt, half const* src, const size_t size, cudaStream_t stream);
-
-#ifdef ENABLE_BF16
-template void invokeCudaD2DcpyConvert(__nv_bfloat16* tgt, float const* src, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DcpyConvert(__nv_bfloat16* tgt, int const* src, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DcpyConvert(float* tgt, __nv_bfloat16 const* src, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DcpyConvert(int* tgt, __nv_bfloat16 const* src, const size_t size, cudaStream_t stream);
-#endif // ENABLE_BF16
-
-template <typename T_IN, typename T_OUT>
-__global__ void cudaD2DScaleCpyConvert(
-    T_OUT* dst, const T_IN* src, float const* scale, bool invert_scale, const size_t size)
-{
-    float const scale_value = invert_scale ? 1.0f / scale[0] : scale[0];
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
-    {
-        dst[tid] = cuda_cast<T_OUT>(cuda_cast<float>(src[tid]) * scale_value);
-    }
-}
-
-template <typename T_IN, typename T_OUT>
-void invokeCudaD2DScaleCpyConvert(
-    T_OUT* tgt, const T_IN* src, float const* scale, bool invert_scale, const size_t size, cudaStream_t stream)
-{
-    cudaD2DScaleCpyConvert<<<256, 256, 0, stream>>>(tgt, src, scale, invert_scale, size);
-}
-
-// clang-format off
-template void invokeCudaD2DScaleCpyConvert(float* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const float* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DScaleCpyConvert(half* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const half* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
-#ifdef ENABLE_BF16
-template void invokeCudaD2DScaleCpyConvert(__nv_bfloat16* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const __nv_bfloat16* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
-#endif  // ENABLE_BF16
-#ifdef ENABLE_FP8
-template void invokeCudaD2DScaleCpyConvert(float* tgt, const __nv_fp8_e4m3* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
-#endif  // ENABLE_FP8
-// clang-format on
-
-void invokeCudaD2DcpyHalf2Float(float* dst, half* src, const size_t size, cudaStream_t stream)
-{
-    invokeCudaD2DcpyConvert(dst, src, size, stream);
-}
-
-void invokeCudaD2DcpyFloat2Half(half* dst, float* src, const size_t size, cudaStream_t stream)
-{
-    invokeCudaD2DcpyConvert(dst, src, size, stream);
-}
-
-template <typename T>
-void saveToBinary(T const* ptr, const size_t size, std::string filename)
-{
-
-    std::vector<T> h_ptr(size);
-    cudaD2Hcpy(h_ptr.data(), ptr, size);
-    std::vector<float> float_ptr(size);
-    for (size_t i = 0; i < size; i++)
-    {
-        float_ptr[i] = (float) h_ptr[i];
-    }
-
-    std::ofstream out(filename, std::ios::out | std::ios::binary);
-    TLLM_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename);
-
-    out.write((char*) float_ptr.data(), size * sizeof(float));
-}
-
-template void saveToBinary(float const* ptr, const size_t size, std::string filename);
-template void saveToBinary(half const* ptr, const size_t size, std::string filename);
-#ifdef ENABLE_BF16
-template void saveToBinary(__nv_bfloat16 const* ptr, const size_t size, std::string filename);
-#endif // ENABLE_BF16
-
-template <>
-void saveToBinary(int const* ptr, const size_t size, std::string filename)
-{
-    std::vector<int> h_ptr(size);
-    cudaD2Hcpy(h_ptr.data(), ptr, size);
-    std::ofstream out(filename, std::ios::out | std::ios::binary);
-    TLLM_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename);
-    out.write((char*) h_ptr.data(), size * sizeof(int));
-}
-
-template <typename T_IN, typename T_fake_type>
-__global__ void fakeCast(T_IN* input_ptr, const size_t size)
-{
-    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x)
-    {
-        T_fake_type tmp_val = (T_fake_type) ((float) input_ptr[i]);
-        input_ptr[i] = (T_IN) ((float) tmp_val);
-    }
-}
-
-template <typename T_IN, typename T_fake_type>
-void invokeFakeCast(T_IN* input_ptr, const size_t size, cudaStream_t stream)
-{
-    dim3 block(256);
-    dim3 grid((size + 255) / 256);
-    fakeCast<T_IN, T_fake_type><<<grid, block, 0, stream>>>(input_ptr, size);
-}
-
-#ifdef ENABLE_FP8
-__global__ void cudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size)
-{
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
-    {
-        dst[tid] = (float) (src[tid]);
-    }
-}
-
-void invokeCudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream)
-{
-    cudaD2Dcpyfp82Float<<<256, 256, 0, stream>>>(dst, src, size);
-}
-
-__global__ void cudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size)
-{
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
-    {
-        dst[tid] = (half) ((float) (src[tid]));
-    }
-}
-
-void invokeCudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream)
-{
-    cudaD2Dcpyfp82Half<<<256, 256, 0, stream>>>(dst, src, size);
-}
-
-__global__ void cudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size)
-{
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
-    {
-        dst[tid] = (__nv_fp8_e4m3) src[tid];
-    }
-}
-
-void invokeCudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size, cudaStream_t stream)
-{
-    cudaD2DcpyFloat2fp8<<<256, 256, 0, stream>>>(dst, src, size);
-}
-
-__global__ void cudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size)
-{
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
-    {
-        dst[tid] = (__nv_fp8_e4m3) src[tid];
-    }
-}
-
-void invokeCudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size, cudaStream_t stream)
-{
-    cudaD2DcpyHalf2fp8<<<256, 256, 0, stream>>>(dst, src, size);
-}
-
-__global__ void cudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size)
-{
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x)
-    {
-        dst[tid] = (__nv_fp8_e4m3) src[tid];
-    }
-}
-
-void invokeCudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size, cudaStream_t stream)
-{
-    cudaD2DcpyBfloat2fp8<<<256, 256, 0, stream>>>(dst, src, size);
-}
-
-#endif // ENABLE_FP8
-
-template <typename T_OUT, typename T_IN>
-__global__ void transpose(T_OUT* dst, T_IN* src, const size_t dim0, const size_t dim1)
-{
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1; tid += blockDim.x * gridDim.x)
-    {
-        const size_t src_col_id = tid % dim1;
-        const size_t src_row_id = tid / dim1;
-        dst[src_col_id * dim0 + src_row_id] = (T_OUT) (src[tid]);
-    }
-}
-
-template <typename T>
-void invokeInPlaceTranspose(T* data, T* workspace, const size_t dim0, const size_t dim1)
-{
-    // copy data to workspace, and then transpose from workspace to data
-    cudaD2Dcpy(workspace, data, dim0 * dim1);
-    transpose<<<256, 256>>>(data, workspace, dim0, dim1);
-}
-
-#ifdef ENABLE_FP8
-template void invokeInPlaceTranspose(
-    __nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const size_t dim0, const size_t dim1);
-#endif // ENABLE_FP8
-#ifdef ENABLE_BF16
-template void invokeInPlaceTranspose(
-    __nv_bfloat16* data, __nv_bfloat16* workspace, const size_t dim0, const size_t dim1);
-#endif // ENABLE_BF16
-template void invokeInPlaceTranspose(float* data, float* workspace, const size_t dim0, const size_t dim1);
-
-template <typename T_OUT, typename T_IN>
-__global__ void transpose0213(
-    T_OUT* dst, T_IN* src, const size_t dim0, const size_t dim1, const size_t dim2, const size_t dim3)
-{
-    // src permutation: [0, 1, 2, 3]
-    // dst permutation: [0, 2, 1, 3]
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2 * dim3;
-         tid += blockDim.x * gridDim.x)
-    {
-        size_t tmp_idx = tid;
-        const size_t dim_3_idx = tmp_idx % dim3;
-        tmp_idx = (tmp_idx - dim_3_idx) / dim3;
-        const size_t dim_2_idx = tmp_idx % dim2;
-        tmp_idx = (tmp_idx - dim_2_idx) / dim2;
-        const size_t dim_1_idx = tmp_idx % dim1;
-        tmp_idx = (tmp_idx - dim_1_idx) / dim1;
-        const size_t dim_0_idx = tmp_idx % dim0;
-        dst[dim_0_idx * dim1 * dim2 * dim3 + dim_2_idx * dim1 * dim3 + dim_1_idx * dim3 + dim_3_idx] = src[tid];
-    }
-}
-
-template <typename T>
-void invokeInPlaceTranspose0213(
-    T* data, T* workspace, const size_t dim0, const size_t dim1, const size_t dim2, const size_t dim3)
-{
-    // copy data to workspace, and then transpose from workspace to data
-    // Note that this kernel is used for pre-processing and not very efficient.
-    cudaD2Dcpy(workspace, data, dim0 * dim1 * dim2 * dim3);
-    transpose0213<<<256, 256>>>(data, workspace, dim0, dim1, dim2, dim3);
-}
-
-#ifdef ENABLE_FP8
-template void invokeInPlaceTranspose0213(__nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const size_t dim0,
-    const size_t dim1, const size_t dim2, const size_t dim3);
-#endif // ENABLE_FP8
-#ifdef ENABLE_BF16
-template void invokeInPlaceTranspose0213(__nv_bfloat16* data, __nv_bfloat16* workspace, const size_t dim0,
-    const size_t dim1, const size_t dim2, const size_t dim3);
-#endif // ENABLE_BF16
-template void invokeInPlaceTranspose0213(
-    float* data, float* workspace, const size_t dim0, const size_t dim1, const size_t dim2, const size_t dim3);
-
-template <typename T_OUT, typename T_IN>
-__global__ void transpose102(T_OUT* dst, T_IN* src, const size_t dim0, const size_t dim1, const size_t dim2)
-{
-    // src permutation: [0, 1, 2]
-    // dst permutation: [1, 0, 2]
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2; tid += blockDim.x * gridDim.x)
-    {
-        size_t tmp_idx = tid;
-        const size_t dim_2_idx = tmp_idx % dim2;
-        tmp_idx = (tmp_idx - dim_2_idx) / dim2;
-        const size_t dim_1_idx = tmp_idx % dim1;
-        tmp_idx = (tmp_idx - dim_1_idx) / dim1;
-        const size_t dim_0_idx = tmp_idx % dim0;
-        dst[dim_1_idx * dim0 * dim2 + dim_0_idx * dim2 + dim_2_idx] = src[tid];
-    }
-}
-
-template <typename T>
-void invokeInPlaceTranspose102(T* data, T* workspace, const size_t dim0, const size_t dim1, const size_t dim2)
-{
-    // copy data to workspace, and then transpose from workspace to data
-    // Note that this kernel is used for pre-processing and not very efficient.
-    cudaD2Dcpy(workspace, data, dim0 * dim1 * dim2);
-    transpose102<<<256, 256>>>(data, workspace, dim0, dim1, dim2);
-}
-
-#ifdef ENABLE_FP8
-template void invokeInPlaceTranspose102(
-    __nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const size_t dim0, const size_t dim1, const size_t dim2);
-#endif // ENABLE_FP8
-#ifdef ENABLE_BF16
-template void invokeInPlaceTranspose102(
-    __nv_bfloat16* data, __nv_bfloat16* workspace, const size_t dim0, const size_t dim1, const size_t dim2);
-#endif // ENABLE_BF16
-template void invokeInPlaceTranspose102(
-    float* data, float* workspace, const size_t dim0, const size_t dim1, const size_t dim2);
-
-template <typename T>
-void __global__ multiplyScale(T* tensor, float scale, const size_t size)
-{
-    for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x)
-    {
-        tensor[index] = (T) (((float) tensor[index]) * scale);
-    }
-}
-
-template <typename T>
-void invokeMultiplyScale(T* tensor, float scale, const size_t size, cudaStream_t stream)
-{
-    int block = 256;
-    int grid = (size + 255) / 256;
-    multiplyScale<<<grid, block, 0, stream>>>(tensor, scale, size);
-}
-
-template void invokeMultiplyScale(float* tensor, float scale, const size_t size, cudaStream_t stream);
-template void invokeMultiplyScale(half* tensor, float scale, const size_t size, cudaStream_t stream);
-#ifdef ENABLE_BF16
-template void invokeMultiplyScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream);
-#endif
-#ifdef ENABLE_FP8
-template void invokeMultiplyScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream);
-#endif
-
-template <typename T>
-void __global__ divideScale(T* tensor, float scale, const size_t size)
-{
-    for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x)
-    {
-        tensor[index] = (T) (((float) tensor[index]) / scale);
-    }
-}
-
-template <typename T>
-void invokeDivideScale(T* tensor, float scale, const size_t size, cudaStream_t stream)
-{
-    int block = 256;
-    int grid = (size + 255) / 256;
-    divideScale<<<grid, block, 0, stream>>>(tensor, scale, size);
-}
-
-template void invokeDivideScale(float* tensor, float scale, const size_t size, cudaStream_t stream);
-template void invokeDivideScale(half* tensor, float scale, const size_t size, cudaStream_t stream);
-#ifdef ENABLE_BF16
-template void invokeDivideScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream);
-#endif
-#ifdef ENABLE_FP8
-template void invokeDivideScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream);
-#endif
-#ifdef ENABLE_BF16
-template void invokeFakeCast<float, __nv_bfloat16>(float* input_ptr, const size_t size, cudaStream_t stream);
-template void invokeFakeCast<__nv_bfloat16, __nv_bfloat16>(
-    __nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream);
-template void invokeFakeCast<half, __nv_bfloat16>(half* input_ptr, const size_t size, cudaStream_t stream);
-#endif
-template void invokeFakeCast<float, half>(float* input_ptr, const size_t size, cudaStream_t stream);
-template void invokeFakeCast<float, float>(float* input_ptr, const size_t size, cudaStream_t stream);
-#ifdef ENABLE_FP8
-template void invokeFakeCast<float, __nv_fp8_e4m3>(float* input_ptr, const size_t size, cudaStream_t stream);
-template void invokeFakeCast<half, __nv_fp8_e4m3>(half* input_ptr, const size_t size, cudaStream_t stream);
-template void invokeFakeCast<__nv_bfloat16, __nv_fp8_e4m3>(
-    __nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream);
-#endif
-
-size_t cuda_datatype_size(TRTLLMCudaDataType dt)
-{
-    static const std::unordered_map<TRTLLMCudaDataType, size_t> sizes{
-        {TRTLLMCudaDataType::FP32, sizeof(float)}, {TRTLLMCudaDataType::FP16, sizeof(half)}
-#ifdef ENABLE_BF16
-        ,
-        {TRTLLMCudaDataType::BF16, sizeof(__nv_bfloat16)}
-#endif
-    };
-
-    return sizes.at(dt);
-}
-
-template <typename T>
-__global__ void check_range(T const* buffer, size_t size, T min, T max, bool* d_within_range)
-{
-    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x)
-    {
-        const T val = buffer[i];
-        if (val < min || val > max)
-        {
-            *d_within_range = false;
-        }
-    }
-}
-
-template <typename T>
-bool invokeCheckRange(T const* buffer, const size_t size, T min, T max, bool* d_within_range, cudaStream_t stream)
-{
-    cudaMemsetAsync(d_within_range, true, sizeof(bool), stream);
-
-    dim3 block(256);
-    dim3 grid((size + 255) / 256);
-    check_range<T><<<grid, block, 0, stream>>>(buffer, size, min, max, d_within_range);
-
-    bool result;
-    cudaD2Hcpy(&result, d_within_range, 1);
-    return result;
-}
-
-template bool invokeCheckRange<int>(
-    int const* buffer, const size_t size, int min, int max, bool* d_within_range, cudaStream_t stream);
-
-/*
- *  Determine the total workspace size based on a vector containing multiple variable sizes.
- */
-size_t calcAlignedSize(std::vector<size_t> const& sizes, const size_t ALIGN_BYTES)
-{
-    const size_t ALIGN_MASK = ~(ALIGN_BYTES - 1);
-    // Check ALIGN_BYTES is a power of 2
-    assert((ALIGN_BYTES & (ALIGN_BYTES - 1)) == 0);
-
-    size_t total = 0;
-    for (auto sz : sizes)
-    {
-        total += (sz + ALIGN_BYTES - 1) & ALIGN_MASK;
-    }
-
-    // We add extra "ALIGN_BYTES - 1" bytes in case the start address passed to the function calcAlignedPointers() is
-    // not aligned.
-    return total + ALIGN_BYTES - 1;
-}
-
-/*
- * Given the address of the workspace and the vector containing multiple variable sizes, calculate the start addresses
- * of each variable.
- */
-void calcAlignedPointers(
-    std::vector<void*>& outPtrs, void const* p, std::vector<size_t> const& sizes, size_t ALIGN_BYTES)
-{
-    const size_t ALIGN_MASK = ~(ALIGN_BYTES - 1);
-    // Check ALIGN_BYTES is a power of 2
-    assert((ALIGN_BYTES & (ALIGN_BYTES - 1)) == 0);
-
-    // In case the start address is not aligned
-    char* ptr = reinterpret_cast<char*>((reinterpret_cast<size_t>(p) + ALIGN_BYTES - 1) & ALIGN_MASK);
-
-    outPtrs.reserve(sizes.size());
-    for (auto sz : sizes)
-    {
-        outPtrs.push_back(ptr);
-        ptr += (sz + ALIGN_BYTES - 1) & ALIGN_MASK;
-    }
-}
-
-} // namespace common
-} // namespace tensorrt_llm
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/memoryUtils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/memoryUtils.h
deleted file mode 100644
index 9e413a1be..000000000
--- a/sgl-kernel/3rdparty/tensorrt_llm/common/memoryUtils.h
+++ /dev/null
@@ -1,292 +0,0 @@
-/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "tensorrt_llm/common/cudaFp8Utils.h"
-#include "tensorrt_llm/common/cudaUtils.h"
-
-#include <cassert>
-
-namespace tensorrt_llm
-{
-namespace common
-{
-
-template <typename T>
-void deviceMalloc(T** ptr, size_t size, bool is_random_initialize = true);
-
-template <typename T>
-void deviceMemSetZero(T* ptr, size_t size);
-
-template <typename T>
-
-void deviceFree(T*& ptr);
-
-template <typename T>
-void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream = 0);
-
-template <typename T>
-void cudaD2Hcpy(T* tgt, T const* src, size_t const size);
-
-template <typename T>
-void cudaH2Dcpy(T* tgt, T const* src, size_t const size);
-
-template <typename T>
-void cudaD2Dcpy(T* tgt, T const* src, size_t const size, cudaStream_t stream = NULL);
-
-template <typename T>
-void cudaAutoCpy(T* tgt, T const* src, size_t const size, cudaStream_t stream = NULL);
-
-template <typename T>
-void cudaRandomUniform(T* buffer, size_t const size);
-
-template <typename T>
-int loadWeightFromBin(T* ptr, std::vector<size_t> shape, std::string filename,
-    TRTLLMCudaDataType model_file_type = TRTLLMCudaDataType::FP32);
-
-// template<typename T>
-// int loadWeightFromBinAndQuantizeForWeightOnly(int8_t*             quantized_weight_ptr,
-//                                               T*                  scale_ptr,
-//                                               std::vector<size_t> shape,
-//                                               std::string         filename,
-//                                               TRTLLMCudaDataType  model_file_type = TRTLLMCudaDataType::FP32);
-
-void invokeCudaD2DcpyHalf2Float(float* dst, half* src, size_t const size, cudaStream_t stream);
-void invokeCudaD2DcpyFloat2Half(half* dst, float* src, size_t const size, cudaStream_t stream);
-#ifdef ENABLE_FP8
-void invokeCudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, size_t const size, cudaStream_t stream);
-void invokeCudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, size_t const size, cudaStream_t stream);
-void invokeCudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, size_t const size, cudaStream_t stream);
-void invokeCudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, size_t const size, cudaStream_t stream);
-void invokeCudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, size_t const size, cudaStream_t stream);
-#endif // ENABLE_FP8
-#ifdef ENABLE_BF16
-void invokeCudaD2DcpyBfloat2Float(float* dst, __nv_bfloat16* src, size_t const size, cudaStream_t stream);
-#endif // ENABLE_BF16
-
-template <typename T_OUT, typename T_IN>
-void invokeCudaCast(T_OUT* dst, T_IN const* const src, size_t const size, cudaStream_t stream);
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// The following functions implement conversion of multi-dimensional indices to an index in a flat array.
-// The shape of the Tensor dimensions is passed as one array (`dims`), the indices are given as individual arguments.
-// For examples on how to use these functions, see their tests `test_memory_utils.cu`.
-// All of these functions can be evaluated at compile time by recursive template expansion.
-
-template <typename TDim, typename T, typename TIndex>
-__inline__ __host__ __device__ std::enable_if_t<std::is_pointer<TDim>::value, T> constexpr flat_index(
-    T const& acc, TDim dims, TIndex const& index)
-{
-    assert(index < dims[0]);
-    return acc * dims[0] + index;
-}
-
-template <typename TDim, typename T, typename TIndex, typename... TIndices>
-__inline__ __host__ __device__ std::enable_if_t<std::is_pointer<TDim>::value, T> constexpr flat_index(
-    T const& acc, TDim dims, TIndex const& index, TIndices... indices)
-{
-    assert(index < dims[0]);
-    return flat_index(acc * dims[0] + index, dims + 1, indices...);
-}
-
-template <typename TDim, typename T>
-__inline__ __host__ __device__ std::enable_if_t<std::is_pointer<TDim>::value, T> constexpr flat_index(
-    [[maybe_unused]] TDim dims, T const& index)
-{
-    assert(index < dims[0]);
-    return index;
-}
-
-template <typename TDim, typename TIndex, typename... TIndices>
-__inline__ __host__ __device__
-    std::enable_if_t<std::is_pointer<TDim>::value, typename std::remove_pointer<TDim>::type> constexpr flat_index(
-        TDim dims, TIndex const& index, TIndices... indices)
-{
-    assert(index < dims[0]);
-    return flat_index(static_cast<typename std::remove_pointer<TDim>::type>(index), dims + 1, indices...);
-}
-
-template <unsigned skip = 0, typename T, std::size_t N, typename TIndex, typename... TIndices>
-__inline__ __host__ __device__ T constexpr flat_index(
-    std::array<T, N> const& dims, TIndex const& index, TIndices... indices)
-{
-    static_assert(skip < N);
-    static_assert(sizeof...(TIndices) < N - skip, "Number of indices exceeds number of dimensions");
-    return flat_index(&dims[skip], index, indices...);
-}
-
-template <unsigned skip = 0, typename T, typename TIndex, std::size_t N, typename... TIndices>
-__inline__ __host__ __device__ T constexpr flat_index(
-    T const& acc, std::array<T, N> const& dims, TIndex const& index, TIndices... indices)
-{
-    static_assert(skip < N);
-    static_assert(sizeof...(TIndices) < N - skip, "Number of indices exceeds number of dimensions");
-    return flat_index(acc, &dims[skip], index, indices...);
-}
-
-template <unsigned skip = 0, typename T, typename TIndex, std::size_t N, typename... TIndices>
-__inline__ __host__ __device__ T constexpr flat_index(T const (&dims)[N], TIndex const& index, TIndices... indices)
-{
-    static_assert(skip < N);
-    static_assert(sizeof...(TIndices) < N - skip, "Number of indices exceeds number of dimensions");
-    return flat_index(static_cast<T const*>(dims) + skip, index, indices...);
-}
-
-template <unsigned skip = 0, typename T, typename TIndex, std::size_t N, typename... TIndices>
-__inline__ __host__ __device__ T constexpr flat_index(
-    T const& acc, T const (&dims)[N], TIndex const& index, TIndices... indices)
-{
-    static_assert(skip < N);
-    static_assert(sizeof...(TIndices) < N - skip, "Number of indices exceeds number of dimensions");
-    return flat_index(acc, static_cast<T const*>(dims) + skip, index, indices...);
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// These are simpler functions for multi-dimensional index conversion. Indices and dimensions are passed as individual
-// arguments. These functions are more suitable for usage inside kernels than the corresponding flat_index functions
-// which require arrays as arguments. Usage examples can be found in `test_memory_utils.cu`. The functions can be
-// evaluated at compile time.
-
-template <typename T, typename TIndex>
-__inline__ __host__ __device__ T constexpr flat_index2(TIndex const& index_0, TIndex const& index_1, T const& dim_1)
-{
-    assert(index_1 < dim_1);
-    return index_0 * dim_1 + index_1;
-}
-
-template <typename T, typename TIndex>
-__inline__ __host__ __device__ T constexpr flat_index3(
-    TIndex const& index_0, TIndex const& index_1, TIndex const& index_2, T const& dim_1, T const& dim_2)
-{
-    assert(index_2 < dim_2);
-    return flat_index2(index_0, index_1, dim_1) * dim_2 + index_2;
-}
-
-template <typename T, typename TIndex>
-__inline__ __host__ __device__ T constexpr flat_index4(TIndex const& index_0, TIndex const& index_1,
-    TIndex const& index_2, TIndex const& index_3, T const& dim_1, T const& dim_2, T const& dim_3)
-{
-    assert(index_3 < dim_3);
-    return flat_index3(index_0, index_1, index_2, dim_1, dim_2) * dim_3 + index_3;
-}
-
-template <typename T, typename TIndex>
-__inline__ __host__ __device__ T constexpr flat_index5(TIndex const& index_0, TIndex const& index_1,
-    TIndex const& index_2, TIndex const& index_3, TIndex const& index_4, T const& dim_1, T const& dim_2, T const& dim_3,
-    T const& dim_4)
-{
-    assert(index_4 < dim_4);
-    return flat_index4(index_0, index_1, index_2, index_3, dim_1, dim_2, dim_3) * dim_4 + index_4;
-}
-
-template <typename T, typename TIndex>
-__inline__ __host__ __device__ T constexpr flat_index_strided3(
-    TIndex const& index_0, TIndex const& index_1, TIndex const& index_2, T const& stride_1, T const& stride_2)
-{
-    assert(index_1 < stride_1 / stride_2);
-    assert(index_2 < stride_2);
-    return index_0 * stride_1 + index_1 * stride_2 + index_2;
-}
-
-template <typename T, typename TIndex>
-__inline__ __host__ __device__ T constexpr flat_index_strided4(TIndex const& index_0, TIndex const& index_1,
-    TIndex const& index_2, TIndex const& index_3, T const& stride_1, T const& stride_2, T const& stride_3)
-{
-    assert(index_1 < stride_1 / stride_2);
-    assert(index_2 < stride_2 / stride_3);
-    assert(index_3 < stride_3);
-    return index_0 * stride_1 + index_1 * stride_2 + index_2 * stride_3 + index_3;
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename T>
-void invokeInPlaceTranspose(T* data, T* workspace, size_t const dim0, size_t const dim1);
-
-template <typename T>
-void invokeInPlaceTranspose0213(
-    T* data, T* workspace, size_t const dim0, size_t const dim1, size_t const dim2, size_t const dim3);
-
-template <typename T>
-void invokeInPlaceTranspose102(T* data, T* workspace, size_t const dim0, size_t const dim1, size_t const dim2);
-
-template <typename T>
-void invokeMultiplyScale(T* tensor, float scale, size_t const size, cudaStream_t stream);
-
-template <typename T>
-void invokeDivideScale(T* tensor, float scale, size_t const size, cudaStream_t stream);
-
-template <typename T_IN, typename T_OUT>
-void invokeCudaD2DcpyConvert(T_OUT* tgt, const T_IN* src, size_t const size, cudaStream_t stream = 0);
-
-template <typename T_IN, typename T_OUT>
-void invokeCudaD2DScaleCpyConvert(
-    T_OUT* tgt, const T_IN* src, float const* scale, bool invert_scale, size_t const size, cudaStream_t stream = 0);
-
-inline bool checkIfFileExist(std::string const& file_path)
-{
-    std::ifstream in(file_path, std::ios::in | std::ios::binary);
-    if (in.is_open())
-    {
-        in.close();
-        return true;
-    }
-    return false;
-}
-
-template <typename T>
-void saveToBinary(T const* ptr, size_t const size, std::string filename);
-
-template <typename T_IN, typename T_fake_type>
-void invokeFakeCast(T_IN* input_ptr, size_t const size, cudaStream_t stream);
-
-size_t cuda_datatype_size(TRTLLMCudaDataType dt);
-
-template <typename T>
-bool invokeCheckRange(T const* buffer, size_t const size, T min, T max, bool* d_within_range, cudaStream_t stream);
-
-constexpr size_t DEFAULT_ALIGN_BYTES = 256;
-
-size_t calcAlignedSize(std::vector<size_t> const& sizes, size_t ALIGN_BYTES = DEFAULT_ALIGN_BYTES);
-void calcAlignedPointers(std::vector<void*>& outPtrs, void const* p, std::vector<size_t> const& sizes,
-    size_t ALIGN_BYTES = DEFAULT_ALIGN_BYTES);
-
-struct AlignedPointersUnpacker
-{
-    template <typename... T>
-    void operator()(T*&... outPtrs)
-    {
-        assert(sizeof...(T) == alignedPointers.size());
-        auto it = alignedPointers.begin();
-        ((outPtrs = static_cast<T*>(*it++)), ...);
-    }
-
-    std::vector<void*> alignedPointers;
-};
-
-AlignedPointersUnpacker inline calcAlignedPointers(
-    void const* p, std::vector<size_t> const& sizes, size_t ALIGN_BYTES = DEFAULT_ALIGN_BYTES)
-{
-    AlignedPointersUnpacker unpacker{};
-    calcAlignedPointers(unpacker.alignedPointers, p, sizes, ALIGN_BYTES);
-    return unpacker;
-}
-
-} // namespace common
-} // namespace tensorrt_llm
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/mpiUtils.cpp b/sgl-kernel/3rdparty/tensorrt_llm/common/mpiUtils.cpp
deleted file mode 100644
index dbdaca4ee..000000000
--- a/sgl-kernel/3rdparty/tensorrt_llm/common/mpiUtils.cpp
+++ /dev/null
@@ -1,588 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <numeric>
-#include <unordered_set>
-
-#include "tensorrt_llm/common/mpiUtils.h"
-
-#include "tensorrt_llm/common/assert.h"
-#include "tensorrt_llm/common/logger.h"
-#include "tensorrt_llm/runtime/common.h"
-#include "tensorrt_llm/runtime/iBuffer.h"
-
-#include <csignal>
-#include <cstdlib>
-#include <mutex>
-#include <thread>
-#include <type_traits>
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
-// We rely on SizeType32 being int32_t in some places with weak type checking,
-// i.e. we're passing void ptr to some function. To prevent mysterious errors
-// in the future, we trigger a compilation error here if SizeType32 isn't int32_t.
-static_assert(std::is_same<tensorrt_llm::runtime::SizeType32, std::int32_t>::value);
-
-namespace tensorrt_llm::mpi
-{
-
-MPI_Datatype getMpiDtype(MpiType dtype)
-{
-#if ENABLE_MULTI_DEVICE
-    static std::unordered_map<MpiType, MPI_Datatype> const dtype_map{
-        {MpiType::kBYTE, MPI_BYTE},
-        {MpiType::kHALF, MPI_UINT16_T},
-        {MpiType::kFLOAT, MPI_FLOAT},
-        {MpiType::kDOUBLE, MPI_DOUBLE},
-        {MpiType::kBOOL, MPI_C_BOOL},
-        {MpiType::kINT8, MPI_INT8_T},
-        {MpiType::kUINT8, MPI_UINT8_T},
-        {MpiType::kINT32, MPI_INT32_T},
-        {MpiType::kUINT32, MPI_UINT32_T},
-        {MpiType::kINT64, MPI_INT64_T},
-        {MpiType::kUINT64, MPI_UINT64_T},
-        {MpiType::kFP8, MPI_UINT8_T},
-        {MpiType::kBF16, MPI_UINT16_T},
-        {MpiType::kCHAR, MPI_CHAR},
-    };
-    return dtype_map.at(dtype);
-#else
-    TLLM_THROW("Multi device support is disabled.");
-#endif
-}
-
-MPI_Op getMpiOp(MpiOp op)
-{
-#if ENABLE_MULTI_DEVICE
-    static std::unordered_map<MpiOp, MPI_Op> const op_map{
-        {MpiOp::NULLOP, MPI_OP_NULL},
-        {MpiOp::MAX, MPI_MAX},
-        {MpiOp::MIN, MPI_MIN},
-        {MpiOp::SUM, MPI_SUM},
-        {MpiOp::PROD, MPI_PROD},
-        {MpiOp::LAND, MPI_LAND},
-        {MpiOp::BAND, MPI_BAND},
-        {MpiOp::LOR, MPI_LOR},
-        {MpiOp::BOR, MPI_BOR},
-        {MpiOp::LXOR, MPI_LXOR},
-        {MpiOp::BXOR, MPI_BXOR},
-        {MpiOp::MINLOC, MPI_MINLOC},
-        {MpiOp::MAXLOC, MPI_MAXLOC},
-        {MpiOp::REPLACE, MPI_REPLACE},
-    };
-    return op_map.at(op);
-#else
-    TLLM_THROW("Multi device support is disabled.");
-#endif // ENABLE_MULTI_DEVICE
-}
-
-namespace
-{
-
-bool mpiInitialized = false;
-std::recursive_mutex mpiMutex;
-
-MpiComm initLocalSession()
-{
-#if ENABLE_MULTI_DEVICE
-    MPI_Comm localComm = nullptr;
-    MPI_Comm_split_type(COMM_SESSION, OMPI_COMM_TYPE_HOST, COMM_SESSION.getRank(), MPI_INFO_NULL, &localComm);
-    MpiComm localSession{localComm, false};
-#else
-    MpiComm localSession{COMM_SESSION, false};
-#endif // ENABLE_MULTI_DEVICE
-    return localSession;
-}
-
-} // namespace
-
-std::vector<int> getWorldRanks(MpiComm const& comm)
-{
-#if ENABLE_MULTI_DEVICE
-    MPI_Group group = nullptr;
-    MPI_Group worldGroup = nullptr;
-
-    MPICHECK(MPI_Comm_group(MPI_COMM_WORLD, &worldGroup));
-    MPICHECK(MPI_Comm_group(comm, &group));
-
-    int groupSize = 0;
-    MPICHECK(MPI_Group_size(group, &groupSize));
-    std::vector<int> ranks(groupSize);
-    std::vector<int> worldRanks(groupSize);
-    std::iota(ranks.begin(), ranks.end(), 0);
-
-    MPICHECK(MPI_Group_translate_ranks(group, groupSize, ranks.data(), worldGroup, worldRanks.data()));
-    MPICHECK(MPI_Group_free(&group));
-    MPICHECK(MPI_Group_free(&worldGroup));
-#else
-    std::vector<int> worldRanks{0};
-#endif
-    return worldRanks;
-}
-
-void initialize(MpiThreadSupport threadMode, bool forwardAbortToParent)
-{
-    // double-checked locking
-    if (mpiInitialized)
-    {
-        return;
-    }
-    std::lock_guard<std::recursive_mutex> lk(mpiMutex);
-    if (mpiInitialized)
-    {
-        return;
-    }
-#if ENABLE_MULTI_DEVICE
-    int initialized = 0;
-    TLLM_MPI_CHECK(MPI_Initialized(&initialized));
-    if (!initialized)
-    {
-        TLLM_LOG_INFO("Initializing MPI with thread mode %d", threadMode);
-        int providedMode = 0;
-        auto requiredMode = static_cast<int>(threadMode);
-        MPICHECK(MPI_Init_thread(nullptr, nullptr, requiredMode, &providedMode));
-        TLLM_CHECK_WITH_INFO(providedMode >= requiredMode, "MPI_Init_thread failed");
-        std::atexit([]() { MPI_Finalize(); });
-
-        /*
-         * We only catch SIGABRT and SIGSEGV because most, of not all errors in the worker will cause one of these 2
-         * signals. Signals like SIGINT and SIGTERM should be issued to the parent and should terminate MPI workers
-         * correctly.
-         */
-        for (int sig : {SIGABRT, SIGSEGV})
-        {
-            __sighandler_t previousHandler = nullptr;
-            if (forwardAbortToParent)
-            {
-                previousHandler = std::signal(sig,
-                    [](int signal)
-                    {
-#ifndef _WIN32
-                        pid_t parentProcessId = getppid();
-                        kill(parentProcessId, SIGKILL);
-#endif
-                        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
-                    });
-            }
-            else
-            {
-                previousHandler = std::signal(sig, [](int signal) { MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); });
-            }
-            TLLM_CHECK_WITH_INFO(previousHandler != SIG_ERR, "Signal handler setup failed");
-        }
-
-        // ensure local MPI communicator is initialized
-        MpiComm::localSession();
-        TLLM_LOG_INFO("Initialized MPI");
-    }
-#endif // ENABLE_MULTI_DEVICE
-    mpiInitialized = true;
-}
-
-void MpiComm::barrier() const
-{
-#if ENABLE_MULTI_DEVICE
-    MPICHECK(MPI_Barrier(mComm));
-#else
-    TLLM_THROW("Multi device support is disabled.");
-#endif // ENABLE_MULTI_DEVICE
-}
-
-#if ENABLE_MULTI_DEVICE
-template <typename TMpiFunc, typename TBase, typename... TArgs,
-    typename = std::enable_if_t<std::is_same_v<void, std::remove_const_t<TBase>>>>
-size_t invokeChunked(TMpiFunc func, TBase* buffer, size_t size, MPI_Datatype dtype, TArgs... args)
-{
-    constexpr auto maxP1 = static_cast<size_t>(std::numeric_limits<int>::max()) + 1;
-    if (TLLM_LIKELY(size < maxP1))
-    {
-        MPICHECK(func(buffer, size, dtype, args...));
-        return 1;
-    }
-
-    constexpr size_t alignment = 256;
-    int elementSize = 1;
-    MPICHECK(MPI_Type_size(dtype, &elementSize));
-    elementSize = std::min<int>(elementSize, alignment);
-
-    // We cap at max alignment-bytes chunks that can be sent at once.
-    auto const step = maxP1 - (alignment / elementSize);
-
-    using TCast = std::conditional_t<std::is_const_v<TBase>, uint8_t const, uint8_t>;
-    size_t count = 0;
-    while (size != 0)
-    {
-        auto currentStep = static_cast<int>(std::min(size, step));
-        MPICHECK(func(buffer, currentStep, dtype, args...));
-        size -= currentStep;
-        size_t diff = static_cast<size_t>(currentStep) * elementSize;
-        buffer = static_cast<TCast*>(buffer) + diff;
-        ++count;
-    }
-
-    return count;
-}
-#endif // ENABLE_MULTI_DEVICE
-
-std::shared_ptr<MpiRequest> MpiComm::bcastAsync(void* buffer, size_t size, MpiType dtype, int root) const
-{
-    std::shared_ptr<MpiRequest> r = std::make_shared<MpiRequest>();
-#if ENABLE_MULTI_DEVICE
-    invokeChunked(MPI_Ibcast, buffer, size, getMpiDtype(dtype), root, mComm, &r->mRequest);
-#else
-    TLLM_THROW("Multi device support is disabled.");
-#endif // ENABLE_MULTI_DEVICE
-    return r;
-}
-
-std::shared_ptr<MpiRequest> MpiComm::bcastAsync(runtime::IBuffer& buf, int root) const
-{
-    TLLM_CHECK(buf.getMemoryType() != runtime::MemoryType::kGPU);
-    return bcastAsync(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, root);
-}
-
-void MpiComm::bcast(void* buffer, size_t size, MpiType dtype, int root) const
-{
-#if ENABLE_MULTI_DEVICE
-    invokeChunked(MPI_Bcast, buffer, size, getMpiDtype(dtype), root, mComm);
-#else
-    TLLM_THROW("Multi device support is disabled.");
-#endif // ENABLE_MULTI_DEVICE
-}
-
-void MpiComm::bcast(runtime::IBuffer& buf, int root) const
-{
-    bcast(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, root);
-}
-
-std::shared_ptr<MpiRequest> MpiComm::sendAsync(void const* buffer, size_t size, MpiType dtype, int dest, int tag) const
-{
-    TLLM_LOG_DEBUG("start MPI_Isend with size %d", size);
-    std::shared_ptr<MpiRequest> r = std::make_shared<MpiRequest>();
-#if ENABLE_MULTI_DEVICE
-    invokeChunked(MPI_Isend, buffer, size, getMpiDtype(dtype), dest, tag, mComm, &r->mRequest);
-#else
-    TLLM_THROW("Multi device support is disabled.");
-#endif
-    TLLM_LOG_DEBUG("end MPI_Isend with size %d", size);
-    return r;
-}
-
-std::shared_ptr<MpiRequest> MpiComm::sendAsync(runtime::IBuffer const& buf, int dest, int tag) const
-{
-    return sendAsync(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, dest, tag);
-}
-
-void MpiComm::send(void const* buffer, size_t size, MpiType dtype, int dest, int tag) const
-{
-    TLLM_LOG_DEBUG("start MPI_Send with size %d", size);
-#if ENABLE_MULTI_DEVICE
-    invokeChunked(MPI_Send, buffer, size, getMpiDtype(dtype), dest, tag, mComm);
-#else
-    TLLM_THROW("Multi device support is disabled.");
-#endif // ENABLE_MULTI_DEVICE
-    TLLM_LOG_DEBUG("end MPI_Send with size %d", size);
-}
-
-void MpiComm::send(runtime::IBuffer const& buf, int dest, int tag) const
-{
-    send(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, dest, tag);
-}
-
-MPI_Status MpiComm::recv(void* buffer, size_t size, MpiType dtype, int source, int tag) const
-{
-    TLLM_LOG_DEBUG("start MPI_Recv with size %d", size);
-    MPI_Status status{};
-#if ENABLE_MULTI_DEVICE
-    invokeChunked(MPI_Recv, buffer, size, getMpiDtype(dtype), source, tag, mComm, &status);
-#else
-    TLLM_THROW("Multi device support is disabled.");
-#endif // ENABLE_MULTI_DEVICE
-    TLLM_LOG_DEBUG("end MPI_Recv with size %d", size);
-    return status;
-}
-
-MPI_Status MpiComm::recv(runtime::IBuffer& buf, int source, int tag) const
-{
-    return recv(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, source, tag);
-}
-
-MpiComm MpiComm::split(int color, int key) const
-{
-    MPI_Comm splitComm = nullptr;
-#if ENABLE_MULTI_DEVICE
-    MPICHECK(MPI_Comm_split(mComm, color, key, &splitComm));
-#else
-    TLLM_THROW("Multi device support is disabled.");
-#endif // ENABLE_MULTI_DEVICE
-    return MpiComm{splitComm, true};
-}
-
-void MpiComm::allreduce(void const* sendbuf, void* recvbuf, int count, MpiType dtype, MpiOp op) const
-{
-#if ENABLE_MULTI_DEVICE
-    MPICHECK(MPI_Allreduce(sendbuf, recvbuf, count, getMpiDtype(dtype), getMpiOp(op), mComm));
-#else
-    TLLM_THROW("Multi device support is disabled.");
-#endif // ENABLE_MULTI_DEVICE
-}
-
-void MpiComm::allgather(void const* sendbuf, void* recvbuf, int count, MpiType dtype) const
-{
-#if ENABLE_MULTI_DEVICE
-    MPICHECK(MPI_Allgather(sendbuf, count, getMpiDtype(dtype), recvbuf, count, getMpiDtype(dtype), mComm));
-#else
-    TLLM_THROW("Multi device support is disabled.");
-#endif // ENABLE_MULTI_DEVICE
-}
-
-void MpiComm::allgatherv(void const* sendbuf, int sendcount, MpiType sendtype, void* recvbuf,
-    std::vector<int> const& recvcounts, std::vector<int> const& displs, MpiType recvtype) const
-{
-#if ENABLE_MULTI_DEVICE
-    MPICHECK(MPI_Allgatherv(sendbuf, sendcount, getMpiDtype(sendtype), recvbuf, recvcounts.data(), displs.data(),
-        getMpiDtype(recvtype), mComm));
-
-#else
-    TLLM_THROW("Multi device support is disabled.");
-#endif // ENABLE_MULTI_DEVICE
-}
-
-void MpiComm::mprobe(int source, int tag, MPI_Message* msg, MPI_Status* status) const
-{
-#if ENABLE_MULTI_DEVICE
-    MPICHECK(MPI_Mprobe(source, tag, mComm, msg, status));
-#else
-    TLLM_THROW("Multi device support is disabled.");
-#endif // ENABLE_MULTI_DEVICE
-}
-
-bool MpiComm::improbe(int source, int tag, MPI_Message* msg, MPI_Status* status) const
-{
-#if ENABLE_MULTI_DEVICE
-    int flag{0};
-    MPICHECK(MPI_Improbe(source, tag, mComm, &flag, msg, status));
-    return flag != 0;
-#else
-    TLLM_THROW("Multi device support is disabled.");
-    return false;
-#endif
-}
-
-bool MpiComm::iprobe(int source, int tag, MPI_Status* status) const
-{
-#if ENABLE_MULTI_DEVICE
-    int flag{0};
-    MPICHECK(MPI_Iprobe(source, tag, mComm, &flag, status));
-    return flag != 0;
-#else
-    TLLM_THROW("Multi device support is disabled.");
-    return false;
-#endif
-}
-
-void MpiComm::recvPoll(int source, int tag, int periodMs) const
-{
-    MPI_Status status;
-    while (!iprobe(source, tag, &status))
-    {
-        std::this_thread::sleep_for(std::chrono::milliseconds(periodMs));
-    }
-}
-
-int MpiComm::getRank() const
-{
-    int rank = 0;
-#if ENABLE_MULTI_DEVICE
-    MPICHECK(MPI_Comm_rank(mComm, &rank));
-#endif
-    return rank;
-}
-
-int MpiComm::getSize() const
-{
-    int world_size = 1;
-#if ENABLE_MULTI_DEVICE
-    MPICHECK(MPI_Comm_size(mComm, &world_size));
-#endif
-    return world_size;
-}
-
-MpiComm const& MpiComm::world()
-{
-    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
-    static MpiComm commWorld{MPI_COMM_WORLD, false};
-    initialize();
-    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
-    return commWorld;
-}
-
-MpiComm& MpiComm::mutableSession()
-{
-    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
-    static MpiComm commSession{MPI_COMM_WORLD, false};
-    initialize();
-    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
-    return commSession;
-}
-
-MpiComm& MpiComm::mutableLocalSession()
-{
-    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
-    static MpiComm localSession = initLocalSession();
-    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
-    return localSession;
-}
-
-void MpiComm::refreshLocalSession()
-{
-#if ENABLE_MULTI_DEVICE
-    static std::mutex mutex;
-    std::unique_lock lock(mutex);
-    auto initSessionRanks = getWorldRanks(MpiComm::session());
-    auto localSessionRanks = getWorldRanks(MpiComm::localSession());
-
-    // Add to intersectionRanks in order of initSessionRanks
-    std::vector<int> intersectionRanks;
-    std::unordered_set<int> localSessionRanksSet(localSessionRanks.begin(), localSessionRanks.end());
-    for (auto rank : initSessionRanks)
-    {
-        if (localSessionRanksSet.find(rank) != localSessionRanksSet.end())
-        {
-            intersectionRanks.push_back(rank);
-        }
-    }
-
-    MPI_Group worldGroup = nullptr;
-    MPICHECK(MPI_Comm_group(MPI_COMM_WORLD, &worldGroup));
-    MPI_Group localGroup = nullptr;
-    MPICHECK(MPI_Group_incl(worldGroup, intersectionRanks.size(), intersectionRanks.data(), &localGroup));
-    MPI_Comm localComm = nullptr;
-    MPICHECK(MPI_Comm_create_group(MPI_COMM_WORLD, localGroup, intersectionRanks.front(), &localComm));
-    MpiComm::mutableLocalSession().mFreeComm = true;
-    MpiComm::mutableLocalSession() = MpiComm{localComm, false};
-    TLLM_LOG_INFO("Refreshed the MPI local session");
-#endif // ENABLE_MULTI_DEVICE
-}
-
-MpiComm::MpiComm(MPI_Comm g, bool freeComm)
-    : mComm{g}
-    , mFreeComm{freeComm}
-{
-    TLLM_CHECK(mComm != MPI_COMM_NULL);
-}
-
-MpiComm::~MpiComm() noexcept
-{
-#if ENABLE_MULTI_DEVICE
-    if (mFreeComm && mComm)
-    {
-        if (MPI_Comm_free(&mComm) != MPI_SUCCESS)
-        {
-            TLLM_LOG_ERROR("MPI_Comm_free failed");
-        }
-    }
-#endif // ENABLE_MULTI_DEVICE
-}
-
-MpiComm::MpiComm(MpiComm&& comm) noexcept
-    : mComm{comm.mComm}
-    , mFreeComm{comm.mFreeComm}
-{
-    comm.mFreeComm = false;
-}
-
-MpiComm& MpiComm::operator=(MpiComm&& comm) noexcept
-{
-    this->~MpiComm();
-    mComm = comm.mComm;
-    mFreeComm = comm.mFreeComm;
-    comm.mFreeComm = false;
-    return *this;
-}
-
-MpiWaitThread::MpiWaitThread(std::string name, std::function<void()> funcWait, std::function<void()> funcSetup)
-    : mName{name.c_str()}
-    , mFuncWait{funcWait}
-    , mFuncSetup{funcSetup}
-{
-    TLLM_LOG_TRACE("%s: %s start", mName.c_str(), __PRETTY_FUNCTION__);
-    mThread = std::make_unique<std::thread>(&MpiWaitThread::sideThread, this);
-    TLLM_LOG_TRACE("%s: %s stop", mName.c_str(), __PRETTY_FUNCTION__);
-}
-
-MpiWaitThread::~MpiWaitThread()
-{
-    TLLM_LOG_TRACE("%s: %s start", mName.c_str(), __PRETTY_FUNCTION__);
-    waitStop();
-    mShouldExit.store(true);
-    notifyStart();
-    mThread->join();
-    mThread.reset(nullptr);
-    TLLM_LOG_TRACE("%s: %s stop", mName.c_str(), __PRETTY_FUNCTION__);
-}
-
-void MpiWaitThread::sideThread()
-{
-    if (mFuncSetup)
-    {
-        mFuncSetup();
-    }
-    while (!mShouldExit.load())
-    {
-        notifyStop();
-        waitStart();
-        mFuncWait();
-    }
-}
-
-void MpiWaitThread::waitStart()
-{
-    TLLM_LOG_TRACE("%s: %s start", mName.c_str(), __PRETTY_FUNCTION__);
-    std::unique_lock<std::mutex> lock(mMutex);
-    mCondVar.wait(lock, [this] { return mRunning; });
-    TLLM_LOG_TRACE("%s: %s stop", mName.c_str(), __PRETTY_FUNCTION__);
-}
-
-void MpiWaitThread::waitStop()
-{
-    TLLM_LOG_TRACE("%s: %s start", mName.c_str(), __PRETTY_FUNCTION__);
-    std::unique_lock<std::mutex> lock(mMutex);
-    mCondVar.wait(lock, [this] { return !mRunning; });
-    TLLM_LOG_TRACE("%s: %s stop", mName.c_str(), __PRETTY_FUNCTION__);
-}
-
-void MpiWaitThread::notifyStart()
-{
-    TLLM_LOG_TRACE("%s: %s start", mName.c_str(), __PRETTY_FUNCTION__);
-    std::lock_guard<std::mutex> lock(mMutex);
-    mRunning = true;
-    mCondVar.notify_one();
-    TLLM_LOG_TRACE("%s: %s stop", mName.c_str(), __PRETTY_FUNCTION__);
-}
-
-void MpiWaitThread::notifyStop()
-{
-    TLLM_LOG_TRACE("%s: %s start", mName.c_str(), __PRETTY_FUNCTION__);
-    std::lock_guard<std::mutex> lock(mMutex);
-    mRunning = false;
-    mCondVar.notify_one();
-    TLLM_LOG_TRACE("%s: %s stop", mName.c_str(), __PRETTY_FUNCTION__);
-}
-
-} // namespace tensorrt_llm::mpi
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/nvtxUtils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/nvtxUtils.h
deleted file mode 100644
index 0a9d51975..000000000
--- a/sgl-kernel/3rdparty/tensorrt_llm/common/nvtxUtils.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <nvtx3/nvtx3.hpp>
-
-#include <array>
-
-namespace tensorrt_llm::common::nvtx
-{
-inline nvtx3::color nextColor()
-{
-#ifndef NVTX_DISABLE
-    constexpr std::array kColors{nvtx3::color{0xff00ff00}, nvtx3::color{0xff0000ff}, nvtx3::color{0xffffff00},
-        nvtx3::color{0xffff00ff}, nvtx3::color{0xff00ffff}, nvtx3::color{0xffff0000}, nvtx3::color{0xffffffff}};
-    constexpr auto numColors = kColors.size();
-
-    static thread_local std::size_t colorId = 0;
-    auto const color = kColors[colorId];
-    colorId = colorId + 1 >= numColors ? 0 : colorId + 1;
-    return color;
-#else
-    return nvtx3::color{0};
-#endif
-}
-
-} // namespace tensorrt_llm::common::nvtx
-
-#define NVTX3_SCOPED_RANGE_WITH_NAME(range, name)                                                                      \
-    ::nvtx3::scoped_range range(::tensorrt_llm::common::nvtx::nextColor(), name)
-#define NVTX3_SCOPED_RANGE(range) NVTX3_SCOPED_RANGE_WITH_NAME(range##_range, #range)
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/opUtils.cpp b/sgl-kernel/3rdparty/tensorrt_llm/common/opUtils.cpp
deleted file mode 100644
index 39aefda48..000000000
--- a/sgl-kernel/3rdparty/tensorrt_llm/common/opUtils.cpp
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "tensorrt_llm/common/opUtils.h"
-#include "tensorrt_llm/common/mpiUtils.h"
-
-#include "cuda.h"
-#include <cstdint>
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#include <cuda_fp8.h>
-#include <functional>
-#include <mutex>
-#include <thread>
-
-#ifdef _MSC_VER
-#define FN_NAME __FUNCTION__
-#else
-#define FN_NAME __func__
-#endif
-
-#if ENABLE_MULTI_DEVICE
-
-std::unordered_map<nvinfer1::DataType, ncclDataType_t>* getDtypeMap()
-{
-    static std::unordered_map<nvinfer1::DataType, ncclDataType_t> dtypeMap = {{nvinfer1::DataType::kFLOAT, ncclFloat32},
-        {nvinfer1::DataType::kHALF, ncclFloat16}, {nvinfer1::DataType::kBF16, ncclBfloat16}};
-    return &dtypeMap;
-}
-
-namespace
-{
-
-// Get NCCL unique ID for a group of ranks.
-ncclUniqueId getUniqueId(std::set<int> const& group) noexcept
-{
-    auto const rank = COMM_SESSION.getRank();
-    TLLM_LOG_TRACE("%s start for rank %d", __PRETTY_FUNCTION__, rank);
-    ncclUniqueId id;
-    if (rank == *group.begin())
-    {
-        NCCLCHECK(ncclGetUniqueId(&id));
-        for (auto it = std::next(std::begin(group), 1); it != group.end(); ++it)
-        {
-            COMM_SESSION.sendValue(id, *it, 0);
-        }
-    }
-    else
-    {
-        COMM_SESSION.recvValue(id, *group.begin(), 0);
-    }
-    TLLM_LOG_TRACE("%s stop for rank %d", __PRETTY_FUNCTION__, rank);
-    return id;
-}
-} // namespace
-
-std::shared_ptr<ncclComm_t> getComm(std::set<int> const& group)
-{
-    auto const rank = COMM_SESSION.getRank();
-    TLLM_LOG_TRACE("%s start for rank %d", __PRETTY_FUNCTION__, rank);
-    static std::map<std::set<int>, std::shared_ptr<ncclComm_t>> commMap;
-    static std::mutex mutex;
-    std::lock_guard<std::mutex> lock(mutex);
-    std::ostringstream oss;
-    int index = 0;
-    for (auto const& rank : group)
-    {
-        if (index != 0)
-        {
-            oss << ",";
-        }
-        oss << rank;
-        index++;
-    }
-    auto groupStr = oss.str();
-    auto it = commMap.find(group);
-    if (it != commMap.end())
-    {
-        auto ncclComm = it->second;
-        TLLM_LOG_TRACE("NCCL comm for group(%s) is cached for rank %d", groupStr.c_str(), rank);
-        return ncclComm;
-    }
-
-    TLLM_LOG_TRACE("Init NCCL comm for group(%s) for rank %d", groupStr.c_str(), rank);
-    ncclUniqueId id = getUniqueId(group);
-    int groupRank = 0;
-    for (auto const& currentRank : group)
-    {
-        if (rank == currentRank)
-            break;
-        ++groupRank;
-    }
-    TLLM_CHECK(groupRank < group.size());
-    std::shared_ptr<ncclComm_t> ncclComm(new ncclComm_t,
-        [](ncclComm_t* comm)
-        {
-            ncclCommDestroy(*comm);
-            delete comm;
-        });
-    NCCLCHECK(ncclCommInitRank(ncclComm.get(), group.size(), id, groupRank));
-    commMap[group] = ncclComm;
-    TLLM_LOG_TRACE("%s stop for rank %d", __PRETTY_FUNCTION__, rank);
-    return ncclComm;
-}
-#endif // ENABLE_MULTI_DEVICE
-
-void const* tensorrt_llm::common::getCommSessionHandle()
-{
-#if ENABLE_MULTI_DEVICE
-    return &COMM_SESSION;
-#else
-    return nullptr;
-#endif // ENABLE_MULTI_DEVICE
-}
-
-namespace
-{
-
-// Get current cuda context, a default context will be created if there is no context.
-inline CUcontext getCurrentCudaCtx()
-{
-    CUcontext ctx{};
-    CUresult err = cuCtxGetCurrent(&ctx);
-    if (err == CUDA_ERROR_NOT_INITIALIZED || ctx == nullptr)
-    {
-        TLLM_CUDA_CHECK(cudaFree(nullptr));
-        err = cuCtxGetCurrent(&ctx);
-    }
-    TLLM_CHECK(err == CUDA_SUCCESS);
-    return ctx;
-}
-
-// Helper to create per-cuda-context singleton managed by std::shared_ptr.
-// Unlike conventional singletons, singleton created with this will be released
-// when not needed, instead of on process exit.
-// Objects of this class shall always be declared static / global, and shall never own CUDA
-// resources.
-template <typename T>
-class PerCudaCtxSingletonCreator
-{
-public:
-    using CreatorFunc = std::function<std::unique_ptr<T>()>;
-    using DeleterFunc = std::function<void(T*)>;
-
-    // creator returning std::unique_ptr is by design.
-    // It forces separation of memory for T and memory for control blocks.
-    // So when T is released, but we still have observer weak_ptr in mObservers, the T mem block can be released.
-    // creator itself must not own CUDA resources. Only the object it creates can.
-    PerCudaCtxSingletonCreator(CreatorFunc creator, DeleterFunc deleter)
-        : mCreator{std::move(creator)}
-        , mDeleter{std::move(deleter)}
-    {
-    }
-
-    std::shared_ptr<T> operator()()
-    {
-        std::lock_guard<std::mutex> lk{mMutex};
-        CUcontext ctx{getCurrentCudaCtx()};
-        std::shared_ptr<T> result = mObservers[ctx].lock();
-        if (result == nullptr)
-        {
-            // Create the resource and register with an observer.
-            result = std::shared_ptr<T>{mCreator().release(),
-                [this, ctx](T* obj)
-                {
-                    if (obj == nullptr)
-                    {
-                        return;
-                    }
-                    mDeleter(obj);
-
-                    // Clears observer to avoid growth of mObservers, in case users creates/destroys cuda contexts
-                    // frequently.
-                    std::shared_ptr<T> observedObjHolder; // Delay destroy to avoid dead lock.
-                    std::lock_guard<std::mutex> lk{mMutex};
-                    // Must check observer again because another thread may created new instance for this ctx just
-                    // before we lock mMutex. We can't infer that the observer is stale from the fact that obj is
-                    // destroyed, because shared_ptr ref-count checking and observer removing are not in one atomic
-                    // operation, and the observer may be changed to observe another instance.
-                    observedObjHolder = mObservers.at(ctx).lock();
-                    if (observedObjHolder == nullptr)
-                    {
-                        mObservers.erase(ctx);
-                    }
-                }};
-            mObservers.at(ctx) = result;
-        }
-        return result;
-    }
-
-private:
-    CreatorFunc mCreator;
-    DeleterFunc mDeleter;
-    mutable std::mutex mMutex;
-    // CUDA resources are per-context.
-    std::unordered_map<CUcontext, std::weak_ptr<T>> mObservers;
-};
-
-template <typename T>
-class PerThreadSingletonCreator
-{
-public:
-    using CreatorFunc = std::function<std::unique_ptr<T>()>;
-    using DeleterFunc = std::function<void(T*)>;
-
-    // creator returning std::unique_ptr is by design.
-    // It forces separation of memory for T and memory for control blocks.
-    // So when T is released, but we still have observer weak_ptr in mObservers, the T mem block can be released.
-    // creator itself must not own CUDA resources. Only the object it creates can.
-    PerThreadSingletonCreator(CreatorFunc creator, DeleterFunc deleter)
-        : mCreator{std::move(creator)}
-        , mDeleter{std::move(deleter)}
-    {
-    }
-
-    std::shared_ptr<T> operator()()
-    {
-        std::lock_guard<std::mutex> lk{mMutex};
-
-        std::thread::id thread = std::this_thread::get_id();
-        std::shared_ptr<T> result = mObservers[thread].lock();
-
-        if (result == nullptr)
-        {
-            // Create the resource and register with an observer.
-            result = std::shared_ptr<T>{mCreator().release(),
-                [this, thread](T* obj)
-                {
-                    if (obj == nullptr)
-                    {
-                        return;
-                    }
-                    mDeleter(obj);
-
-                    // Clears observer to avoid growth of mObservers, in case users creates/destroys cuda contexts
-                    // frequently.
-                    std::shared_ptr<T> observedObjHolder; // Delay destroy to avoid dead lock.
-                    std::lock_guard<std::mutex> lk{mMutex};
-                    // Must check observer again because another thread may created new instance for this ctx just
-                    // before we lock mMutex. We can't infer that the observer is stale from the fact that obj is
-                    // destroyed, because shared_ptr ref-count checking and observer removing are not in one atomic
-                    // operation, and the observer may be changed to observe another instance.
-                    observedObjHolder = mObservers.at(thread).lock();
-                    if (observedObjHolder == nullptr)
-                    {
-                        mObservers.erase(thread);
-                    }
-                }};
-            mObservers.at(thread) = result;
-        }
-        return result;
-    }
-
-private:
-    CreatorFunc mCreator;
-    DeleterFunc mDeleter;
-    mutable std::mutex mMutex;
-    // CUDA resources are per-thread.
-    std::unordered_map<std::thread::id, std::weak_ptr<T>> mObservers;
-};
-
-} // namespace
-
-std::shared_ptr<cublasHandle_t> getCublasHandle()
-{
-    static PerThreadSingletonCreator<cublasHandle_t> creator(
-        []() -> auto
-        {
-            auto handle = std::unique_ptr<cublasHandle_t>(new cublasHandle_t);
-            TLLM_CUDA_CHECK(cublasCreate(handle.get()));
-            return handle;
-        },
-        [](cublasHandle_t* handle)
-        {
-            TLLM_CUDA_CHECK(cublasDestroy(*handle));
-            delete handle;
-        });
-    return creator();
-}
-
-std::shared_ptr<cublasLtHandle_t> getCublasLtHandle()
-{
-    static PerThreadSingletonCreator<cublasLtHandle_t> creator(
-        []() -> auto
-        {
-            auto handle = std::unique_ptr<cublasLtHandle_t>(new cublasLtHandle_t);
-            TLLM_CUDA_CHECK(cublasLtCreate(handle.get()));
-            return handle;
-        },
-        [](cublasLtHandle_t* handle)
-        {
-            TLLM_CUDA_CHECK(cublasLtDestroy(*handle));
-            delete handle;
-        });
-    return creator();
-}
-
-std::shared_ptr<tensorrt_llm::common::CublasMMWrapper> getCublasMMWrapper(std::shared_ptr<cublasHandle_t> cublasHandle,
-    std::shared_ptr<cublasLtHandle_t> cublasltHandle, cudaStream_t stream, void* workspace)
-{
-    static PerThreadSingletonCreator<tensorrt_llm::common::CublasMMWrapper> creator(
-        [cublasHandle, cublasltHandle, stream, workspace]() -> auto
-        {
-            auto wrapper = std::unique_ptr<tensorrt_llm::common::CublasMMWrapper>(
-                new tensorrt_llm::common::CublasMMWrapper(cublasHandle, cublasltHandle, stream, workspace));
-            return wrapper;
-        },
-        [](tensorrt_llm::common::CublasMMWrapper* wrapper) { delete wrapper; });
-    return creator();
-}
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/opUtils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/opUtils.h
deleted file mode 100644
index 4e278e5cf..000000000
--- a/sgl-kernel/3rdparty/tensorrt_llm/common/opUtils.h
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "tensorrt_llm/common/cublasMMWrapper.h"
-#include "tensorrt_llm/common/workspace.h"
-
-#include <NvInferRuntime.h>
-#include <cublasLt.h>
-#include <cublas_v2.h>
-#include <cuda_runtime.h>
-#if ENABLE_MULTI_DEVICE
-#include <nccl.h>
-#endif // ENABLE_MULTI_DEVICE
-
-#include <cstring>
-#include <map>
-#include <memory>
-#include <nvml.h>
-#include <optional>
-#include <set>
-#include <string>
-#include <unordered_map>
-
-namespace tensorrt_llm::common
-{
-
-// Write values into buffer
-template <typename T>
-void write(char*& buffer, T const& val)
-{
-    std::memcpy(buffer, &val, sizeof(T));
-    buffer += sizeof(T);
-}
-
-// Read values from buffer
-template <typename T>
-void read(char const*& buffer, T& val)
-{
-    std::memcpy(&val, buffer, sizeof(T));
-    buffer += sizeof(T);
-}
-
-// Like std::unique_ptr, but does not prevent generation of default copy constructor when used as class members.
-// The copy constructor produces nullptr. So the plugin default copy constructor will not really copy this, and
-// your clone() implementation is responsible for initializing such data members.
-// With this we can simplify clone() implementation when there are many data members including at least one unique_ptr.
-template <typename T, typename Del = std::default_delete<T>>
-class UniqPtrWNullCopy : public std::unique_ptr<T, Del>
-{
-public:
-    using std::unique_ptr<T, Del>::unique_ptr;
-
-    // for compatibility with std::make_unique
-    explicit UniqPtrWNullCopy(std::unique_ptr<T, Del>&& src)
-        : std::unique_ptr<T, Del>::unique_ptr{std::move(src)}
-    {
-    }
-
-    // copy constructor produces nullptr
-    UniqPtrWNullCopy(UniqPtrWNullCopy const&)
-        : std::unique_ptr<T, Del>::unique_ptr{}
-    {
-    }
-};
-
-// for testing only
-void const* getCommSessionHandle();
-} // namespace tensorrt_llm::common
-
-inline bool isBuilding()
-{
-    auto constexpr key = "IS_BUILDING";
-    auto const val = getenv(key);
-    return val != nullptr && std::string(val) == "1";
-}
-
-#if ENABLE_MULTI_DEVICE
-#define NCCLCHECK(cmd)                                                                                                 \
-    do                                                                                                                 \
-    {                                                                                                                  \
-        ncclResult_t r = cmd;                                                                                          \
-        if (r != ncclSuccess)                                                                                          \
-        {                                                                                                              \
-            printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, ncclGetErrorString(r));                      \
-            exit(EXIT_FAILURE);                                                                                        \
-        }                                                                                                              \
-    } while (0)
-
-std::unordered_map<nvinfer1::DataType, ncclDataType_t>* getDtypeMap();
-
-std::shared_ptr<ncclComm_t> getComm(std::set<int> const& group);
-
-#endif // ENABLE_MULTI_DEVICE
-
-//! To save GPU memory, all the plugins share the same cublas and cublasLt handle globally.
-//! Get cublas and cublasLt handle for current cuda context
-std::shared_ptr<cublasHandle_t> getCublasHandle();
-std::shared_ptr<cublasLtHandle_t> getCublasLtHandle();
-std::shared_ptr<tensorrt_llm::common::CublasMMWrapper> getCublasMMWrapper(std::shared_ptr<cublasHandle_t> cublasHandle,
-    std::shared_ptr<cublasLtHandle_t> cublasltHandle, cudaStream_t stream, void* workspace);
-
-#ifndef DEBUG
-
-#define PLUGIN_CHECK(status)                                                                                           \
-    do                                                                                                                 \
-    {                                                                                                                  \
-        if (status != 0)                                                                                               \
-            abort();                                                                                                   \
-    } while (0)
-
-#define ASSERT_PARAM(exp)                                                                                              \
-    do                                                                                                                 \
-    {                                                                                                                  \
-        if (!(exp))                                                                                                    \
-            return STATUS_BAD_PARAM;                                                                                   \
-    } while (0)
-
-#define ASSERT_FAILURE(exp)                                                                                            \
-    do                                                                                                                 \
-    {                                                                                                                  \
-        if (!(exp))                                                                                                    \
-            return STATUS_FAILURE;                                                                                     \
-    } while (0)
-
-#define CSC(call, err)                                                                                                 \
-    do                                                                                                                 \
-    {                                                                                                                  \
-        cudaError_t cudaStatus = call;                                                                                 \
-        if (cudaStatus != cudaSuccess)                                                                                 \
-        {                                                                                                              \
-            return err;                                                                                                \
-        }                                                                                                              \
-    } while (0)
-
-#define DEBUG_PRINTF(...)                                                                                              \
-    do                                                                                                                 \
-    {                                                                                                                  \
-    } while (0)
-
-#else
-
-#define ASSERT_PARAM(exp)                                                                                              \
-    do                                                                                                                 \
-    {                                                                                                                  \
-        if (!(exp))                                                                                                    \
-        {                                                                                                              \
-            fprintf(stderr, "Bad param - " #exp ", %s:%d\n", __FILE__, __LINE__);                                      \
-            return STATUS_BAD_PARAM;                                                                                   \
-        }                                                                                                              \
-    } while (0)
-
-#define ASSERT_FAILURE(exp)                                                                                            \
-    do                                                                                                                 \
-    {                                                                                                                  \
-        if (!(exp))                                                                                                    \
-        {                                                                                                              \
-            fprintf(stderr, "Failure - " #exp ", %s:%d\n", __FILE__, __LINE__);                                        \
-            return STATUS_FAILURE;                                                                                     \
-        }                                                                                                              \
-    } while (0)
-
-#define CSC(call, err)                                                                                                 \
-    do                                                                                                                 \
-    {                                                                                                                  \
-        cudaError_t cudaStatus = call;                                                                                 \
-        if (cudaStatus != cudaSuccess)                                                                                 \
-        {                                                                                                              \
-            printf("%s %d CUDA FAIL %s\n", __FILE__, __LINE__, cudaGetErrorString(cudaStatus));                        \
-            return err;                                                                                                \
-        }                                                                                                              \
-    } while (0)
-
-#define PLUGIN_CHECK(status)                                                                                           \
-    {                                                                                                                  \
-        if (status != 0)                                                                                               \
-        {                                                                                                              \
-            DEBUG_PRINTF("%s %d CUDA FAIL %s\n", __FILE__, __LINE__, cudaGetErrorString(status));                      \
-            abort();                                                                                                   \
-        }                                                                                                              \
-    }
-
-#define DEBUG_PRINTF(...)                                                                                              \
-    do                                                                                                                 \
-    {                                                                                                                  \
-        printf(__VA_ARGS__);                                                                                           \
-    } while (0)
-
-#endif // DEBUG
-
-#define NVML_CHECK(cmd)                                                                                                \
-    do                                                                                                                 \
-    {                                                                                                                  \
-        nvmlReturn_t r = cmd;                                                                                          \
-        if (r != NVML_SUCCESS)                                                                                         \
-        {                                                                                                              \
-            printf("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__, nvmlErrorString(r));                         \
-            exit(EXIT_FAILURE);                                                                                        \
-        }                                                                                                              \
-    } while (0)
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/quantization.h b/sgl-kernel/3rdparty/tensorrt_llm/common/quantization.h
new file mode 100644
index 000000000..052d9c8c8
--- /dev/null
+++ b/sgl-kernel/3rdparty/tensorrt_llm/common/quantization.h
@@ -0,0 +1,358 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <optional>
+#include <string>
+
+namespace tensorrt_llm
+{
+namespace common
+{
+
+class QuantMode
+{
+    // [WARNING] KEEP BELOW DEFINITION IN SYNC WITH tensorrt_llm/quantization/mode.py
+public:
+    using BaseType = std::uint32_t;
+
+    explicit constexpr QuantMode(BaseType value) noexcept
+        : mValue{value}
+    {
+    }
+
+    QuantMode() noexcept = default;
+
+    constexpr QuantMode(QuantMode const&) noexcept = default;
+
+    constexpr QuantMode& operator=(QuantMode const& other) noexcept = default;
+
+    static constexpr QuantMode none() noexcept
+    {
+        return QuantMode(BaseType(0));
+    }
+
+    static constexpr QuantMode int4Weights() noexcept
+    {
+        return QuantMode(BaseType(1u) << 0);
+    }
+
+    static constexpr QuantMode int8Weights() noexcept
+    {
+        return QuantMode(BaseType(1u) << 1);
+    }
+
+    static constexpr QuantMode activations() noexcept
+    {
+        return QuantMode(BaseType(1u) << 2);
+    }
+
+    static constexpr QuantMode perChannelScaling() noexcept
+    {
+        return QuantMode(BaseType(1u) << 3);
+    }
+
+    static constexpr QuantMode perTokenScaling() noexcept
+    {
+        return QuantMode(BaseType(1u) << 4);
+    }
+
+    static constexpr QuantMode perGroupScaling() noexcept
+    {
+        return QuantMode(BaseType(1u) << 5);
+    }
+
+    static constexpr QuantMode int8KvCache() noexcept
+    {
+        return QuantMode(BaseType(1u) << 6);
+    }
+
+    static constexpr QuantMode fp8KvCache() noexcept
+    {
+        return QuantMode(BaseType(1u) << 7);
+    }
+
+    static constexpr QuantMode fp8Qdq() noexcept
+    {
+        return QuantMode(BaseType(1u) << 8);
+    }
+
+    static constexpr QuantMode fp8RowWise() noexcept
+    {
+        return QuantMode(BaseType(1u) << 3 | BaseType(1u) << 4 | BaseType(1u) << 9);
+    }
+
+    static constexpr QuantMode w4a8QServe() noexcept
+    {
+        return QuantMode(BaseType(1u) << 10);
+    }
+
+    constexpr BaseType value() const noexcept
+    {
+        return mValue;
+    }
+
+    constexpr bool isSet(QuantMode const& mode) const noexcept
+    {
+        return (mValue & mode.value()) == mode.value();
+    }
+
+    constexpr bool hasInt4Weights() const noexcept
+    {
+        return isSet(int4Weights());
+    }
+
+    constexpr bool hasInt8Weights() const noexcept
+    {
+        return isSet(int8Weights());
+    }
+
+    constexpr bool hasActivations() const noexcept
+    {
+        return isSet(activations());
+    }
+
+    constexpr bool hasPerChannelScaling() const noexcept
+    {
+        return isSet(perChannelScaling());
+    }
+
+    constexpr bool hasPerTokenScaling() const noexcept
+    {
+        return isSet(perTokenScaling());
+    }
+
+    constexpr bool hasPerGroupScaling() const noexcept
+    {
+        return isSet(perGroupScaling());
+    }
+
+    constexpr bool hasStaticActivationScaling() const noexcept
+    {
+        return !hasPerTokenScaling();
+    }
+
+    constexpr bool hasInt8KvCache() const noexcept
+    {
+        return isSet(int8KvCache());
+    }
+
+    constexpr bool hasFp8KvCache() const noexcept
+    {
+        return isSet(fp8KvCache());
+    }
+
+    constexpr bool hasFp8Qdq() const noexcept
+    {
+        return isSet(fp8Qdq());
+    }
+
+    constexpr bool hasFp8RowWise() const noexcept
+    {
+        return isSet(fp8RowWise());
+    }
+
+    constexpr bool hasKvCacheQuant() const noexcept
+    {
+        return hasInt8KvCache() || hasFp8KvCache();
+    }
+
+    static constexpr QuantMode fromDescription(bool quantizeWeights = false, bool quantizeActivations = false,
+        bool perToken = false, bool perChannel = false, bool perGroup = false, bool useInt4Weights = false,
+        bool useInt8KvCache = false, bool useFp8KvCache = false, bool useFp8Qdq = false, bool useFp8RowWise = false,
+        bool useW4a8QServe = false)
+    {
+        QuantMode quantMode{};
+        if (quantizeWeights)
+        {
+            if (useInt4Weights)
+                quantMode += int4Weights();
+            else
+                quantMode += int8Weights();
+        }
+
+        if (quantizeActivations)
+        {
+            quantMode += activations();
+        }
+
+        if (perChannel)
+        {
+            quantMode += QuantMode::perChannelScaling();
+        }
+        if (perToken)
+        {
+            quantMode += QuantMode::perTokenScaling();
+        }
+        if (perGroup)
+        {
+            quantMode += QuantMode::perGroupScaling();
+        }
+
+        if (useInt8KvCache)
+        {
+            quantMode += int8KvCache();
+        }
+
+        if (useFp8KvCache)
+        {
+            quantMode += fp8KvCache();
+        }
+
+        if (useFp8Qdq)
+        {
+            quantMode += fp8Qdq();
+        }
+
+        if (useFp8RowWise)
+        {
+            quantMode += fp8RowWise();
+        }
+
+        if (useW4a8QServe)
+        {
+            quantMode += w4a8QServe();
+        }
+
+        return quantMode;
+    }
+
+    static constexpr QuantMode useSmoothQuant(bool perToken = false, bool perChannel = false)
+    {
+        return fromDescription(true, true, perToken, perChannel);
+    }
+
+    static constexpr QuantMode useQServe(bool perGroup)
+    {
+        return fromDescription(true, true, false, false, perGroup, true, false, false, false, false, true);
+    }
+
+    static constexpr QuantMode useWeightOnly(bool useInt4Weights = false, bool perGroup = false)
+    {
+        return fromDescription(true, false, false, false, perGroup, useInt4Weights);
+    }
+
+    static QuantMode const fromQuantAlgo(
+        std::optional<std::string> quantAlgo = std::nullopt, std::optional<std::string> kvCacheQuantAlgo = std::nullopt)
+    {
+        QuantMode quantMode{};
+        if (quantAlgo == "W8A16")
+        {
+            quantMode = useWeightOnly(false, false);
+        }
+        else if (quantAlgo == "W4A16")
+        {
+            quantMode = useWeightOnly(true, false);
+        }
+        else if (quantAlgo == "W4A16_AWQ")
+        {
+            quantMode = useWeightOnly(true, true);
+        }
+        else if (quantAlgo == "W4A8_AWQ")
+        {
+            quantMode = useWeightOnly(true, true);
+        }
+        else if (quantAlgo == "W4A8_QSERVE_PER_GROUP")
+        {
+            quantMode = useQServe(false);
+        }
+        else if (quantAlgo == "W4A8_QSERVE_PER_CHANNEL")
+        {
+            quantMode = useQServe(true);
+        }
+        else if (quantAlgo == "W4A16_GPTQ")
+        {
+            quantMode = useWeightOnly(true, true);
+        }
+        else if (quantAlgo == "W8A8_SQ_PER_CHANNEL")
+        {
+            quantMode = useSmoothQuant(false, true);
+        }
+        else if (quantAlgo == "W8A8_SQ_PER_TENSOR_PLUGIN")
+        {
+            quantMode = useSmoothQuant(false, false);
+        }
+        else if (quantAlgo == "W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN")
+        {
+            quantMode = useSmoothQuant(true, true);
+        }
+        else if (quantAlgo == "W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN")
+        {
+            quantMode = useSmoothQuant(false, true);
+        }
+        else if (quantAlgo == "W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN")
+        {
+            quantMode = useSmoothQuant(true, false);
+        }
+        else if (quantAlgo == "FP8")
+        {
+            quantMode = fromDescription(false, false, false, false, false, false, false, false, true);
+        }
+        else if (quantAlgo == "FP8_ROWWISE")
+        {
+            quantMode = fromDescription(false, false, true, true, false, false, false, false, false, true);
+        }
+
+        if (kvCacheQuantAlgo == "INT8")
+        {
+            quantMode += int8KvCache();
+        }
+        else if (kvCacheQuantAlgo == "FP8")
+        {
+            quantMode += fp8KvCache();
+        }
+
+        return quantMode;
+    }
+
+    constexpr QuantMode operator+(QuantMode const& other) const noexcept
+    {
+        return QuantMode(mValue | other.mValue);
+    }
+
+    constexpr QuantMode& operator+=(QuantMode const& other) noexcept
+    {
+        return *this = *this + other;
+    }
+
+    constexpr QuantMode operator-(QuantMode const& other) const noexcept
+    {
+        return QuantMode(mValue & ~other.mValue);
+    }
+
+    constexpr QuantMode& operator-=(QuantMode const& other) noexcept
+    {
+        return *this = *this - other;
+    }
+
+    constexpr bool operator==(QuantMode const& other) const noexcept
+    {
+        return mValue == other.mValue;
+    }
+
+    constexpr bool operator!=(QuantMode const& other) const noexcept
+    {
+        return !(*this == other);
+    }
+
+private:
+    BaseType mValue{0};
+};
+
+} // namespace common
+} // namespace tensorrt_llm
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/stlUtils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/stlUtils.h
deleted file mode 100644
index 9cda9fa0d..000000000
--- a/sgl-kernel/3rdparty/tensorrt_llm/common/stlUtils.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <functional>
-#include <numeric>
-#include <optional>
-#include <sstream>
-
-namespace tensorrt_llm::common::stl_utils
-{
-
-template <typename TInputIt, typename TOutputIt, typename TBinOp>
-constexpr TOutputIt basicInclusiveScan(TInputIt first, TInputIt last, TOutputIt dFirst, TBinOp op)
-{
-    if (first != last)
-    {
-        auto val = *first;
-        while (true)
-        {
-            *dFirst = val;
-            ++dFirst;
-            ++first;
-            if (first == last)
-            {
-                break;
-            }
-            val = op(std::move(val), *first);
-        }
-    }
-    return dFirst;
-}
-
-template <typename TInputIt, typename TOutputIt>
-constexpr TOutputIt inclusiveScan(TInputIt first, TInputIt last, TOutputIt dFirst)
-{
-#if defined(__GNUC__) && __GNUC__ <= 8
-    return basicInclusiveScan(first, last, dFirst, std::plus<>{});
-#else
-    return std::inclusive_scan(first, last, dFirst);
-#endif
-}
-
-template <typename TInputIt, typename TOutputIt, typename T, typename TBinOp>
-constexpr TOutputIt basicExclusiveScan(TInputIt first, TInputIt last, TOutputIt dFirst, T init, TBinOp op)
-{
-    if (first != last)
-    {
-        while (true)
-        {
-            T tmp{op(init, *first)};
-            *dFirst = init;
-            ++dFirst;
-            ++first;
-            if (first == last)
-            {
-                break;
-            }
-            init = std::move(tmp);
-        }
-    }
-    return dFirst;
-}
-
-template <typename TInputIt, typename TOutputIt, typename T>
-constexpr TOutputIt exclusiveScan(TInputIt first, TInputIt last, TOutputIt dFirst, T init)
-{
-#if defined(__GNUC__) && __GNUC__ <= 8
-    return basicExclusiveScan(first, last, dFirst, std::move(init), std::plus<>{});
-#else
-    return std::exclusive_scan(first, last, dFirst, std::move(init));
-#endif
-}
-
-template <typename T, typename = void>
-struct HasOperatorOutput : std::false_type
-{
-};
-
-template <typename T>
-struct HasOperatorOutput<T, std::void_t<decltype((std::declval<std::ostream&>() << std::declval<T>()))>>
-    : std::true_type
-{
-};
-
-template <typename T>
-std::string toString(T const& t, typename std::enable_if_t<HasOperatorOutput<T>::value, int> = 0)
-{
-    std::ostringstream oss;
-    oss << t;
-    return oss.str();
-}
-
-template <typename T>
-std::string toString(std::optional<T> const& t, typename std::enable_if_t<HasOperatorOutput<T>::value, int> = 0)
-{
-    std::ostringstream oss;
-    if (t)
-    {
-        oss << t.value();
-    }
-    else
-    {
-        oss << "None";
-    }
-    return oss.str();
-}
-
-} // namespace tensorrt_llm::common::stl_utils
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/stringUtils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/stringUtils.h
new file mode 100644
index 000000000..9c5ecde98
--- /dev/null
+++ b/sgl-kernel/3rdparty/tensorrt_llm/common/stringUtils.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#if ENABLE_BF16
+#include <cuda_bf16.h>
+#endif // ENABLE_BF16
+#include <cuda_fp16.h>
+
+#include <memory>  // std::make_unique
+#include <sstream> // std::stringstream
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+namespace tensorrt_llm::common
+{
+#if ENABLE_BF16
+static inline std::basic_ostream<char>& operator<<(std::basic_ostream<char>& stream, __nv_bfloat16 const& val)
+{
+    stream << __bfloat162float(val);
+    return stream;
+}
+#endif // ENABLE_BF16
+
+static inline std::basic_ostream<char>& operator<<(std::basic_ostream<char>& stream, __half const& val)
+{
+    stream << __half2float(val);
+    return stream;
+}
+
+inline std::string fmtstr(std::string const& s)
+{
+    return s;
+}
+
+inline std::string fmtstr(std::string&& s)
+{
+    return s;
+}
+
+#if defined(_MSC_VER)
+std::string fmtstr(char const* format, ...);
+#else
+std::string fmtstr(char const* format, ...) __attribute__((format(printf, 1, 2)));
+#endif
+
+// __PRETTY_FUNCTION__ is used for neat debugging printing but is not supported on Windows
+// The alternative is __FUNCSIG__, which is similar but not identical
+#if defined(_WIN32)
+#define __PRETTY_FUNCTION__ __FUNCSIG__
+#endif
+
+auto constexpr kDefaultDelimiter = ", ";
+
+template <typename U, typename TStream, typename T>
+inline TStream& arr2outCasted(TStream& out, T* arr, size_t size, char const* delim = kDefaultDelimiter)
+{
+    out << "(";
+    if (size > 0)
+    {
+        for (size_t i = 0; i < size - 1; ++i)
+        {
+            out << static_cast<U>(arr[i]) << delim;
+        }
+        out << static_cast<U>(arr[size - 1]);
+    }
+    out << ")";
+    return out;
+}
+
+template <typename TStream, typename T>
+inline TStream& arr2out(TStream& out, T* arr, size_t size, char const* delim = kDefaultDelimiter)
+{
+    return arr2outCasted<T>(out, arr, size, delim);
+}
+
+template <typename T>
+inline std::string arr2str(T* arr, size_t size, char const* delim = kDefaultDelimiter)
+{
+    std::stringstream ss;
+    return arr2out(ss, arr, size, delim).str();
+}
+
+template <typename T>
+inline std::string vec2str(std::vector<T> const& vec, char const* delim = kDefaultDelimiter)
+{
+    return arr2str(vec.data(), vec.size(), delim);
+}
+
+inline bool strStartsWith(std::string const& str, std::string const& prefix)
+{
+    return str.rfind(prefix, 0) == 0;
+}
+
+/// @brief Split a string into a set of strings using a delimiter
+std::unordered_set<std::string> str2set(std::string const& input, char delimiter);
+
+} // namespace tensorrt_llm::common
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/timestampUtils.cpp b/sgl-kernel/3rdparty/tensorrt_llm/common/timestampUtils.cpp
deleted file mode 100644
index c00041abd..000000000
--- a/sgl-kernel/3rdparty/tensorrt_llm/common/timestampUtils.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <chrono>
-#include <iomanip>
-#include <sstream>
-
-#include "tensorrt_llm/common/timestampUtils.h"
-
-namespace tensorrt_llm::common
-{
-
-std::string getCurrentTimestamp()
-{
-    auto now = std::chrono::system_clock::now();
-    auto now_t = std::chrono::system_clock::to_time_t(now);
-    auto tm = *std::localtime(&now_t);
-
-    auto epoch_to_now = now.time_since_epoch();
-    auto seconds = std::chrono::duration_cast<std::chrono::seconds>(epoch_to_now);
-    auto us = std::chrono::duration_cast<std::chrono::microseconds>(epoch_to_now - seconds);
-
-    std::ostringstream stream;
-    stream << std::put_time(&tm, "%m-%d-%Y %H:%M:%S");
-    stream << "." << std::setfill('0') << std::setw(6) << us.count();
-    return stream.str();
-}
-
-} // namespace tensorrt_llm::common
diff --git a/sgl-kernel/3rdparty/tensorrt_llm/common/timestampUtils.h b/sgl-kernel/3rdparty/tensorrt_llm/common/tllmException.h
similarity index 50%
rename from sgl-kernel/3rdparty/tensorrt_llm/common/timestampUtils.h
rename to sgl-kernel/3rdparty/tensorrt_llm/common/tllmException.h
index f52f23028..47e0e63d3 100644
--- a/sgl-kernel/3rdparty/tensorrt_llm/common/timestampUtils.h
+++ b/sgl-kernel/3rdparty/tensorrt_llm/common/tllmException.h
@@ -14,12 +14,35 @@
  * limitations under the License.
  */
 
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <stdexcept>
 #include <string>
 
+#define NEW_TLLM_EXCEPTION(...)                                                                                        \
+    tensorrt_llm::common::TllmException(__FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__))
+
 namespace tensorrt_llm::common
 {
 
-/// @brief Get the current timestamp in the format "MM-DD-YYYY HH:MM:SS:uuuuuu"
-std::string getCurrentTimestamp();
+class TllmException : public std::runtime_error
+{
+public:
+    static auto constexpr MAX_FRAMES = 128;
+
+    explicit TllmException(char const* file, std::size_t line, std::string const& msg);
+
+    ~TllmException() noexcept override;
+
+    [[nodiscard]] std::string getTrace() const;
+
+    static std::string demangle(char const* name);
+
+private:
+    std::array<void*, MAX_FRAMES> mCallstack{};
+    int mNbFrames;
+};
 
 } // namespace tensorrt_llm::common