perf: Avoid unnecessary data type conversions for DeepSeek-V3 on Blackwell (#9834)

Signed-off-by: Jinyang Yuan <154768711+jinyangyuan-nvidia@users.noreply.github.com>
This commit is contained in:
Jinyang Yuan
2025-09-06 14:06:46 +08:00
committed by GitHub
parent 90dfe3de4c
commit 012584ecd5
2 changed files with 13 additions and 4 deletions

View File

@@ -655,7 +655,8 @@ def _set_envs_and_config(server_args: ServerArgs):
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
os.environ["CUDA_MODULE_LOADING"] = "AUTO"
# flashinfer uses this environment variable for various kernels from MoE to quant kernels
os.environ["TRTLLM_ENABLE_PDL"] = "1"
if os.environ.get("TRTLLM_ENABLE_PDL", "1") != "0":
os.environ["TRTLLM_ENABLE_PDL"] = "1"
# Can also be passed as argument
os.environ["SGLANG_RUN_ID"] = (