Merge pull request #3 from xyDong0223/main

[Kernel] Enable fast random sample on Kunlun3 Platform
2025-12-11 11:47:30 +08:00
parent 0d4d4967cf af2cd6097f
commit fae22c2e62
1 changed files with 7 additions and 2 deletions
--- a/vllm_kunlun/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm_kunlun/v1/sample/ops/topk_topp_sampler.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional
-
+import os
 import torch
 import torch.nn as nn
 from packaging import version
@@ -150,7 +150,12 @@ def random_sample(
     # not have its own seed. Then, we overwrite the values for the requests
     # that have their own seeds.
     if len(generators) != probs.shape[0]:
-        q.exponential_()
+        if os.getenv('FAST_RANDOM_SAMPLE') == "1":
+            q.uniform_()
+            q = -torch.log(q)
+            q = q.clamp(min=1e-4)
+        else:
+            q.exponential_()
     if generators:
         # TODO(woosuk): This can be slow because we handle each request
         # one by one. Optimize this.