From b614823125a346a5e7f12fea005a3b6661843da9 Mon Sep 17 00:00:00 2001
From: zhaoyingzhuo <zhao_yingzhuo@foxmail.com>
Date: Wed, 10 Dec 2025 15:52:23 +0800
Subject: [PATCH] [chore] Remove obsolete comments

---
 vllm_kunlun/compilation/wrapper.py               |  7 +------
 vllm_kunlun/models/interns1_vit.py               |  1 -
 vllm_kunlun/ops/_kunlun_ops.py                   | 16 ----------------
 .../ops/attention/backends/kunlun_attn.py        |  9 ---------
 vllm_kunlun/ops/sample/sampler.py                |  5 -----
 vllm_kunlun/utils.py                             |  1 -
 6 files changed, 1 insertion(+), 38 deletions(-)

diff --git a/vllm_kunlun/compilation/wrapper.py b/vllm_kunlun/compilation/wrapper.py
index 2de4fa7..73f5d12 100644
--- a/vllm_kunlun/compilation/wrapper.py
+++ b/vllm_kunlun/compilation/wrapper.py
@@ -126,12 +126,7 @@ class TorchCompileWrapperWithCustomDispatcher:
                                  decompiled_file)
                 except Exception:
                     pass
-        # if self.vllm_config.compilation_config.use_cudagraph and \
-        #     "update" in new_code.co_names:
-        #     import depyf
-        #     src = depyf.decompile(new_code)
-        #     msg = "Assigning / modifying buffers of nn.Module during forward pass is not allowed when using cudagraph inside the compiler because it will cause silent errors. Please use eager mode or fix the code. The following code contains clues about which buffer is being modified (please search for the usage of the function `update`):\n" + src  # noqa
-        #     raise RuntimeError(msg)
+
 
     @contextmanager
     def dispatch_to_code(self, index: int):
diff --git a/vllm_kunlun/models/interns1_vit.py b/vllm_kunlun/models/interns1_vit.py
index f8a4868..bfdc6f9 100644
--- a/vllm_kunlun/models/interns1_vit.py
+++ b/vllm_kunlun/models/interns1_vit.py
@@ -253,7 +253,6 @@ class InternS1VisionMLP(nn.Module):
 
         self.config = config
         self.activation_fn = get_act_fn(config.hidden_act)
-        # self.activation_fn = GeluAndMul()
         self.fc1 = ColumnParallelLinear(config.hidden_size,
                                         config.intermediate_size,
                                         bias=True,
diff --git a/vllm_kunlun/ops/_kunlun_ops.py b/vllm_kunlun/ops/_kunlun_ops.py
index 1880a50..57515e1 100644
--- a/vllm_kunlun/ops/_kunlun_ops.py
+++ b/vllm_kunlun/ops/_kunlun_ops.py
@@ -73,7 +73,6 @@ class KunlunOps:
         alibi_sqrt=False,
     ):
         """PagedAttentionV1"""
-        # block_size = value_cache.shape[2]
         xtorch_ops.paged_attention(
             x=query,
             k_cache=key_cache,
@@ -116,7 +115,6 @@ class KunlunOps:
         alibi_sqrt=False,
     ):
         """PagedAttentionV2"""
-        # block_size = value_cache.shape[2]
         xtorch_ops.paged_attention(
             x=query,
             k_cache=key_cache,
@@ -221,17 +219,6 @@ class KunlunOps:
         num_heads = query_x.shape[1] // head_size
         num_kv_heads = key_x.shape[1] // head_size
 
-        # # [num_tokens, num_heads * head_size] -> [num_tokens, num_heads, head_size]
-        # query_x = query_x.view(num_tokens, num_heads, head_size)
-        # # [num_tokens, num_kv_heads * head_size] -> [num_tokens, num_kv_heads, head_size]
-        # key_x = key_x.view(num_tokens, num_kv_heads, head_size)
-
-        # # Ensure shapes are correct
-        # assert query_x.shape == (num_tokens, num_heads, head_size), \
-        #     f"Expected query shape [{num_tokens}, {num_heads}, {head_size}], got {query_x.shape}"
-        # assert key_x.shape == (num_tokens, num_kv_heads, head_size), \
-        #     f"Expected key shape [{num_tokens}, {num_kv_heads}, {head_size}], got {key_x.shape}"
-
         torch.ops._C.rotary_embedding(
             positions, query_x, key_x, head_size, cos_sin_cache, is_neox_style
         )
@@ -239,8 +226,6 @@ class KunlunOps:
         query_x = query_x.view(num_tokens, num_heads * head_size)
         key_x = key_x.view(num_tokens, num_kv_heads * head_size)
 
-        # query.data = query_x
-        # key.data  = key_x
         return query_x, key_x
 
     # Rotary embedding
@@ -290,7 +275,6 @@ class KunlunOps:
         kv_cache_dtype,
     ):
         """reshape_and_cache"""
-        # slot_mapping_cast = slot_mapping.to(torch.int32)
         xtorch_ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping)
 
     @staticmethod
diff --git a/vllm_kunlun/ops/attention/backends/kunlun_attn.py b/vllm_kunlun/ops/attention/backends/kunlun_attn.py
index ebb0e4f..267073c 100644
--- a/vllm_kunlun/ops/attention/backends/kunlun_attn.py
+++ b/vllm_kunlun/ops/attention/backends/kunlun_attn.py
@@ -271,13 +271,6 @@ class KunlunMetadata(AttentionMetadata, PagedAttentionMetadata):
             if self.context_lens_tensor is None
             else self.context_lens_tensor[: self.num_prefills]
         )
-        # for prefix cache, block table only contains blocks that hit
-        # if self.block_tables is None:
-        #     block_tables = None
-        # elif self.block_tables.shape[1] == 0:
-        #     block_tables = self.block_tables[:self.num_prefills]
-        # else:
-        #     block_tables = self.block_tables[:self.num_prefills][:, -1].clone()
 
         block_tables = (
             None
@@ -442,7 +435,6 @@ class KunlunMetadataBuilder(CommonMetadataBuilder[KunlunMetadata]):
             if inter_data.prefix_cache_hit:
                 assert context_len != 0
                 assert context_len % self.block_size == 0
-                # block_table = block_tables[seq_id]
                 block_table = block_tables[seq_id][: context_len // self.block_size]
             elif (not is_prompt) and block_tables is not None:
                 if curr_sliding_window_block == 0:
@@ -483,7 +475,6 @@ class KunlunMetadataBuilder(CommonMetadataBuilder[KunlunMetadata]):
             query_start_loc, dtype=torch.int32, device="cpu"
         )
         attn_meta.query_start_loc_host = query_start_loc_host
-        # max_kv_len = max(query_lens + prefix_cache_kv_lens)
         attn_meta.max_kv_len = max(self.prefix_cache_kv_lens + attn_meta.seq_lens)
 
         # If kv cache is included and there is a hit
diff --git a/vllm_kunlun/ops/sample/sampler.py b/vllm_kunlun/ops/sample/sampler.py
index b22c84c..7bada08 100644
--- a/vllm_kunlun/ops/sample/sampler.py
+++ b/vllm_kunlun/ops/sample/sampler.py
@@ -516,10 +516,6 @@ def _apply_top_k_top_p(
     top_p_mask[:, -1] = False
     logits_sort.masked_fill_(top_p_mask, -float("inf"))
 
-    # Re-sort the probabilities.
-    # logits = torch.empty_like(logits_sort).scatter_(dim=-1,
-    #                                                 index=logits_idx,
-    #                                                 src=logits_sort)
     return logits_sort, logits_idx
 
 
@@ -883,7 +879,6 @@ def _sample_with_torch(
                     seq_groups=seq_groups_arg,
                 )
                 if logits_idx is not None:
-                    # multinomial_samples[sampling_type] = logits_idx[:, result_idx[:][0]]
                     token_ids = logits_idx[long_sample_indices].gather(
                         dim=1, index=result_idx.to(logits_idx.device)
                     )
diff --git a/vllm_kunlun/utils.py b/vllm_kunlun/utils.py
index b21a786..6bc6778 100644
--- a/vllm_kunlun/utils.py
+++ b/vllm_kunlun/utils.py
@@ -200,7 +200,6 @@ class ModuleLoggingHookPost(object):
         self.name_list.pop()
 
 
-# if os.environ.get("ENABLE_VLLM_MODULE_HOOK", "0") == "1":
 if xenvs.ENABLE_VLLM_MODULE_HOOK:
     from torch.nn.modules.module import (
         register_module_forward_pre_hook,