first commit

2026-03-10 13:31:25 +08:00
parent ba974cecfa
commit b62b889355
2604 changed files with 438977 additions and 0 deletions
--- a/vllm_br/attention/init.py
+++ b/vllm_br/attention/init.py
@@ -0,0 +1,16 @@
+################################################################################
+# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+from . import layer  # noqa: F401
--- a/vllm_br/attention/pycache/init.cpython-310.pyc
+++ b/vllm_br/attention/pycache/init.cpython-310.pyc
--- a/vllm_br/attention/pycache/layer.cpython-310.pyc
+++ b/vllm_br/attention/pycache/layer.cpython-310.pyc
--- a/vllm_br/attention/layer.py
+++ b/vllm_br/attention/layer.py
@@ -0,0 +1,130 @@
+################################################################################
+# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+from typing import Optional
+
+import torch
+
+import vllm.attention.layer
+from vllm.attention.layer import (maybe_save_kv_layer_to_connector,
+                                  wait_for_kv_layer_from_connector)
+from vllm.forward_context import ForwardContext, get_forward_context
+
+#direct_register_custom_op(
+#    op_name="unified_attention",
+#    op_func=unified_attention,
+#    mutates_args=[],
+#    fake_impl=unified_attention_fake,
+#    dispatch_key=current_platform.dispatch_key,
+#)
+
+#direct_register_custom_op(
+#    op_name="unified_attention_with_output",
+#    op_func=unified_attention_with_output,
+#    mutates_args=["output"],
+#    fake_impl=unified_attention_with_output_fake,
+#    dispatch_key=current_platform.dispatch_key,
+#)
+
+
+def forward_(
+    self,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    # For some alternate attention backends like MLA the attention output
+    # shape does not match the query shape, so we optionally let the model
+    # definition specify the output tensor shape.
+    output_shape: Optional[torch.Size] = None,
+) -> torch.Tensor:
+    """
+        The KV cache is stored inside this class and is accessed via
+        `self.kv_cache`.
+
+        Attention metadata (`attn_metadata`) is set using a context manager in
+        the model runner's `execute_model` method. It is accessed via forward
+        context using
+        `vllm.forward_context.get_forward_context().attn_metadata`.
+        """
+    if self.calculate_kv_scales:
+        attn_metadata = get_forward_context().attn_metadata
+        if attn_metadata.enable_kv_scales_calculation:
+            self.calc_kv_scales(query, key, value)
+    if self.use_output:
+        output_shape = (output_shape
+                        if output_shape is not None else query.shape)
+        output = torch.empty(output_shape,
+                             dtype=query.dtype,
+                             device=query.device)
+        hidden_size = output_shape[-1]
+        # We skip reshaping query, key and value tensors for the MLA
+        # backend since these tensors have different semantics and are
+        # processed differently.
+        if not self.use_mla:
+            # Reshape the query, key, and value tensors.
+            # NOTE(woosuk): We do this outside the custom op to minimize the
+            # CPU overheads from the non-CUDA-graph regions.
+            query = query.view(-1, self.num_heads, self.head_size)
+            output = output.view(-1, self.num_heads, self.head_size)
+            if key is not None:
+                key = key.view(-1, self.num_kv_heads, self.head_size)
+            if value is not None:
+                value = value.view(-1, self.num_kv_heads, self.head_size)
+        if self.use_direct_call:
+            forward_context: ForwardContext = get_forward_context()
+            attn_metadata = forward_context.attn_metadata
+            if isinstance(attn_metadata, dict):
+                attn_metadata = attn_metadata[self.layer_name]
+            self_kv_cache = self.kv_cache[forward_context.virtual_engine]
+            self.impl.forward(self,
+                              query,
+                              key,
+                              value,
+                              self_kv_cache,
+                              attn_metadata,
+                              output=output)
+        else:
+            torch.ops.vllm.unified_attention_with_output(
+                query, key, value, output, self.layer_name)
+        return output.view(-1, hidden_size)
+    else:
+        if self.use_direct_call:
+            wait_for_kv_layer_from_connector(self.layer_name)
+            forward_context = get_forward_context()
+            attn_metadata = forward_context.attn_metadata
+            if isinstance(attn_metadata, dict):
+                attn_metadata = attn_metadata[self.layer_name]
+            self_kv_cache = self.kv_cache[forward_context.virtual_engine]
+            output = self.impl.forward(self, query, key, value, self_kv_cache,
+                                       attn_metadata)
+            maybe_save_kv_layer_to_connector(self.layer_name, self_kv_cache)
+            return output
+        else:
+            # return torch.ops.vllm.unified_attention(
+            # query, key, value, self.layer_name)
+            wait_for_kv_layer_from_connector(self.layer_name)
+            forward_context = get_forward_context()
+            attn_metadata = forward_context.attn_metadata
+            if isinstance(attn_metadata, dict):
+                attn_metadata = attn_metadata[self.layer_name]
+            self_kv_cache = self.kv_cache[forward_context.virtual_engine]
+            output = self.impl.forward(self, query, key, value, self_kv_cache,
+                                       attn_metadata)
+            maybe_save_kv_layer_to_connector(self.layer_name, self_kv_cache)
+            return output
+
+
+vllm.attention.layer.Attention.forward = forward_