Add a simple torch native attention backend (#2241)

2024-12-01 19:01:25 +08:00
parent fc78640e00
commit 62c516ac45
7 changed files with 388 additions and 26 deletions
--- a/python/sglang/srt/model_executor/forward_batch_info.py
+++ b/python/sglang/srt/model_executor/forward_batch_info.py
@@ -256,10 +256,15 @@ class ForwardBatch:
            ret.extend_prefix_lens = torch.tensor(
                batch.extend_prefix_lens, dtype=torch.int32
            ).to(device, non_blocking=True)
-            ret.extend_num_tokens = batch.extend_num_tokens
-            ret.positions, ret.extend_start_loc = compute_position_triton(
-                ret.extend_prefix_lens, ret.extend_seq_lens, ret.extend_num_tokens
-            )
+            if model_runner.server_args.attention_backend != "torch_native":
+                ret.extend_num_tokens = batch.extend_num_tokens
+                ret.positions, ret.extend_start_loc = compute_position_triton(
+                    ret.extend_prefix_lens, ret.extend_seq_lens, ret.extend_num_tokens
+                )
+            else:
+                ret.positions, ret.extend_start_loc = compute_position_torch(
+                    ret.extend_prefix_lens, ret.extend_seq_lens
+                )
            ret.extend_prefix_lens_cpu = batch.extend_prefix_lens
            ret.extend_seq_lens_cpu = batch.extend_seq_lens
            ret.extend_logprob_start_lens_cpu = batch.extend_logprob_start_lens
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -40,6 +40,7 @@ from vllm.model_executor.models import ModelRegistry
 from sglang.srt.configs.model_config import AttentionArch, ModelConfig
 from sglang.srt.layers.attention.double_sparsity_backend import DoubleSparseAttnBackend
 from sglang.srt.layers.attention.flashinfer_backend import FlashInferAttnBackend
+from sglang.srt.layers.attention.torch_native_backend import TorchNativeAttnBackend
 from sglang.srt.layers.attention.triton_backend import TritonAttnBackend
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.layers.sampler import Sampler
@@ -570,6 +571,8 @@ class ModelRunner:
                self.attn_backend = DoubleSparseAttnBackend(self)
            else:
                self.attn_backend = TritonAttnBackend(self)
+        elif self.server_args.attention_backend == "torch_native":
+            self.attn_backend = TorchNativeAttnBackend(self)
        else:
            raise ValueError(
                f"Invalid attention backend: {self.server_args.attention_backend}"