diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 6ed6046ee..7fd91a5e9 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -37,7 +37,7 @@ jobs:
       - name: Install dependencies
         env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
+          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
         run: |
           bash scripts/ci_install_dependency.sh
@@ -60,7 +60,7 @@ jobs:
       - name: Install dependencies
         env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
+          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
         run: |
           bash scripts/ci_install_dependency.sh
@@ -84,7 +84,7 @@ jobs:
       - name: Install dependencies
         env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
+          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
         run: |
           bash scripts/ci_install_dependency.sh
@@ -121,7 +121,7 @@ jobs:
       - name: Install dependencies
         env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
+          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
         run: |
           bash scripts/ci_install_dependency.sh
@@ -165,7 +165,7 @@ jobs:
       - name: Install dependencies
         env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
+          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
         run: |
           bash scripts/ci_install_dependency.sh
@@ -196,7 +196,7 @@ jobs:
       - name: Install dependencies
         env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
+          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
         run: |
           bash scripts/ci_install_dependency.sh
@@ -234,7 +234,7 @@ jobs:
       - name: Install dependencies
         env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
+          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
         run: |
           bash scripts/ci_install_dependency.sh
@@ -258,7 +258,7 @@ jobs:
       - name: Install dependencies
         env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
+          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
         run: |
           bash scripts/ci_install_dependency.sh
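Note: every `Install dependencies` step above picks the FlashInfer wheel index with GitHub Actions' `&& ... || ...` ternary idiom; this PR only bumps the torch tag in those URLs from `torch2.4` to `torch2.5`. For readers unfamiliar with the idiom, a rough Python equivalent of the selection logic (a hypothetical helper for illustration, not code from this PR):

```python
def flashinfer_repo(version: str) -> str:
    """Hypothetical mirror of the workflow's wheel-index choice."""
    channel = "nightly/" if version == "nightly" else ""
    return f"https://flashinfer.ai/whl/{channel}cu124/torch2.5/flashinfer"

assert flashinfer_repo("nightly") == "https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer"
assert flashinfer_repo("release") == "https://flashinfer.ai/whl/cu124/torch2.5/flashinfer"
```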
diff --git a/python/pyproject.toml b/python/pyproject.toml
index cf997fc96..c600ffc0d 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -26,7 +26,7 @@ runtime_common = [
 ]
 srt = [
     "sglang[runtime_common]", "cuda-python", "sgl-kernel>=0.0.3.post1", "torch", "vllm==0.6.4.post1",
-    "flashinfer==0.1.6", "outlines>=0.0.44,<0.1.0"
+    "flashinfer_python>=0.2.0.post2", "outlines>=0.0.44,<0.1.0"
 ]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py
index 098a3d1e3..7f01e312c 100644
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -316,8 +316,8 @@ def _set_envs_and_config(server_args: ServerArgs):
     # Check flashinfer version
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
-            "flashinfer",
-            "0.1.6",
+            "flashinfer_python",
+            "0.2.0.post2",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py
index 863cb031d..1f701f946 100644
--- a/python/sglang/srt/layers/attention/flashinfer_backend.py
+++ b/python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -149,6 +149,7 @@ class FlashInferAttnBackend(AttentionBackend):
                 BatchPrefillWithPagedKVCacheWrapper(
                     self.workspace_buffer,
                     "NHD",
+                    backend="fa2",
                 )
             )
             self.prefill_wrappers_verify.append(
@@ -313,7 +314,7 @@ class FlashInferAttnBackend(AttentionBackend):
                     paged_kv_indices_buf=self.cuda_graph_kv_indices[i],
                     paged_kv_last_page_len_buf=self.kv_last_page_len[:bs],
                     custom_mask_buf=self.cuda_graph_custom_mask,
-                    qk_indptr_buf=self.cuda_graph_qk_indptr[i][: bs + 1],
+                    mask_indptr_buf=self.cuda_graph_qk_indptr[i][: bs + 1],
                 )
             )
         seq_lens_sum = seq_lens.sum().item()
@@ -1155,41 +1156,24 @@ def fast_decode_plan(
         self.last_page_len = torch.ones(32768, dtype=torch.int32)
     empty_q_data = self.empty_q_data
     empty_kv_cache = self.empty_kv_cache
-    if self.use_tensor_cores:
-        if not self.is_cuda_graph_enabled:
-            # when not using cudagraph, we need to create the indptr buffer, otherwise
-            # the buffer is already created during initialization
-            self._qo_indptr_buf = torch.arange(
-                batch_size + 1, dtype=torch.int32, device=indptr.device
-            )
-        self._wrapper.plan(
-            self._float_workspace_buffer,
-            self._int_workspace_buffer,
-            self._qo_indptr_buf,
-            indptr,
-            batch_size,
-            num_qo_heads,
-            num_kv_heads,
-            head_dim,
-            page_size,
-            empty_q_data,
-        )
-    else:
-        self._wrapper.plan(
-            self._float_workspace_buffer,
-            self._int_workspace_buffer,
-            indptr,
-            self.last_page_len,
-            batch_size,
-            num_qo_heads,
-            num_kv_heads,
-            head_dim,
-            page_size,
-            PosEncodingMode[pos_encoding_mode].value,
-            logits_soft_cap,
-            empty_q_data,
-            empty_kv_cache,
-        )
+    stream = torch.cuda.current_stream()
+    self._cached_module.plan(
+        self._float_workspace_buffer,
+        self._int_workspace_buffer,
+        self._pin_memory_int_workspace_buffer,
+        indptr.to("cpu"),
+        batch_size,
+        num_qo_heads,
+        num_kv_heads,
+        page_size,
+        self.is_cuda_graph_enabled,
+        window_left,
+        logits_soft_cap,
+        head_dim,
+        empty_q_data,
+        empty_kv_cache,
+        stream.cuda_stream,
+    )
     self._pos_encoding_mode = pos_encoding_mode
     self._window_left = window_left
     self._logits_soft_cap = logits_soft_cap
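Note: the `backend="fa2"` argument is new in FlashInfer 0.2 and pins the prefill wrapper to the FA2 kernels instead of letting the library auto-select a backend; `qk_indptr_buf` was renamed to `mask_indptr_buf` in the same release. A minimal standalone sketch of the new constructor call, assuming `flashinfer_python>=0.2.0.post2` is installed, a CUDA device is available, and using an illustrative 128 MB workspace size (not the value sglang allocates):

```python
import torch
from flashinfer import BatchPrefillWithPagedKVCacheWrapper

# Scratch buffer shared by FlashInfer's plan/run kernels; 128 MB is illustrative.
workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda")

prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper(
    workspace_buffer,
    "NHD",          # KV-cache layout: (num_pages, page_size, num_heads, head_dim)
    backend="fa2",  # pin the FA2 kernels rather than auto-selection
)
```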
diff --git a/python/sglang/srt/speculative/eagle_utils.py b/python/sglang/srt/speculative/eagle_utils.py
index 0b8c99f04..4abcba955 100644
--- a/python/sglang/srt/speculative/eagle_utils.py
+++ b/python/sglang/srt/speculative/eagle_utils.py
@@ -69,6 +69,7 @@ class EagleDraftInput:
         accept_length_cpu = batch.spec_info.accept_length_cpu
         batch.extend_lens = [x + 1 for x in accept_length_cpu]
         batch.seq_lens = batch.spec_info.seq_lens_for_draft_extend
+        batch.req_pool_indices = batch.spec_info.req_pool_indices_for_draft_extend
 
         seq_lens_cpu = batch.seq_lens.tolist()
         pt = 0
@@ -353,8 +354,12 @@ class EagleVerifyInput:
             ]
             if has_finished:
                 draft_input.seq_lens_for_draft_extend = batch.seq_lens[unfinished_index]
+                draft_input.req_pool_indices_for_draft_extend = batch.req_pool_indices[
+                    unfinished_index
+                ]
             else:
                 draft_input.seq_lens_for_draft_extend = batch.seq_lens
+                draft_input.req_pool_indices_for_draft_extend = batch.req_pool_indices
 
         logits_output.next_token_logits = logits_output.next_token_logits[accept_index]
         return (
diff --git a/python/sglang/srt/speculative/eagle_worker.py b/python/sglang/srt/speculative/eagle_worker.py
index b5a3de6ca..6d84cc305 100644
--- a/python/sglang/srt/speculative/eagle_worker.py
+++ b/python/sglang/srt/speculative/eagle_worker.py
@@ -269,6 +269,7 @@ class EAGLEWorker(TpModelWorker):
 
     def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
         seq_lens_backup = batch.seq_lens
+        req_pool_indices_backup = batch.req_pool_indices
 
         self._set_mem_pool(batch, self.model_runner)
         batch.forward_mode = ForwardMode.DRAFT_EXTEND
@@ -284,6 +285,7 @@ class EAGLEWorker(TpModelWorker):
         # This is because `seq_lens` can be modified in `prepare_extend_after_decode`
         batch.forward_mode = ForwardMode.DECODE
         batch.seq_lens = seq_lens_backup
+        batch.req_pool_indices = req_pool_indices_backup
 
     def capture_for_decode(
         self, logits_output: LogitsProcessorOutput, forward_batch: ForwardBatch
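Note: the EAGLE changes above all enforce one invariant: wherever `seq_lens` is saved, filtered, or restored around the draft-extend pass, `req_pool_indices` now travels with it. Stripped down to the backup/restore pattern (using a hypothetical stand-in for `ScheduleBatch`, not the real class), `forward_draft_extend_after_decode` does the following:

```python
import torch

class Batch:
    """Hypothetical stand-in for sglang's ScheduleBatch."""
    def __init__(self) -> None:
        self.seq_lens = torch.tensor([5, 9])
        self.req_pool_indices = torch.tensor([0, 1])

def forward_draft_extend_after_decode(batch: Batch) -> None:
    # Back up both fields; the draft-extend forward pass may reassign them.
    seq_lens_backup = batch.seq_lens
    req_pool_indices_backup = batch.req_pool_indices

    # ... draft-extend forward pass runs here and can overwrite both ...

    # Restore both so the next decode step sees consistent batch state.
    batch.seq_lens = seq_lens_backup
    batch.req_pool_indices = req_pool_indices_backup
```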
diff --git a/scripts/ci_install_dependency.sh b/scripts/ci_install_dependency.sh
index 1a059d5ff..ffe405d5a 100755
--- a/scripts/ci_install_dependency.sh
+++ b/scripts/ci_install_dependency.sh
@@ -4,16 +4,17 @@ set -euxo pipefail
 # Install the dependency in CI.
 
 # Use repo from environment variable, passed from GitHub Actions
-FLASHINFER_REPO="${FLASHINFER_REPO:-https://flashinfer.ai/whl/cu124/torch2.4/flashinfer}"
+FLASHINFER_REPO="${FLASHINFER_REPO:-https://flashinfer.ai/whl/cu124/torch2.5/flashinfer}"
 
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 bash "${SCRIPT_DIR}/killall_sglang.sh"
 
 pip install --upgrade pip
-pip install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/
+pip uninstall flashinfer -y
+pip install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer/
 
 # Force reinstall flashinfer and torch_memory_saver
-pip install flashinfer==0.1.6 --find-links ${FLASHINFER_REPO} --force-reinstall --no-deps
+pip install flashinfer_python==0.2.0.post2 --find-links ${FLASHINFER_REPO} --force-reinstall --no-deps
 pip install torch_memory_saver --force-reinstall
 
 pip install transformers==4.45.2 sentence_transformers accelerate peft
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 1fbb7f92f..039fde96a 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -52,7 +52,6 @@ suites = {
         "test_vision_llm.py",
         "test_vision_openai_server.py",
         "test_w8a8_quantization.py",
-        "test_fp8_kvcache.py",
         "test_fp8_kernel.py",
     ],
     "nightly": [