upgrade flashinfer v0.2.0.post2 (#3288)
Co-authored-by: pankajroark <pankajroark@users.noreply.github.com>
This commit is contained in:
16
.github/workflows/pr-test.yml
vendored
16
.github/workflows/pr-test.yml
vendored
@@ -37,7 +37,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
env:
|
env:
|
||||||
FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
|
FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
|
||||||
run: |
|
run: |
|
||||||
bash scripts/ci_install_dependency.sh
|
bash scripts/ci_install_dependency.sh
|
||||||
|
|
||||||
@@ -60,7 +60,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
env:
|
env:
|
||||||
FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
|
FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
|
||||||
run: |
|
run: |
|
||||||
bash scripts/ci_install_dependency.sh
|
bash scripts/ci_install_dependency.sh
|
||||||
|
|
||||||
@@ -84,7 +84,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
env:
|
env:
|
||||||
FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
|
FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
|
||||||
run: |
|
run: |
|
||||||
bash scripts/ci_install_dependency.sh
|
bash scripts/ci_install_dependency.sh
|
||||||
|
|
||||||
@@ -121,7 +121,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
env:
|
env:
|
||||||
FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
|
FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
|
||||||
run: |
|
run: |
|
||||||
bash scripts/ci_install_dependency.sh
|
bash scripts/ci_install_dependency.sh
|
||||||
|
|
||||||
@@ -165,7 +165,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
env:
|
env:
|
||||||
FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
|
FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
|
||||||
run: |
|
run: |
|
||||||
bash scripts/ci_install_dependency.sh
|
bash scripts/ci_install_dependency.sh
|
||||||
|
|
||||||
@@ -196,7 +196,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
env:
|
env:
|
||||||
FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
|
FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
|
||||||
run: |
|
run: |
|
||||||
bash scripts/ci_install_dependency.sh
|
bash scripts/ci_install_dependency.sh
|
||||||
|
|
||||||
@@ -234,7 +234,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
env:
|
env:
|
||||||
FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
|
FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
|
||||||
run: |
|
run: |
|
||||||
bash scripts/ci_install_dependency.sh
|
bash scripts/ci_install_dependency.sh
|
||||||
|
|
||||||
@@ -258,7 +258,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
env:
|
env:
|
||||||
FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
|
FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
|
||||||
run: |
|
run: |
|
||||||
bash scripts/ci_install_dependency.sh
|
bash scripts/ci_install_dependency.sh
|
||||||
|
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ runtime_common = [
|
|||||||
srt = [
|
srt = [
|
||||||
"sglang[runtime_common]", "cuda-python",
|
"sglang[runtime_common]", "cuda-python",
|
||||||
"sgl-kernel>=0.0.3.post1", "torch", "vllm==0.6.4.post1",
|
"sgl-kernel>=0.0.3.post1", "torch", "vllm==0.6.4.post1",
|
||||||
"flashinfer==0.1.6", "outlines>=0.0.44,<0.1.0"
|
"flashinfer_python>=0.2.0.post2", "outlines>=0.0.44,<0.1.0"
|
||||||
]
|
]
|
||||||
|
|
||||||
# HIP (Heterogeneous-computing Interface for Portability) for AMD
|
# HIP (Heterogeneous-computing Interface for Portability) for AMD
|
||||||
|
|||||||
@@ -316,8 +316,8 @@ def _set_envs_and_config(server_args: ServerArgs):
|
|||||||
# Check flashinfer version
|
# Check flashinfer version
|
||||||
if server_args.attention_backend == "flashinfer":
|
if server_args.attention_backend == "flashinfer":
|
||||||
assert_pkg_version(
|
assert_pkg_version(
|
||||||
"flashinfer",
|
"flashinfer_python",
|
||||||
"0.1.6",
|
"0.2.0.post2",
|
||||||
"Please uninstall the old version and "
|
"Please uninstall the old version and "
|
||||||
"reinstall the latest version by following the instructions "
|
"reinstall the latest version by following the instructions "
|
||||||
"at https://docs.flashinfer.ai/installation.html.",
|
"at https://docs.flashinfer.ai/installation.html.",
|
||||||
|
|||||||
@@ -149,6 +149,7 @@ class FlashInferAttnBackend(AttentionBackend):
|
|||||||
BatchPrefillWithPagedKVCacheWrapper(
|
BatchPrefillWithPagedKVCacheWrapper(
|
||||||
self.workspace_buffer,
|
self.workspace_buffer,
|
||||||
"NHD",
|
"NHD",
|
||||||
|
backend="fa2",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
self.prefill_wrappers_verify.append(
|
self.prefill_wrappers_verify.append(
|
||||||
@@ -313,7 +314,7 @@ class FlashInferAttnBackend(AttentionBackend):
|
|||||||
paged_kv_indices_buf=self.cuda_graph_kv_indices[i],
|
paged_kv_indices_buf=self.cuda_graph_kv_indices[i],
|
||||||
paged_kv_last_page_len_buf=self.kv_last_page_len[:bs],
|
paged_kv_last_page_len_buf=self.kv_last_page_len[:bs],
|
||||||
custom_mask_buf=self.cuda_graph_custom_mask,
|
custom_mask_buf=self.cuda_graph_custom_mask,
|
||||||
qk_indptr_buf=self.cuda_graph_qk_indptr[i][: bs + 1],
|
mask_indptr_buf=self.cuda_graph_qk_indptr[i][: bs + 1],
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
seq_lens_sum = seq_lens.sum().item()
|
seq_lens_sum = seq_lens.sum().item()
|
||||||
@@ -1155,41 +1156,24 @@ def fast_decode_plan(
|
|||||||
self.last_page_len = torch.ones(32768, dtype=torch.int32)
|
self.last_page_len = torch.ones(32768, dtype=torch.int32)
|
||||||
empty_q_data = self.empty_q_data
|
empty_q_data = self.empty_q_data
|
||||||
empty_kv_cache = self.empty_kv_cache
|
empty_kv_cache = self.empty_kv_cache
|
||||||
if self.use_tensor_cores:
|
stream = torch.cuda.current_stream()
|
||||||
if not self.is_cuda_graph_enabled:
|
self._cached_module.plan(
|
||||||
# when not using cudagraph, we need to create the indptr buffer, otherwise
|
self._float_workspace_buffer,
|
||||||
# the buffer is already created during initialization
|
self._int_workspace_buffer,
|
||||||
self._qo_indptr_buf = torch.arange(
|
self._pin_memory_int_workspace_buffer,
|
||||||
batch_size + 1, dtype=torch.int32, device=indptr.device
|
indptr.to("cpu"),
|
||||||
)
|
batch_size,
|
||||||
self._wrapper.plan(
|
num_qo_heads,
|
||||||
self._float_workspace_buffer,
|
num_kv_heads,
|
||||||
self._int_workspace_buffer,
|
page_size,
|
||||||
self._qo_indptr_buf,
|
self.is_cuda_graph_enabled,
|
||||||
indptr,
|
window_left,
|
||||||
batch_size,
|
logits_soft_cap,
|
||||||
num_qo_heads,
|
head_dim,
|
||||||
num_kv_heads,
|
empty_q_data,
|
||||||
head_dim,
|
empty_kv_cache,
|
||||||
page_size,
|
stream.cuda_stream,
|
||||||
empty_q_data,
|
)
|
||||||
)
|
|
||||||
else:
|
|
||||||
self._wrapper.plan(
|
|
||||||
self._float_workspace_buffer,
|
|
||||||
self._int_workspace_buffer,
|
|
||||||
indptr,
|
|
||||||
self.last_page_len,
|
|
||||||
batch_size,
|
|
||||||
num_qo_heads,
|
|
||||||
num_kv_heads,
|
|
||||||
head_dim,
|
|
||||||
page_size,
|
|
||||||
PosEncodingMode[pos_encoding_mode].value,
|
|
||||||
logits_soft_cap,
|
|
||||||
empty_q_data,
|
|
||||||
empty_kv_cache,
|
|
||||||
)
|
|
||||||
self._pos_encoding_mode = pos_encoding_mode
|
self._pos_encoding_mode = pos_encoding_mode
|
||||||
self._window_left = window_left
|
self._window_left = window_left
|
||||||
self._logits_soft_cap = logits_soft_cap
|
self._logits_soft_cap = logits_soft_cap
|
||||||
|
|||||||
@@ -69,6 +69,7 @@ class EagleDraftInput:
|
|||||||
accept_length_cpu = batch.spec_info.accept_length_cpu
|
accept_length_cpu = batch.spec_info.accept_length_cpu
|
||||||
batch.extend_lens = [x + 1 for x in accept_length_cpu]
|
batch.extend_lens = [x + 1 for x in accept_length_cpu]
|
||||||
batch.seq_lens = batch.spec_info.seq_lens_for_draft_extend
|
batch.seq_lens = batch.spec_info.seq_lens_for_draft_extend
|
||||||
|
batch.req_pool_indices = batch.spec_info.req_pool_indices_for_draft_extend
|
||||||
seq_lens_cpu = batch.seq_lens.tolist()
|
seq_lens_cpu = batch.seq_lens.tolist()
|
||||||
|
|
||||||
pt = 0
|
pt = 0
|
||||||
@@ -353,8 +354,12 @@ class EagleVerifyInput:
|
|||||||
]
|
]
|
||||||
if has_finished:
|
if has_finished:
|
||||||
draft_input.seq_lens_for_draft_extend = batch.seq_lens[unfinished_index]
|
draft_input.seq_lens_for_draft_extend = batch.seq_lens[unfinished_index]
|
||||||
|
draft_input.req_pool_indices_for_draft_extend = batch.req_pool_indices[
|
||||||
|
unfinished_index
|
||||||
|
]
|
||||||
else:
|
else:
|
||||||
draft_input.seq_lens_for_draft_extend = batch.seq_lens
|
draft_input.seq_lens_for_draft_extend = batch.seq_lens
|
||||||
|
draft_input.req_pool_indices_for_draft_extend = batch.req_pool_indices
|
||||||
|
|
||||||
logits_output.next_token_logits = logits_output.next_token_logits[accept_index]
|
logits_output.next_token_logits = logits_output.next_token_logits[accept_index]
|
||||||
return (
|
return (
|
||||||
|
|||||||
@@ -269,6 +269,7 @@ class EAGLEWorker(TpModelWorker):
|
|||||||
|
|
||||||
def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
|
def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
|
||||||
seq_lens_backup = batch.seq_lens
|
seq_lens_backup = batch.seq_lens
|
||||||
|
req_pool_indices_backup = batch.req_pool_indices
|
||||||
|
|
||||||
self._set_mem_pool(batch, self.model_runner)
|
self._set_mem_pool(batch, self.model_runner)
|
||||||
batch.forward_mode = ForwardMode.DRAFT_EXTEND
|
batch.forward_mode = ForwardMode.DRAFT_EXTEND
|
||||||
@@ -284,6 +285,7 @@ class EAGLEWorker(TpModelWorker):
|
|||||||
# This is because `seq_lens` can be modified in `prepare_extend_after_decode`
|
# This is because `seq_lens` can be modified in `prepare_extend_after_decode`
|
||||||
batch.forward_mode = ForwardMode.DECODE
|
batch.forward_mode = ForwardMode.DECODE
|
||||||
batch.seq_lens = seq_lens_backup
|
batch.seq_lens = seq_lens_backup
|
||||||
|
batch.req_pool_indices = req_pool_indices_backup
|
||||||
|
|
||||||
def capture_for_decode(
|
def capture_for_decode(
|
||||||
self, logits_output: LogitsProcessorOutput, forward_batch: ForwardBatch
|
self, logits_output: LogitsProcessorOutput, forward_batch: ForwardBatch
|
||||||
|
|||||||
@@ -4,16 +4,17 @@ set -euxo pipefail
|
|||||||
# Install the dependency in CI.
|
# Install the dependency in CI.
|
||||||
|
|
||||||
# Use repo from environment variable, passed from GitHub Actions
|
# Use repo from environment variable, passed from GitHub Actions
|
||||||
FLASHINFER_REPO="${FLASHINFER_REPO:-https://flashinfer.ai/whl/cu124/torch2.4/flashinfer}"
|
FLASHINFER_REPO="${FLASHINFER_REPO:-https://flashinfer.ai/whl/cu124/torch2.5/flashinfer}"
|
||||||
|
|
||||||
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
|
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
|
||||||
bash "${SCRIPT_DIR}/killall_sglang.sh"
|
bash "${SCRIPT_DIR}/killall_sglang.sh"
|
||||||
|
|
||||||
pip install --upgrade pip
|
pip install --upgrade pip
|
||||||
pip install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/
|
pip uninstall flashinfer -y
|
||||||
|
pip install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer/
|
||||||
|
|
||||||
# Force reinstall flashinfer and torch_memory_saver
|
# Force reinstall flashinfer and torch_memory_saver
|
||||||
pip install flashinfer==0.1.6 --find-links ${FLASHINFER_REPO} --force-reinstall --no-deps
|
pip install flashinfer_python==0.2.0.post2 --find-links ${FLASHINFER_REPO} --force-reinstall --no-deps
|
||||||
pip install torch_memory_saver --force-reinstall
|
pip install torch_memory_saver --force-reinstall
|
||||||
|
|
||||||
pip install transformers==4.45.2 sentence_transformers accelerate peft
|
pip install transformers==4.45.2 sentence_transformers accelerate peft
|
||||||
|
|||||||
@@ -52,7 +52,6 @@ suites = {
|
|||||||
"test_vision_llm.py",
|
"test_vision_llm.py",
|
||||||
"test_vision_openai_server.py",
|
"test_vision_openai_server.py",
|
||||||
"test_w8a8_quantization.py",
|
"test_w8a8_quantization.py",
|
||||||
"test_fp8_kvcache.py",
|
|
||||||
"test_fp8_kernel.py",
|
"test_fp8_kernel.py",
|
||||||
],
|
],
|
||||||
"nightly": [
|
"nightly": [
|
||||||
|
|||||||
Reference in New Issue
Block a user