[Feat] QWen-1M context support[2/2]: Update block sparse attention backend (#5949)
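The example added below exercises the updated backend through the offline engine API; the key switch is attention_backend="dual_chunk_flash_attn". As a minimal sketch of the same setup (arguments trimmed to the essentials and a short illustrative prompt; the full example below shows the settings actually used, and the remaining engine arguments are assumed to take their defaults):

import sglang as sgl

llm = sgl.Engine(
    model_path="Qwen/Qwen2.5-7B-Instruct-1M",
    attention_backend="dual_chunk_flash_attn",  # long-context backend from this commit
    context_length=1048576,
    tp_size=4,
)
# Illustrative smoke test, not part of the commit.
print(llm.generate(["Hello"], {"max_new_tokens": 8})[0]["text"])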
examples/runtime/engine/offline_batch_inference_qwen_1m.py (new file, 74 lines)
@@ -0,0 +1,74 @@
"""
Usage:
python3 offline_batch_inference_qwen_1m.py
"""

from urllib.request import urlopen

import sglang as sgl


def load_prompt() -> str:
    # Test cases with various lengths can be found at:
    #
    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/64k.txt
    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/200k.txt
    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/600k.txt
    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/1m.txt

    with urlopen(
        "https://qianwen-res.oss-cn-beijing.aliyuncs.com"
        "/Qwen2.5-1M/test-data/64k.txt",
        timeout=5,
    ) as response:
        prompt = response.read().decode("utf-8")
    return prompt


# Process the prompts: generate completions and print them.
def process_requests(llm: sgl.Engine, prompts: list[str]) -> None:
    # Create a sampling params object.
    sampling_params = {
        "temperature": 0.7,
        "top_p": 0.8,
        "top_k": 20,
        "repetition_penalty": 1.05,
        "max_new_tokens": 256,
    }
    # Generate texts from the prompts.
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    for output in outputs:
        # "prompt_tokens" is the prompt length in tokens, not a list of token ids.
        prompt_tokens = output["meta_info"]["prompt_tokens"]
        generated_text = output["text"]
        print(f"Prompt length: {prompt_tokens}, Generated text: {generated_text!r}")


# Create an LLM.
def initialize_engine() -> sgl.Engine:
    llm = sgl.Engine(
        model_path="Qwen/Qwen2.5-7B-Instruct-1M",
        context_length=1048576,  # 1M-token context window
        page_size=256,  # tokens per KV-cache page
        attention_backend="dual_chunk_flash_attn",  # backend exercised by this example
        tp_size=4,
        disable_radix_cache=True,
        enable_mixed_chunk=False,
        enable_torch_compile=False,
        chunked_prefill_size=131072,  # prefill long prompts in 128K-token chunks
        mem_fraction_static=0.6,
        log_level="DEBUG",
    )
    return llm


def main():
    llm = initialize_engine()
    prompt = load_prompt()
    process_requests(llm, [prompt])


if __name__ == "__main__":
    main()
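For scale, the engine arguments above relate as follows (plain arithmetic, not part of the commit):

context_length = 1048576  # 1M tokens = 1024 * 1024
chunked_prefill_size = 131072  # 128K tokens processed per prefill step
page_size = 256  # tokens per KV-cache page
print(context_length // chunked_prefill_size)  # 8 prefill chunks for a full-length prompt
print(context_length // page_size)  # 4096 KV-cache pages for a full-length sequence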