[1/2] Speed up trtllm_mla attention backend (>10% e2e) (#10473)

This commit is contained in:
fzyzcjy
2025-09-16 02:53:21 +08:00
committed by GitHub
parent 5c08d7d21d
commit 3b25dc127a
6 changed files with 119 additions and 3 deletions

View File

@@ -67,11 +67,8 @@ ALL_MODELS = [
ModelCase("openai-community/gpt2"),
ModelCase("microsoft/phi-1_5", trust_remote_code=True),
ModelCase("adept/persimmon-8b-chat"),
ModelCase("upstage/SOLAR-10.7B-Instruct-v1.0"),
ModelCase("inclusionAI/Ling-lite", trust_remote_code=True),
ModelCase("microsoft/Phi-3-small-8k-instruct", trust_remote_code=True),
ModelCase("allenai/OLMo-2-1124-7B-Instruct", skip_long_prompt=True),
ModelCase("ibm-granite/granite-3.0-2b-instruct", skip_long_prompt=True),