Support FA3 as Attention backend by using --attention-backend fa3 (#4680)

Co-authored-by: qsong <qsong@linkedin.com>
Co-authored-by: qingquansong <ustcsqq@gmail.com>
This commit is contained in:
Stefan He
2025-03-23 23:28:11 -07:00
committed by GitHub
parent af6535e7aa
commit 5d7edc8e55
5 changed files with 622 additions and 1 deletion

View File

@@ -501,6 +501,7 @@ def get_dataset(args, tokenizer):
question_len=args.gsp_question_len,
output_len=args.gsp_output_len,
tokenizer=tokenizer,
args=args,
)
else:
raise ValueError(f"Unknown dataset: {args.dataset_name}")
@@ -788,6 +789,7 @@ def sample_generated_shared_prefix_requests(
question_len: int,
output_len: int,
tokenizer: PreTrainedTokenizerBase,
args: argparse.Namespace,
) -> List[Tuple[str, int, int]]:
"""Generate benchmark requests with shared system prompts using random tokens and caching."""
cache_path = get_gen_prefix_cache_path(args, tokenizer)