Sync from v0.13
This commit is contained in:
354
benchmarks/multi_turn/convert_sharegpt_to_openai.py
Normal file
@@ -0,0 +1,354 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Download the dataset from:
https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json

Convert to the OpenAI API format:
export INPUT_FILE=sharegpt_20230401_clean_lang_split.json
python convert_sharegpt_to_openai.py $INPUT_FILE sharegpt_conv_128.json --max-items=128
"""
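
# Record shapes handled by this script (a sketch inferred from the code below;
# the values are hypothetical, the ID follows the "hiWPlMD_<part>" pattern):
#   ShareGPT input item:  {"id": "hiWPlMD_0",
#                          "conversations": [{"from": "human", "value": "Hello"},
#                                            {"from": "gpt", "value": "Hi!"}]}
#   OpenAI output item:   {"id": "hiWPlMD",
#                          "messages": [{"role": "user", "content": "Hello"},
#                                       {"role": "assistant", "content": "Hi!"}]}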

import argparse
import json
import random
from statistics import mean
from typing import Any

import pandas as pd  # type: ignore
import tqdm  # type: ignore
from transformers import AutoTokenizer  # type: ignore


def has_non_english_chars(text: str) -> bool:
    return not text.isascii()


def content_is_valid(
    content: str, min_content_len: int | None, max_content_len: int | None
) -> bool:
    if min_content_len and len(content) < min_content_len:
        return False

    if max_content_len and len(content) > max_content_len:
        return False

    # Valid content must be English-only (ASCII) text
    return not has_non_english_chars(content)
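
# For example, under this "valid = ASCII-only" check:
#   content_is_valid("Hello there", 5, 100) -> True
#   content_is_valid("こんにちは", None, None) -> False (non-ASCII)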


def print_stats(
    conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
) -> None:
    # Collect statistics
    stats = []

    print("\nCollecting statistics...")
    for item in tqdm.tqdm(conversations):
        # item has "id" and "messages"
        messages = item["messages"]

        user_turns = 0
        assistant_turns = 0
        user_words = 0
        assistant_words = 0
        conv_chars = 0

        user_tokens: list[int] = []
        assistant_tokens: list[int] = []

        for m in messages:
            content = m["content"]
            conv_chars += len(content)
            content_num_words = content.count(" ") + 1

            num_tokens = 0
            if tokenizer:
                num_tokens = len(tokenizer(m["content"]).input_ids)

            if m["role"] == "user":
                user_turns += 1
                user_words += content_num_words
                if tokenizer:
                    user_tokens.append(num_tokens)

            elif m["role"] == "assistant":
                assistant_turns += 1
                assistant_words += content_num_words
                if tokenizer:
                    assistant_tokens.append(num_tokens)

        # assert user_turns == assistant_turns, \
        #     f"Invalid conversation ID {item['id']}"

        conv_words = user_words + assistant_words
        item_stats = {
            "user_turns": user_turns,
            "assistant_turns": assistant_turns,
            "user_words": user_words,
            "assistant_words": assistant_words,
            "conv_turns": len(messages),
            "conv_words": conv_words,
            "conv_characters": conv_chars,
        }

        if len(user_tokens) > 0:
            item_stats["user_tokens"] = int(mean(user_tokens))

        if len(assistant_tokens) > 0:
            item_stats["assistant_tokens"] = int(mean(assistant_tokens))

        stats.append(item_stats)

    print("\nStatistics:")
    percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999]
    df = pd.DataFrame(stats)
    print(df.describe(percentiles=percentiles).transpose())
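
# For reference, df.describe(percentiles=...) reports count/mean/std/min/max
# plus the requested percentiles for every numeric column collected above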


def convert_sharegpt_to_openai(
    seed: int,
    input_file: str,
    output_file: str,
    max_items: int | None,
    min_content_len: int | None = None,
    max_content_len: int | None = None,
    min_turns: int | None = None,
    max_turns: int | None = None,
    model: str | None = None,
) -> None:
    if min_turns and max_turns:
        assert min_turns <= max_turns

    if min_content_len and max_content_len:
        # Verify that min is not larger than max if both were given
        assert min_content_len <= max_content_len

    print(
        f"Input parameters:\n{seed=}, {max_items=}, {min_content_len=},"
        f" {max_content_len=}, {min_turns=}, {max_turns=}\n"
    )

    random.seed(seed)

    tokenizer = None
    if model is not None:
        print(f"Loading tokenizer from: {model}")
        tokenizer = AutoTokenizer.from_pretrained(model)

    # Read the ShareGPT JSON file
    print(f"Reading file: {input_file}")
    with open(input_file, encoding="utf-8") as f:
        # Should be a list of dicts
        # Each dict should have "id" (string) and "conversations" (list of dicts)
        sharegpt_data = json.load(f)

    assert isinstance(sharegpt_data, list), "Input file should contain a list of dicts"

    print(f"Total items in input file: {len(sharegpt_data):,}")

    print(f"Shuffling dataset with seed {seed}")
    random.shuffle(sharegpt_data)

    # Map each conversation ID to all of its message parts
    conversation_parts: dict[str, list[Any]] = {}

    for item in tqdm.tqdm(sharegpt_data):
        assert "id" in item, "Missing key 'id'"
        assert "conversations" in item, "Missing key 'conversations'"

        # Conversation ID (e.g. "hiWPlMD") and part/session (0, 1, 2, etc.)
        conv_id, _ = item["id"].split("_")
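        # The split assumes every ID looks like "<conv_id>_<part>" with exactly
        # one underscore (e.g. "hiWPlMD_0"); any other shape raises ValueError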
        new_turns = item["conversations"]

        if conv_id not in conversation_parts:
            # Start a new conversation
            conversation_parts[conv_id] = []
        elif len(conversation_parts[conv_id]) > 0 and len(new_turns) > 0:
            # Drop a duplicated speaker at the seam between two parts
            prev_turns = conversation_parts[conv_id][-1]
            if prev_turns[-1]["from"] == new_turns[0]["from"]:
                new_turns = new_turns[1:]

        if len(new_turns) > 0:
            # We assume that parts are in order in the ShareGPT dataset
            conversation_parts[conv_id].append(new_turns)

    dataset: list[dict[str, Any]] = []
    for conv_id, conv_parts in conversation_parts.items():
        new_item = {"id": conv_id}

        conversations: list[dict[str, str]] = []

        # Merge all parts
        for conv_part in conv_parts:
            conversations.extend(conv_part)

        if len(conversations) > 0:
            new_item["conversations"] = conversations
            dataset.append(new_item)

    print(f"Total unique conversations (IDs) in input file: {len(dataset):,}")

    # Final output data
    final_openai_dataset: list[dict] = []

    # Filter conversations from the ShareGPT dataset and convert to OpenAI format
    for item in tqdm.tqdm(dataset):
        messages: list[dict] = []

        assert "id" in item, "Missing key 'id'"
        assert "conversations" in item, "Missing key 'conversations'"

        conv_id = item["id"]
        conversations = item["conversations"]

        if min_turns is not None and len(conversations) < min_turns:
            # Skip short conversations
            continue

        # Convert each message in the conversation, up to max_turns if specified
        for i, turn in enumerate(conversations):
            assert "from" in turn and "value" in turn, (
                f"Invalid conversation ID {conv_id} - missing 'from' or 'value'"
            )

            role = None
            turn_from = turn["from"]

            if turn_from in {"human", "user"}:
                role = "user"
            elif turn_from in {"gpt", "bing", "chatgpt", "bard"}:
                role = "assistant"
            elif turn_from == "system":
                role = "system"

            assert role is not None, (
                f"Invalid conversation ID {conv_id} - 'from'='{turn_from}' is invalid"
            )

            if i == 0 and role != "user":
                # If the first message is from the assistant (gpt), skip it.
                # This happens when the conversation is a follow-up
                # to a previous conversation (from the same user).
                continue

            if max_turns is not None and i >= max_turns:
                break

            # Convert the message to OpenAI format (with "role" and "content")
            content = turn["value"]
            messages.append({"role": role, "content": content})
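            # e.g. {"from": "human", "value": "Hello"} becomes
            # {"role": "user", "content": "Hello"}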

        # Add the converted conversation to the OpenAI format output
        if len(messages) > 0:
            valid_messages = True

            # First turn should always be from the user
            user_turn = True

            for m in messages:
                # Make sure that turns alternate between user and assistant
                if (user_turn and m["role"] != "user") or (
                    not user_turn and m["role"] != "assistant"
                ):
                    valid_messages = False
                    break

                user_turn = not user_turn

                content = m["content"]
                valid_messages = content_is_valid(
                    content, min_content_len, max_content_len
                )
                if not valid_messages:
                    break

            if valid_messages:
                final_openai_dataset.append({"id": conv_id, "messages": messages})

    assert len(final_openai_dataset) > 0, "Final number of conversations is zero"

    print_stats(final_openai_dataset)

    print_stats_again = False
    if max_items is not None and len(final_openai_dataset) > max_items:
        print(f"\n\nSampling {max_items} items from the dataset...")
        print_stats_again = True
        final_openai_dataset = random.sample(final_openai_dataset, max_items)

    if print_stats_again:
        # Print stats after the dataset changed
        print_stats(final_openai_dataset, tokenizer)

    # Write the converted data to a new JSON file
    final_size = len(final_openai_dataset)
    print(f"\nTotal conversations converted (after filtering): {final_size:,}")
    print(f"\nWriting file: {output_file}")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(final_openai_dataset, f, ensure_ascii=False, indent=2)


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Convert ShareGPT dataset to OpenAI API format"
    )
    parser.add_argument("input_file", help="Path to the input ShareGPT JSON file")
    parser.add_argument(
        "output_file", help="Path to the output OpenAI format JSON file"
    )
    parser.add_argument(
        "--seed", type=int, default=0, help="Seed for random number generators"
    )
    parser.add_argument(
        "--max-items",
        type=int,
        default=None,
        help="Maximum number of items in the output file",
    )
    parser.add_argument(
        "--min-turns",
        type=int,
        default=None,
        help="Minimum number of turns per conversation",
    )
    parser.add_argument(
        "--max-turns",
        type=int,
        default=None,
        help="Maximum number of turns per conversation",
    )
    parser.add_argument(
        "--min-content-len",
        type=int,
        default=None,
        help="Min number of characters in the messages' content",
    )
    parser.add_argument(
        "--max-content-len",
        type=int,
        default=None,
        help="Max number of characters in the messages' content",
    )
    parser.add_argument(
        "--model",
        type=str,
        default=None,
        help="LLM model, only the tokenizer will be used",
    )

    args = parser.parse_args()

    convert_sharegpt_to_openai(
        args.seed,
        args.input_file,
        args.output_file,
        args.max_items,
        args.min_content_len,
        args.max_content_len,
        args.min_turns,
        args.max_turns,
        args.model,
    )


if __name__ == "__main__":
    main()
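
# Example invocation (hypothetical file names and model; --model is optional
# and only adds token-count statistics via the model's HF tokenizer):
#   python convert_sharegpt_to_openai.py sharegpt_20230401_clean_lang_split.json \
#       sharegpt_conv_128.json --max-items=128 --min-turns=4 \
#       --model=meta-llama/Llama-3.1-8B-Instruct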