Fix random dataset (#671)

2024-07-20 01:57:43 -07:00
parent 35759efa91
commit 9592a1f3bd
1 changed files with 93 additions and 35 deletions
--- a/python/sglang/bench_serving.py
+++ b/python/sglang/bench_serving.py
@@ -192,6 +192,36 @@ class BenchmarkMetrics:
    p99_itl_ms: float
 default_sharegpt_path = "ShareGPT_V3_unfiltered_cleaned_split.json"
 def download_sharegpt_dataset(path):
    url = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
    print(f"Downloading dataset from {url}")
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        total_size = int(response.headers.get("content-length", 0))
        block_size = 8192
        with open(path, "wb") as f, tqdm(
            desc="Downloading",
            total=total_size,
            unit="iB",
            unit_scale=True,
            unit_divisor=1024,
        ) as progress_bar:
            for data in response.iter_content(block_size):
                size = f.write(data)
                progress_bar.update(size)
        print(f"Dataset downloaded and saved to {path}")
    except requests.RequestException as e:
        raise Exception(f"Failed to download dataset: {e}")
 def sample_sharegpt_requests(
    dataset_path: str,
    num_requests: int,
@@ -201,36 +231,13 @@ def sample_sharegpt_requests(
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")
-    default_dataset_path = "ShareGPT_V3_unfiltered_cleaned_split.json"
+    # Download sharegpt if necessary
-    url = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
+    if not os.path.isfile(dataset_path) and not os.path.isfile(default_sharegpt_path):
-
+        download_sharegpt_dataset(default_sharegpt_path)
-    if not os.path.isfile(dataset_path) and not os.path.isfile(default_dataset_path):
+        dataset_path = default_sharegpt_path
        print(f"Downloading dataset from {url}")
        try:
            response = requests.get(url, stream=True)
            response.raise_for_status()
            total_size = int(response.headers.get("content-length", 0))
            block_size = 8192
            with open(default_dataset_path, "wb") as f, tqdm(
                desc="Downloading",
                total=total_size,
                unit="iB",
                unit_scale=True,
                unit_divisor=1024,
            ) as progress_bar:
                for data in response.iter_content(block_size):
                    size = f.write(data)
                    progress_bar.update(size)
            print(f"Dataset downloaded and saved to {default_dataset_path}")
            dataset_path = default_dataset_path
        except requests.RequestException as e:
            raise Exception(f"Failed to download dataset: {e}")
    else:
        dataset_path = (
-            dataset_path if os.path.isfile(dataset_path) else default_dataset_path
+            dataset_path if os.path.isfile(dataset_path) else default_sharegpt_path
        )
    # Load the dataset.
@@ -279,6 +286,7 @@ def sample_random_requests(
    num_prompts: int,
    range_ratio: float,
    tokenizer: PreTrainedTokenizerBase,
    dataset_path: str,
 ) -> List[Tuple[str, int, int]]:
    input_lens = np.random.randint(
@@ -291,13 +299,62 @@ def sample_random_requests(
        output_len + 1,
        size=num_prompts,
    )
-    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
+
-    input_requests = []
+    if True:
-    for i in range(num_prompts):
+        # Sample token ids from ShareGPT and repeat/truncate them to satisfy the input_lens
-        prompt = tokenizer.decode(
+
-            [(offsets[i] + i + j) % tokenizer.vocab_size for j in range(input_lens[i])]
+        # Download sharegpt if necessary
-        )
+        if not os.path.isfile(dataset_path) and not os.path.isfile(
-        input_requests.append((prompt, int(input_lens[i]), int(output_lens[i])))
+            default_sharegpt_path
        ):
            download_sharegpt_dataset(default_sharegpt_path)
            dataset_path = default_sharegpt_path
        else:
            dataset_path = (
                dataset_path if os.path.isfile(dataset_path) else default_sharegpt_path
            )
        # Load the dataset.
        with open(dataset_path) as f:
            dataset = json.load(f)
        # Filter out the conversations with less than 2 turns.
        dataset = [data for data in dataset if len(data["conversations"]) >= 2]
        # Only keep the first two turns of each conversation.
        dataset = [
            (data["conversations"][0]["value"], data["conversations"][1]["value"])
            for data in dataset
        ]
        # Shuffle the dataset.
        random.shuffle(dataset)
        # Filter out sequences that are too long or too short
        input_requests: List[Tuple[str, int, int]] = []
        for i in range(num_prompts):
            # Tokenize the prompts and completions.
            prompt = dataset[i][0]
            prompt_token_ids = tokenizer(prompt).input_ids
            prompt_len = len(prompt_token_ids)
            if prompt_len <= input_lens[i]:
                input_ids = prompt_token_ids[: input_lens[i]]
            else:
                ratio = (input_lens[i] + prompt_len - 1) // prompt_len
                input_ids = (prompt_token_ids * ratio)[: input_lens[i]]
            prompt = tokenizer.decode(input_ids)
            input_requests.append((prompt, int(input_lens[i]), int(output_lens[i])))
    else:
        # Sample token ids from random integers. This can cause some NaN issues.
        offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
        input_requests = []
        for i in range(num_prompts):
            prompt = tokenizer.decode(
                [
                    (offsets[i] + i + j) % tokenizer.vocab_size
                    for j in range(input_lens[i])
                ]
            )
            input_requests.append((prompt, int(input_lens[i]), int(output_lens[i])))
    print(f"#Input tokens: {np.sum(input_lens)}")
    print(f"#Output tokens: {np.sum(output_lens)}")
@@ -575,6 +632,7 @@ def fire(args: argparse.Namespace):
            num_prompts=args.num_prompts,
            range_ratio=args.random_range_ratio,
            tokenizer=tokenizer,
            dataset_path=args.dataset_path,
        )
    else:
        raise ValueError(f"Unknown dataset: {args.dataset_name}")