Add hf3fs support for hicache storage (based on #7704) (#7280)

Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
This commit is contained in:
pansicheng
2025-07-31 08:42:41 +08:00
committed by GitHub
parent a79a5d7012
commit 299803343d
12 changed files with 1110 additions and 23 deletions

49
benchmark/hf3fs/bench.sh Normal file
View File

@@ -0,0 +1,49 @@
# Command snippets for benchmarking the hf3fs hicache storage backend.
# Each '####' rule separates an independent scenario; run sections by hand,
# not as one sequential script (the final section kills the servers the
# earlier sections started).

# Storage-layer micro-benchmark: the env var points HiCacheHF3FS at its
# JSON config, then bench_storage.py is run directly.
SGLANG_HICACHE_HF3FS_CONFIG_PATH=/sgl-workspace/sglang/benchmark/hf3fs/hf3fs.json \
python3 benchmark/hf3fs/bench_storage.py
####################################################################################################
# Scenario 1: single-node Qwen3-32B server with hierarchical cache backed by hf3fs.
rm -rf nohup.out && \
nohup python3 -m sglang.launch_server \
--model-path /code/models/Qwen3-32B/ \
--host 0.0.0.0 --port 33301 \
--page-size 64 \
--enable-hierarchical-cache \
--hicache-ratio 2 --hicache-size 0 \
--hicache-write-policy write_through \
--hicache-storage-backend hf3fs &
# Drive the server above with the multi-turn hicache client benchmark (512 clients).
rm -rf bench_multiturn.out && \
nohup python3 benchmark/hicache/bench_multiturn.py \
--model-path /code/models/Qwen3-32B \
--dataset-path /code/models/ShareGPT_V3_unfiltered_cleaned_split.json \
--port 33301 \
--request-length 2048 --num-clients 512 --num-rounds 3 --max-parallel 8 \
> bench_multiturn.out &
####################################################################################################
# Scenario 2: two-node DeepSeek-R1 (tp 16); only node-rank 0 is shown here —
# the other node presumably runs the same command with --node-rank 1.
rm -rf nohup.out && \
nohup python3 -m sglang.launch_server \
--model-path /code/models/DeepSeek-R1/ \
--tp 16 --nnodes 2 --node-rank 0 \
--dist-init-addr 10.74.249.153:5000 \
--host 0.0.0.0 --port 33301 \
--page-size 64 \
--enable-hierarchical-cache \
--hicache-ratio 2 --hicache-size 60 \
--hicache-write-policy write_through \
--hicache-storage-backend hf3fs &
# Heavier client load for the multi-node setup (1024 clients).
rm -rf bench_multiturn.out && \
nohup python3 benchmark/hicache/bench_multiturn.py \
--model-path /code/models/Qwen3-32B \
--dataset-path /code/models/ShareGPT_V3_unfiltered_cleaned_split.json \
--port 33301 \
--request-length 2048 --num-clients 1024 --num-rounds 3 --max-parallel 8 \
> bench_multiturn.out &
####################################################################################################
# Cleanup: force-kill any leftover server and client processes.
ps aux | grep "sglang.launch_server" | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep "bench_multiturn.py" | grep -v grep | awk '{print $2}' | xargs kill -9

View File

@@ -0,0 +1,162 @@
import concurrent.futures
import logging
import random
import time
from typing import List
import torch
from tqdm import tqdm
from sglang.srt.mem_cache.storage.hf3fs.client_hf3fs import Hf3fsClient
def print_stats(x: List[float]) -> None:
    """Print summary statistics (mean/min/p25/p50/p75/max) of *x*.

    Values are formatted with two decimal places. An empty input is
    reported explicitly instead of raising ZeroDivisionError/IndexError,
    which the previous version did.
    """
    if not x:
        print("no samples")
        return
    x = sorted(x)
    lenx = len(x)
    # After sorting, min/max are the first and last elements; percentiles
    # use simple truncating-index selection.
    print(
        f"mean = {sum(x)/lenx:.2f}, "
        f"min = {x[0]:.2f}, "
        f"p25 = {x[int(lenx*0.25)]:.2f}, "
        f"p50 = {x[int(lenx*0.5)]:.2f}, "
        f"p75 = {x[int(lenx*0.75)]:.2f}, "
        f"max = {x[-1]:.2f}"
    )
def test():
    """Round-trip check for Hf3fsClient: batch_write random pages to random
    page-aligned offsets, batch_read them back, and verify both the reported
    byte counts and the tensor contents."""
    # /path/to/hf3fs
    file_path = "/data/bench.bin"
    file_size = 1 << 40
    bytes_per_page = 16 << 20
    entries = 32
    file_ops = Hf3fsClient(file_path, file_size, bytes_per_page, entries)
    print("test batch_read / batch_write")
    num_pages = 128
    dtype = torch.bfloat16
    # Elements per page so that numel * itemsize == bytes_per_page.
    numel = bytes_per_page // dtype.itemsize
    # Pick num_pages distinct page slots at random, then convert slot
    # indices to byte offsets.
    offsets = list(range(file_size // bytes_per_page))
    random.shuffle(offsets)
    offsets = offsets[:num_pages]
    offsets = [i * bytes_per_page for i in offsets]
    tensor_writes = [
        torch.randn(numel, dtype=dtype)
        for _ in tqdm(range(num_pages), desc="prepare tensor")
    ]
    # Issue writes in chunks of at most file_ops.entries pages per call.
    for i in tqdm(range(0, num_pages, file_ops.entries), desc="batch_write"):
        results = file_ops.batch_write(
            offsets[i : i + file_ops.entries], tensor_writes[i : i + file_ops.entries]
        )
        # Each per-page result must equal the full page size in bytes.
        assert all([result == numel * dtype.itemsize for result in results])
    tensor_reads = [
        torch.empty(numel, dtype=dtype)
        for _ in tqdm(range(num_pages), desc="prepare tensor")
    ]
    for i in tqdm(range(0, num_pages, file_ops.entries), desc="batch_read"):
        results = file_ops.batch_read(
            offsets[i : i + file_ops.entries], tensor_reads[i : i + file_ops.entries]
        )
        assert all([result == numel * dtype.itemsize for result in results])
    # Contents read back must match what was written, page by page.
    assert all([torch.allclose(r, w) for r, w in zip(tensor_reads, tensor_writes)])
    file_ops.close()
    print("test done")
def bench():
    """Measure Hf3fsClient batch_write / batch_read bandwidth (GB/s) by
    fanning out one client per worker thread and timing each concurrent
    batch; per-iteration bandwidths are summarized via print_stats."""
    file_path = "/data/bench.bin"
    file_size = 1 << 40
    bytes_per_page = 16 << 20
    entries = 8
    numjobs = 16
    dtype = torch.bfloat16
    numel = bytes_per_page // dtype.itemsize
    # One client instance per worker thread.
    file_ops = [
        Hf3fsClient(file_path, file_size, bytes_per_page, entries)
        for _ in range(numjobs)
    ]
    num_page = entries
    offsets = list(range(file_size // bytes_per_page))
    # NOTE(review): list repetition shares ONE tensor object across all
    # pages, so every page carries identical data — acceptable for a pure
    # bandwidth measurement, but not a data-integrity check.
    tensors_write = [torch.randn(numel, dtype=dtype)] * num_page
    tensors_read = [torch.empty(numel, dtype=dtype)] * num_page
    random.shuffle(offsets)
    warmup = 50
    iteration = 100
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=numjobs)
    w_bw = []
    # Bytes moved per timed iteration, expressed in GiB.
    w_size = num_page * numjobs * bytes_per_page / (1 << 30)
    for i in tqdm(range(warmup + iteration), desc="Benchmarking write (GB/s)"):
        # Job j gets its own disjoint num_page-slice of the shuffled offsets,
        # converted from page indices to byte offsets.
        _offsets = [
            [
                offset * bytes_per_page
                for offset in offsets[
                    (i * numjobs + j) * num_page : (i * numjobs + j + 1) * num_page
                ]
            ]
            for j in range(numjobs)
        ]
        # Time only the concurrent submit + wait, not offset preparation.
        tik = time.perf_counter()
        futures = [
            executor.submit(file_ops[j].batch_write, offset, tensors_write)
            for j, offset in enumerate(_offsets)
        ]
        results = [future.result() for future in futures]
        tok = time.perf_counter()
        if i < warmup:
            continue
        w_bw.append(w_size / (tok - tik))
        # Flatten per-job result lists and check each page wrote a full page.
        results = [
            _result == bytes_per_page for result in results for _result in result
        ]
        assert all(results)
    print_stats(w_bw)
    r_bw = []
    r_size = w_size
    # Read phase re-walks the same offset ranges the write phase populated.
    for i in tqdm(range(warmup + iteration), desc="Benchmarking read (GB/s)"):
        _offsets = [
            [
                offset * bytes_per_page
                for offset in offsets[
                    (i * numjobs + j) * num_page : (i * numjobs + j + 1) * num_page
                ]
            ]
            for j in range(numjobs)
        ]
        tik = time.perf_counter()
        futures = [
            executor.submit(file_ops[j].batch_read, offset, tensors_read)
            for j, offset in enumerate(_offsets)
        ]
        results = [future.result() for future in futures]
        tok = time.perf_counter()
        if i < warmup:
            continue
        r_bw.append(r_size / (tok - tik))
        results = [
            _result == bytes_per_page for result in results for _result in result
        ]
        assert all(results)
    print_stats(r_bw)
    executor.shutdown(wait=True)
    for _file_ops in file_ops:
        _file_ops.close()
    print("bench done")
def main() -> None:
    """Configure logging, then run the correctness test followed by the benchmark."""
    logging.basicConfig(level=logging.INFO)
    for step in (test, bench):
        step()


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,241 @@
import json
import logging
import os
import random
import time
from typing import List
import torch
from tqdm import tqdm
from sglang.srt.mem_cache.storage.hf3fs.storage_hf3fs import HiCacheHF3FS
def print_stats(x: List[float]) -> None:
    """Print summary statistics (mean/min/p25/p50/p75/max) of *x*.

    Values are formatted with two decimal places. An empty input is
    reported explicitly instead of raising ZeroDivisionError/IndexError,
    which the previous version did.
    """
    if not x:
        print("no samples")
        return
    x = sorted(x)
    lenx = len(x)
    # After sorting, min/max are the first and last elements; percentiles
    # use simple truncating-index selection.
    print(
        f"mean = {sum(x)/lenx:.2f}, "
        f"min = {x[0]:.2f}, "
        f"p25 = {x[int(lenx*0.25)]:.2f}, "
        f"p50 = {x[int(lenx*0.5)]:.2f}, "
        f"p75 = {x[int(lenx*0.75)]:.2f}, "
        f"max = {x[-1]:.2f}"
    )
def test():
    """Functional test of HiCacheHF3FS: exercises set/get/exists/delete/clear
    and batch_set/batch_get, including eviction of the oldest keys once the
    backing file is full."""
    # Qwen3-32B
    layer_num = 64
    head_num, head_dim = 8, 128
    kv_lora_rank, qk_rope_head_dim = 0, 0
    store_dtype = torch.bfloat16
    tokens_per_page = 64
    file_path_prefix = "/data/test"
    # 128 MiB file with 16 MiB pages -> capacity of 8 pages.
    file_size = 128 << 20
    numjobs = 16
    bytes_per_page = 16 << 20
    entries = 2
    dtype = store_dtype
    # The backend reads its JSON config from the path named by this env var;
    # write the config there before constructing the cache.
    config_path = os.getenv(HiCacheHF3FS.default_env_var)
    assert config_path
    try:
        with open(config_path, "w") as f:
            json.dump(
                {
                    "file_path_prefix": file_path_prefix,
                    "file_size": file_size,
                    "numjobs": numjobs,
                    "entries": entries,
                },
                f,
            )
    except Exception as e:
        raise RuntimeError(f"Failed to dump config to {config_path}: {str(e)}")
    rank = 0
    hicache_hf3fs = HiCacheHF3FS.from_env_config(rank, bytes_per_page, dtype)
    # One page holds K+V for all layers of a full token page.
    numel = 2 * tokens_per_page * layer_num * head_num * head_dim
    assert numel * dtype.itemsize == bytes_per_page
    # Insert more pages than fit so the earliest keys must be evicted.
    num_pages = 10
    tensors = {}
    for i in range(num_pages):
        k = f"key_{i}"
        v = torch.randn((numel,)).to(dtype=dtype)
        ok = hicache_hf3fs.set(k, v)
        assert ok, f"Failed to insert {k}"
        tensors[k] = v
    # The two oldest keys were evicted to make room for the newest ones.
    assert hicache_hf3fs.get("key_0") is None
    assert hicache_hf3fs.get("key_1") is None
    # The last num_pages (capacity) keys must still be resident and intact.
    start = num_pages - hicache_hf3fs.num_pages
    for i in range(start, start + hicache_hf3fs.num_pages):
        k = f"key_{i}"
        assert hicache_hf3fs.exists(k)
        out = hicache_hf3fs.get(k)
        assert out is not None
        v = tensors[k]
        assert torch.allclose(v, out, atol=1e-3), f"Tensor mismatch for {k}"
    assert not hicache_hf3fs.exists("not_exists")
    # delete frees a page, so the next set succeeds without further eviction.
    hicache_hf3fs.delete("key_9")
    v2 = torch.randn((numel,)).to(dtype=dtype)
    assert hicache_hf3fs.set("key_new", v2)
    assert torch.allclose(hicache_hf3fs.get("key_new"), v2, atol=1e-3)
    # clear() must return every page to the free list.
    hicache_hf3fs.clear()
    assert len(hicache_hf3fs.free_pages) == hicache_hf3fs.num_pages
    # batch
    num_pages = 10
    tensors = {}
    keys = []
    values = []
    for i in range(num_pages):
        k = f"key_{i}"
        keys.append(k)
        v = torch.randn((numel,)).to(dtype=dtype)
        values.append(v)
    # Batch-inserting more pages than capacity reports failure, and the
    # overflowing tail keys are not stored.
    ok = hicache_hf3fs.batch_set(keys, values)
    assert not ok
    assert hicache_hf3fs.get("key_8") is None
    assert hicache_hf3fs.get("key_9") is None
    # The first capacity-many keys must read back correctly.
    results = hicache_hf3fs.batch_get(keys[: hicache_hf3fs.num_pages])
    for result, key, value in zip(
        results, keys[: hicache_hf3fs.num_pages], values[: hicache_hf3fs.num_pages]
    ):
        assert torch.allclose(value, result, atol=1e-3), f"Tensor mismatch for {key}"
    hicache_hf3fs.close()
    os.remove(hicache_hf3fs.file_path)
    print("All test cases passed.")
def bench():
    """Measure HiCacheHF3FS batch_set / batch_get bandwidth (GB/s) and
    summarize per-iteration results via print_stats."""
    # Qwen3-32B
    layer_num = 64
    head_num, head_dim = 8, 128
    kv_lora_rank, qk_rope_head_dim = 0, 0
    store_dtype = torch.bfloat16
    tokens_per_page = 64
    file_path = "/data/test.bin"
    file_size = 1 << 40
    numjobs = 16
    bytes_per_page = 16 << 20
    entries = 8
    dtype = store_dtype
    hicache_hf3fs = HiCacheHF3FS(
        file_path=file_path,
        file_size=file_size,
        numjobs=numjobs,
        bytes_per_page=bytes_per_page,
        entries=entries,
        dtype=dtype,
    )
    # One page holds K+V for all layers of a full token page.
    numel = 2 * tokens_per_page * layer_num * head_num * head_dim
    assert numel * dtype.itemsize == bytes_per_page
    num_page = 128
    values = [torch.randn((numel,)).to(dtype=dtype) for _ in tqdm(range(num_page))]
    warmup = 50
    iteration = 100
    w_bw = []
    # Bytes moved per timed batch, expressed in GiB.
    w_size = num_page * bytes_per_page / (1 << 30)
    for i in tqdm(range(warmup + iteration), desc="Benchmarking write (GB/s)"):
        # Fresh key range per iteration so every set stores a new page.
        keys = [f"{j}" for j in range(i * num_page, (i + 1) * num_page)]
        # Time only the batch_set call itself.
        tik = time.perf_counter()
        ok = hicache_hf3fs.batch_set(keys, values)
        tok = time.perf_counter()
        if i < warmup:
            continue
        w_bw.append(w_size / (tok - tik))
        assert ok
    print_stats(w_bw)
    r_bw = []
    r_size = num_page * bytes_per_page / (1 << 30)
    for i in tqdm(range(warmup + iteration), desc="Benchmarking read (GB/s)"):
        # Sample keys currently resident in the cache so every get can hit.
        keys = random.sample(list(hicache_hf3fs.key_to_index.keys()), num_page)
        tik = time.perf_counter()
        results = hicache_hf3fs.batch_get(keys)
        tok = time.perf_counter()
        if i < warmup:
            continue
        r_bw.append(r_size / (tok - tik))
        assert all([r is not None for r in results])
    print_stats(r_bw)
    hicache_hf3fs.close()
def allclose():
    """Data-integrity check: batch_set pages, batch_get random resident keys,
    and verify each read tensor equals the value originally written for it.

    Keys are decimal strings counted across iterations, and iteration i
    writes values[j - i*num_page] under key str(j); since i*num_page is a
    multiple of num_page, the value for key j is values[int(j) % num_page].
    """
    # Qwen3-32B
    layer_num = 64
    head_num, head_dim = 8, 128
    kv_lora_rank, qk_rope_head_dim = 0, 0
    store_dtype = torch.bfloat16
    tokens_per_page = 64
    file_path = "/data/test.bin"
    file_size = 1 << 40
    numjobs = 16
    bytes_per_page = 16 << 20
    entries = 8
    dtype = store_dtype
    hicache_hf3fs = HiCacheHF3FS(
        file_path=file_path,
        file_size=file_size,
        numjobs=numjobs,
        bytes_per_page=bytes_per_page,
        entries=entries,
        dtype=dtype,
    )
    numel = 2 * tokens_per_page * layer_num * head_num * head_dim
    assert numel * dtype.itemsize == bytes_per_page
    num_page = 128
    values = [torch.randn((numel,)).to(dtype=dtype) for _ in tqdm(range(num_page))]
    iteration = 100
    for i in tqdm(range(iteration), desc="Benchmarking write (GB/s)"):
        keys = [f"{j}" for j in range(i * num_page, (i + 1) * num_page)]
        ok = hicache_hf3fs.batch_set(keys, values)
        assert ok
    # Collect all (key, tensor) pairs read, then validate them after the loop.
    read_keys, read_results = [], []
    for i in tqdm(range(iteration), desc="Benchmarking read (GB/s)"):
        keys = random.sample(list(hicache_hf3fs.key_to_index.keys()), num_page)
        results = hicache_hf3fs.batch_get(keys)
        read_keys.extend(keys)
        read_results.extend(results)
        assert all([r is not None for r in results])
    for key, result in tqdm(zip(read_keys, read_results)):
        # See docstring: key j maps to values[int(j) % num_page].
        assert torch.allclose(values[int(key) % num_page], result, atol=1e-3)
    hicache_hf3fs.close()
def main() -> None:
    """Configure logging, then run correctness checks, the throughput
    benchmark, and the read-back validation in order."""
    logging.basicConfig(level=logging.INFO)
    for stage in (test, bench, allclose):
        stage()


if __name__ == "__main__":
    main()