[Feature] Initial support for multi-LoRA serving (#1307)

This commit is contained in:
Ying Sheng
2024-09-12 16:46:14 -07:00
committed by GitHub
parent c33d82a211
commit 712216928f
21 changed files with 1435 additions and 22 deletions

View File

@@ -0,0 +1,52 @@
"""
used for debug using tensor comparison
dump {name: tensor} into "log_hf.jsonl" and "log_srt.jsonl"
use the same name for two tensors that supposed to be close
recommend name like: "layer 2 after mlp"
"""
import json
import sys
import torch
if len(sys.argv) > 1:
assert sys.argv[1] == "base"
hf_log = "base_log_hf.jsonl"
srt_log = "base_log_srt.jsonl"
else:
hf_log = "log_hf.jsonl"
srt_log = "log_srt.jsonl"
def load_data(filepath):
tensors = {}
with open(filepath, "r") as f:
lines = f.readlines()
for line in lines:
data = json.loads(line)
for k, v in data.items():
tensors[k] = torch.tensor(v)
return tensors
hf_tensors = load_data(hf_log)
srt_tensors = load_data(srt_log)
def get_diff(t1, t2):
t1 = t1.reshape(t2.shape)
max_diff = torch.max(abs(t1.reshape(t2.shape) - t2))
l2_dis = torch.dist(t1, t2, p=2)
return l2_dis, max_diff
for k, _ in srt_tensors.items():
l2_dis, max_diff = get_diff(hf_tensors[k], srt_tensors[k])
print(f"{k} {l2_dis=} {max_diff=}")
if k == "layer 1 attn":
print(hf_tensors[k])
print(srt_tensors[k])
if k == "layer 0 prefill k":
print(srt_tensors[k].shape)
print(hf_tensors[k].shape)