Update benchmark scripts (#8)
This commit is contained in:
@@ -23,7 +23,7 @@ python3 bench_dspy_intro.py --backend sglang
|
|||||||
```
|
```
|
||||||
docker run --name tgi --rm -ti --gpus all --network host \
|
docker run --name tgi --rm -ti --gpus all --network host \
|
||||||
-v /home/ubuntu/model_weights/Llama-2-7b-chat-hf:/Llama-2-7b-chat-hf \
|
-v /home/ubuntu/model_weights/Llama-2-7b-chat-hf:/Llama-2-7b-chat-hf \
|
||||||
ghcr.io/huggingface/text-generation-inference:1.1.0 \
|
ghcr.io/huggingface/text-generation-inference:1.3.0 \
|
||||||
--model-id /Llama-2-7b-chat-hf --num-shard 1 --trust-remote-code \
|
--model-id /Llama-2-7b-chat-hf --num-shard 1 --trust-remote-code \
|
||||||
--max-input-length 2048 --max-total-tokens 4096 \
|
--max-input-length 2048 --max-total-tokens 4096 \
|
||||||
--port 24000
|
--port 24000
|
||||||
|
|||||||
@@ -57,6 +57,8 @@ def main(args):
|
|||||||
out = model + context + select(choices, name="answer")
|
out = model + context + select(choices, name="answer")
|
||||||
return choices.index(out["answer"])
|
return choices.index(out["answer"])
|
||||||
|
|
||||||
|
call_select("Hello,", ["world", "earth"])
|
||||||
|
|
||||||
elif args.backend == "lmql":
|
elif args.backend == "lmql":
|
||||||
import lmql
|
import lmql
|
||||||
model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
|
model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
|
||||||
@@ -135,6 +137,6 @@ if __name__ == "__main__":
|
|||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument("--num-shot", type=int, default=20)
|
parser.add_argument("--num-shot", type=int, default=20)
|
||||||
parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
|
parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
|
||||||
parser.add_argument("--num-questions", type=int, default=100)
|
parser.add_argument("--num-questions", type=int, default=200)
|
||||||
args = add_common_other_args_and_parse(parser)
|
args = add_common_other_args_and_parse(parser)
|
||||||
main(args)
|
main(args)
|
||||||
|
|||||||
@@ -91,6 +91,6 @@ if __name__ == "__main__":
|
|||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument("--num-shot", type=int, default=20)
|
parser.add_argument("--num-shot", type=int, default=20)
|
||||||
parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
|
parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
|
||||||
parser.add_argument("--num-questions", type=int, default=100)
|
parser.add_argument("--num-questions", type=int, default=200)
|
||||||
args = add_common_sglang_args_and_parse(parser)
|
args = add_common_sglang_args_and_parse(parser)
|
||||||
main(args)
|
main(args)
|
||||||
|
|||||||
@@ -17,14 +17,13 @@ outlines 0.0.22
|
|||||||
|
|
||||||
### Benchmark sglang
|
### Benchmark sglang
|
||||||
|
|
||||||
Run llama-7b
|
Run Llama-7B
|
||||||
|
|
||||||
```
|
```
|
||||||
python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
|
python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
|
||||||
```
|
```
|
||||||
|
|
||||||
Run mixtral-8x7b
|
Run Mixtral-8x7B
|
||||||
(When there is a CUDA out-of-memory error, try to reduce the `--mem-fraction-static`)
|
|
||||||
|
|
||||||
```
|
```
|
||||||
python3 -m sglang.launch_server --model-path mistralai/Mixtral-8x7B-Instruct-v0.1 --port 30000 --tp-size 8
|
python3 -m sglang.launch_server --model-path mistralai/Mixtral-8x7B-Instruct-v0.1 --port 30000 --tp-size 8
|
||||||
@@ -39,7 +38,7 @@ python3 bench_sglang.py --num-questions 10
|
|||||||
|
|
||||||
### Benchmark vllm
|
### Benchmark vllm
|
||||||
|
|
||||||
Run llama-7b
|
Run Llama-7B
|
||||||
|
|
||||||
```
|
```
|
||||||
python3 -m outlines.serve.serve --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
|
python3 -m outlines.serve.serve --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
|
||||||
@@ -54,8 +53,8 @@ python3 bench_other.py --backend vllm --num-questions 10
|
|||||||
|
|
||||||
### Benchmark guidance
|
### Benchmark guidance
|
||||||
|
|
||||||
Run llama-7b and benchmark
|
Run Llama-7B and benchmark
|
||||||
|
|
||||||
```
|
```
|
||||||
python3 bench_other.py --backend guidance --num-questions 10 --parallel 1
|
python3 bench_other.py --backend guidance --num-questions 10 --parallel 1
|
||||||
```
|
```
|
||||||
@@ -105,7 +105,7 @@ def main(args):
|
|||||||
|
|
||||||
with open(args.result_file, "a") as fout:
|
with open(args.result_file, "a") as fout:
|
||||||
value = {
|
value = {
|
||||||
"task": "json_regex_decode",
|
"task": "json_decode_regex",
|
||||||
"backend": args.backend,
|
"backend": args.backend,
|
||||||
"num_gpus": 1,
|
"num_gpus": 1,
|
||||||
"latency": round(latency, 3),
|
"latency": round(latency, 3),
|
||||||
@@ -64,8 +64,6 @@ def main(args):
|
|||||||
# Run requests
|
# Run requests
|
||||||
tic = time.time()
|
tic = time.time()
|
||||||
states = json_decode.run_batch(arguments, temperature=0, num_threads=args.parallel)
|
states = json_decode.run_batch(arguments, temperature=0, num_threads=args.parallel)
|
||||||
for state in states:
|
|
||||||
state.sync()
|
|
||||||
latency = time.time() - tic
|
latency = time.time() - tic
|
||||||
|
|
||||||
# Compute accuracy
|
# Compute accuracy
|
||||||
@@ -80,7 +78,7 @@ def main(args):
|
|||||||
|
|
||||||
with open(args.result_file, "a") as fout:
|
with open(args.result_file, "a") as fout:
|
||||||
value = {
|
value = {
|
||||||
"task": "json_regex_decode",
|
"task": "json_decode_regex",
|
||||||
"backend": args.backend,
|
"backend": args.backend,
|
||||||
"num_gpus": 1,
|
"num_gpus": 1,
|
||||||
"latency": round(latency, 3),
|
"latency": round(latency, 3),
|
||||||
@@ -3,19 +3,6 @@
|
|||||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
```
|
```
|
||||||
|
|
||||||
### Performance
|
|
||||||
|
|
||||||
- Model: Llama-2-7b-chat-hf
|
|
||||||
- `--num-prompts 2000 --request-rate 200`
|
|
||||||
- On 4 A10 (24G) GPUs
|
|
||||||
|
|
||||||
| Backend | Throughput | Latency |
|
|
||||||
| ----------- | --------------- | -------- |
|
|
||||||
| srt | 5.82 requests/s | 343.54 s |
|
|
||||||
| vllm==0.2.6 | 3.93 requests/s | 509.08 s |
|
|
||||||
| vllm==0.2.7 | 5.02 requests/s | 398.25 s |
|
|
||||||
|
|
||||||
|
|
||||||
### SGLang
|
### SGLang
|
||||||
```
|
```
|
||||||
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
|
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
|
||||||
@@ -28,7 +15,7 @@ python3 bench_throughput.py --backend srt --tokenizer meta-llama/Llama-2-7b-chat
|
|||||||
|
|
||||||
### vLLM
|
### vLLM
|
||||||
```
|
```
|
||||||
python3 -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --swap-space 16
|
python3 -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --swap-space 16 --port 21000
|
||||||
```
|
```
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|||||||
25
benchmark/llm_judge/articles.jsonl
Normal file
25
benchmark/llm_judge/articles.jsonl
Normal file
File diff suppressed because one or more lines are too long
@@ -95,6 +95,9 @@ def evaluate(args, subject, dev_df, test_df):
|
|||||||
max_tokens=max_tokens, temperature=0)
|
max_tokens=max_tokens, temperature=0)
|
||||||
return out["answer"]
|
return out["answer"]
|
||||||
|
|
||||||
|
# warmup
|
||||||
|
call_generate("Hello,", temperature=1.0, max_tokens=8)
|
||||||
|
|
||||||
elif args.backend == "lmql":
|
elif args.backend == "lmql":
|
||||||
import lmql
|
import lmql
|
||||||
model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
|
model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
|
||||||
|
|||||||
@@ -1,12 +1,12 @@
|
|||||||
### Benchmark sglang
|
### Benchmark sglang
|
||||||
|
|
||||||
Run llama-7b
|
Run Llama-7B
|
||||||
|
|
||||||
```
|
```
|
||||||
python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
|
python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
|
||||||
```
|
```
|
||||||
|
|
||||||
Run mixtral-8x7b
|
Run Mixtral-8x7B
|
||||||
(When there is a CUDA out-of-memory error, try to reduce the `--mem-fraction-static`)
|
(When there is a CUDA out-of-memory error, try to reduce the `--mem-fraction-static`)
|
||||||
|
|
||||||
```
|
```
|
||||||
@@ -27,13 +27,13 @@ python3 bench_sglang.py --tokenizer meta-llama/Llama-2-7b-chat-hf --long
|
|||||||
|
|
||||||
### Benchmark vLLM
|
### Benchmark vLLM
|
||||||
|
|
||||||
Run llama-7b
|
Run Llama-7B
|
||||||
|
|
||||||
```
|
```
|
||||||
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
|
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
|
||||||
```
|
```
|
||||||
|
|
||||||
Run mixtral-8x7b
|
Run Mixtral-8x7B
|
||||||
|
|
||||||
```
|
```
|
||||||
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model mistralai/Mixtral-8x7B-Instruct-v0.1 --disable-log-requests --port 21000 --tensor-parallel-size 8
|
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model mistralai/Mixtral-8x7B-Instruct-v0.1 --disable-log-requests --port 21000 --tensor-parallel-size 8
|
||||||
@@ -53,14 +53,14 @@ python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend vllm
|
|||||||
|
|
||||||
### Benchmark guidance
|
### Benchmark guidance
|
||||||
|
|
||||||
Benchmark llama-7b(short output)
|
Benchmark Llama-7B (short output)
|
||||||
|
|
||||||
```
|
```
|
||||||
python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend guidance --parallel 1
|
python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend guidance --parallel 1
|
||||||
```
|
```
|
||||||
|
|
||||||
Benchmark llama-7b(long output)
|
Benchmark Llama-7B (long output)
|
||||||
|
|
||||||
```
|
```
|
||||||
python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend guidance --parallel 1 --long
|
python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend guidance --parallel 1 --long
|
||||||
```
|
```
|
||||||
@@ -99,7 +99,7 @@ def main(args):
|
|||||||
|
|
||||||
with open(args.result_file, "a") as fout:
|
with open(args.result_file, "a") as fout:
|
||||||
value = {
|
value = {
|
||||||
"task": "multi_turns",
|
"task": "multi_turn_chat",
|
||||||
"backend": args.backend,
|
"backend": args.backend,
|
||||||
"num_gpus": 1,
|
"num_gpus": 1,
|
||||||
"latency": round(latency, 3),
|
"latency": round(latency, 3),
|
||||||
@@ -21,8 +21,6 @@ def multi_turns(s, qas):
|
|||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
print(args)
|
|
||||||
|
|
||||||
tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
|
tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
|
||||||
|
|
||||||
multi_qas = gen_arguments(args, tokenizer)
|
multi_qas = gen_arguments(args, tokenizer)
|
||||||
@@ -33,8 +31,6 @@ def main(args):
|
|||||||
states = multi_turns.run_batch(
|
states = multi_turns.run_batch(
|
||||||
multi_qas, temperature=0, backend=backend, num_threads=args.parallel
|
multi_qas, temperature=0, backend=backend, num_threads=args.parallel
|
||||||
)
|
)
|
||||||
for state in states:
|
|
||||||
state.sync()
|
|
||||||
latency = time.time() - tic
|
latency = time.time() - tic
|
||||||
|
|
||||||
print(f"Latency: {latency:.3f}")
|
print(f"Latency: {latency:.3f}")
|
||||||
@@ -43,7 +39,7 @@ def main(args):
|
|||||||
|
|
||||||
with open(args.result_file, "a") as fout:
|
with open(args.result_file, "a") as fout:
|
||||||
value = {
|
value = {
|
||||||
"task": "multi_turns",
|
"task": "multi_turn_chat",
|
||||||
"backend": args.backend,
|
"backend": args.backend,
|
||||||
"num_gpus": 1,
|
"num_gpus": 1,
|
||||||
"latency": round(latency, 3),
|
"latency": round(latency, 3),
|
||||||
@@ -74,4 +70,6 @@ if __name__ == "__main__":
|
|||||||
args.min_len_a = 256
|
args.min_len_a = 256
|
||||||
args.max_len_a = 512
|
args.max_len_a = 512
|
||||||
args.num_qa = 20
|
args.num_qa = 20
|
||||||
|
|
||||||
|
print(args)
|
||||||
main(args)
|
main(args)
|
||||||
@@ -1,5 +1,7 @@
|
|||||||
## Run benchmark
|
## Run benchmark
|
||||||
|
|
||||||
|
NOTE: This is an implementation for replaying a given trace for throughput/latency benchmark purposes. It is not an actual ReAct agent implementation.
|
||||||
|
|
||||||
### Benchmark sglang
|
### Benchmark sglang
|
||||||
```
|
```
|
||||||
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
|
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
|
||||||
|
|||||||
@@ -124,6 +124,9 @@ def main(args):
|
|||||||
))
|
))
|
||||||
return out["result"]
|
return out["result"]
|
||||||
|
|
||||||
|
# warmup
|
||||||
|
call_generate("Hello,", 1.0, 8, ".")
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Invalid backend: {args.backend}")
|
raise ValueError(f"Invalid backend: {args.backend}")
|
||||||
|
|
||||||
|
|||||||
@@ -82,9 +82,10 @@ Action 3: Finish[yes]
|
|||||||
""" + question)
|
""" + question)
|
||||||
for i in range(1, len(triplets) + 2):
|
for i in range(1, len(triplets) + 2):
|
||||||
s += "Thought " + str(i) + ":"
|
s += "Thought " + str(i) + ":"
|
||||||
|
# NOTE: This is an implementation for replaying a given trace for benchmark purposes. It is not an actual ReAct agent implementation.
|
||||||
ss = s.fork(1)
|
ss = s.fork(1)
|
||||||
ss[0] += sgl.gen(name="thought_action", max_tokens=200, stop="Observation")
|
ss[0] += sgl.gen(name="thought_action", max_tokens=200, stop="Observation")
|
||||||
# ss.join()
|
ss.join()
|
||||||
# to verify the correctness of output, this should be collected
|
# to verify the correctness of output, this should be collected
|
||||||
# print(ss[0]["thought_action"])
|
# print(ss[0]["thought_action"])
|
||||||
if i > len(triplets):
|
if i > len(triplets):
|
||||||
|
|||||||
100
benchmark/react/hotpotqa_100.jsonl
Normal file
100
benchmark/react/hotpotqa_100.jsonl
Normal file
File diff suppressed because one or more lines are too long
@@ -5,13 +5,15 @@ wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_sch
|
|||||||
|
|
||||||
## Run benchmark
|
## Run benchmark
|
||||||
|
|
||||||
|
NOTE: This is an implementation for throughput/latency benchmark purposes. The prompts are not tuned to achieve good accuracy on the GSM-8K tasks.
|
||||||
|
|
||||||
### Benchmark sglang
|
### Benchmark sglang
|
||||||
```
|
```
|
||||||
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
|
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
|
||||||
```
|
```
|
||||||
|
|
||||||
```
|
```
|
||||||
python3 bench_sglang.py --num-questions 32 --parallel 8
|
python3 bench_sglang.py --num-questions 32
|
||||||
python3 bench_sglang.py --num-questions 16 --parallel 1
|
python3 bench_sglang.py --num-questions 16 --parallel 1
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -141,6 +141,9 @@ def main(args):
|
|||||||
rets.append(out["answer"])
|
rets.append(out["answer"])
|
||||||
return rets
|
return rets
|
||||||
|
|
||||||
|
# warmup
|
||||||
|
call_generate("Hello,", 1.0, 8, ".", 1)
|
||||||
|
|
||||||
# Run requests
|
# Run requests
|
||||||
states = [None] * len(questions)
|
states = [None] * len(questions)
|
||||||
def get_one_answer(i):
|
def get_one_answer(i):
|
||||||
|
|||||||
@@ -1,20 +1,23 @@
|
|||||||
## Flashinfer Mode
|
## Flashinfer Mode
|
||||||
|
|
||||||
[`flashinfer`](https://github.com/flashinfer-ai/flashinfer) is a kernel library for LLM serving; we use it here to support our attention computation.
|
[flashinfer](https://github.com/flashinfer-ai/flashinfer) is a kernel library for LLM serving.
|
||||||
|
It can be used in the SGLang runtime to accelerate attention computation.
|
||||||
|
|
||||||
### Install flashinfer
|
### Install flashinfer
|
||||||
|
|
||||||
|
Note: The compilation can take a very long time.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git submodule update --init --recursive
|
git submodule update --init --recursive
|
||||||
pip install 3rdparty/flashinfer/python
|
pip install 3rdparty/flashinfer/python
|
||||||
```
|
```
|
||||||
|
|
||||||
### Run Sever With Flashinfer Mode
|
### Run a Server With Flashinfer Mode
|
||||||
|
|
||||||
Add through `--model_mode` argument from the command line.
|
Add the `--model-mode flashinfer` argument to enable flashinfer when launching a server.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --model-mode flashinfer
|
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --model-mode flashinfer
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -351,7 +351,7 @@ class MixtralForCausalLM(nn.Module):
|
|||||||
|
|
||||||
params_dict = dict(self.named_parameters())
|
params_dict = dict(self.named_parameters())
|
||||||
for name, loaded_weight in hf_model_weights_iterator(
|
for name, loaded_weight in hf_model_weights_iterator(
|
||||||
model_name_or_path, cache_dir, load_format, revision, fall_back_to_pt=False
|
model_name_or_path, cache_dir, load_format, revision
|
||||||
):
|
):
|
||||||
if "rotary_emb.inv_freq" in name:
|
if "rotary_emb.inv_freq" in name:
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -93,7 +93,8 @@ class ServerArgs:
|
|||||||
type=str,
|
type=str,
|
||||||
default=[],
|
default=[],
|
||||||
nargs="+",
|
nargs="+",
|
||||||
help="Model mode: [flashinfer, no-cache, aggressive-new-fill]",
|
choices=["flashinfer", "no-cache"],
|
||||||
|
help="Model mode: [flashinfer, no-cache]",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--schedule-heuristic",
|
"--schedule-heuristic",
|
||||||
|
|||||||
@@ -99,7 +99,7 @@ def call_select_vllm(context, choices, url):
|
|||||||
}
|
}
|
||||||
res = requests.post(url, json=data)
|
res = requests.post(url, json=data)
|
||||||
assert res.status_code == 200
|
assert res.status_code == 200
|
||||||
scores.append(res.json()["prompt_score"])
|
scores.append(res.json().get("prompt_score", 0))
|
||||||
return np.argmax(scores)
|
return np.argmax(scores)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
@@ -112,7 +112,7 @@ def call_select_vllm(context, choices, url):
|
|||||||
|
|
||||||
|
|
||||||
def add_common_other_args_and_parse(parser):
|
def add_common_other_args_and_parse(parser):
|
||||||
parser.add_argument("--parallel", type=int, default=96)
|
parser.add_argument("--parallel", type=int, default=64)
|
||||||
parser.add_argument("--host", type=str, default="http://127.0.0.1")
|
parser.add_argument("--host", type=str, default="http://127.0.0.1")
|
||||||
parser.add_argument("--port", type=int, default=None)
|
parser.add_argument("--port", type=int, default=None)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
|
|||||||
6
scripts/launch_tgi.sh
Normal file
6
scripts/launch_tgi.sh
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
docker run --name tgi --rm -ti --gpus all --network host \
|
||||||
|
-v /home/ubuntu/model_weights/Llama-2-7b-chat-hf:/Llama-2-7b-chat-hf \
|
||||||
|
ghcr.io/huggingface/text-generation-inference:1.3.0 \
|
||||||
|
--model-id /Llama-2-7b-chat-hf --num-shard 1 --trust-remote-code \
|
||||||
|
--max-input-length 2048 --max-total-tokens 4096 \
|
||||||
|
--port 24000
|
||||||
Reference in New Issue
Block a user