docs: update comment (#721)
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
# create ~/llama-3.1-405b-fp8-dummy and create config.json and tokenizer:
|
||||
# config.json from https://gist.github.com/zhyncs/748597c44d47b45fa15866a4ae2c2b29?permalink_comment_id=5128893
|
||||
# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json?download=true
|
||||
# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json?download=true
|
||||
# config.json from ./config.md
|
||||
# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json
|
||||
# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json
|
||||
|
||||
# Launch sglang
|
||||
# python -m sglang.launch_server --model ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --tp 8 --quant fp8 --disable-radix --mem-frac 0.88
|
||||
@@ -19,10 +19,3 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
|
||||
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > sglang/log33
|
||||
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > sglang/log34
|
||||
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > sglang/log35
|
||||
# python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1000 --request-rate 32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > sglang/log36
|
||||
# python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 1000 --request-rate 1 > sglang/log41
|
||||
# python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 1000 --request-rate 2 > sglang/log42
|
||||
# python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 1000 --request-rate 4 > sglang/log43
|
||||
# python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 1000 --request-rate 8 > sglang/log44
|
||||
# python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 1000 --request-rate 16 > sglang/log45
|
||||
# python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 1000 --request-rate 32 > sglang/log46
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# Launch trtllm
|
||||
# https://gist.github.com/zhyncs/748597c44d47b45fa15866a4ae2c2b29?permalink_comment_id=5129302
|
||||
# https://github.com/sgl-project/tensorrt-demo
|
||||
|
||||
# offline
|
||||
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 2500 --random-input 1024 --random-output 1024 --random-range-ratio 0.5 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log11
|
||||
@@ -14,10 +14,3 @@ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random
|
||||
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log33
|
||||
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log34
|
||||
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log35
|
||||
# python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 1000 --request-rate 32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log36
|
||||
# python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 1000 --request-rate 1 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log41
|
||||
# python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 1000 --request-rate 2 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log42
|
||||
# python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 1000 --request-rate 4 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log43
|
||||
# python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 1000 --request-rate 8 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log44
|
||||
# python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 1000 --request-rate 16 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log45
|
||||
# python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 1000 --request-rate 32 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log46
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
# create ~/llama-3.1-405b-fp8-dummy and create config.json and tokenizer:
|
||||
# config.json from https://gist.github.com/zhyncs/748597c44d47b45fa15866a4ae2c2b29?permalink_comment_id=5128893
|
||||
# (remove the new llama3 rope_scaling entry to run with vLLM 0.5.2)
|
||||
# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json?download=true
|
||||
# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json?download=true
|
||||
# config.json from ./config.md
|
||||
# remove the new llama3 rope_scaling entry to run with vLLM 0.5.2
|
||||
# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json
|
||||
# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json
|
||||
|
||||
# Launch vllm
|
||||
# python3 -m vllm.entrypoints.openai.api_server --model ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --disable-log-requests --tensor-parallel-size 8 --max-model-len 10000
|
||||
@@ -20,10 +20,3 @@ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name rando
|
||||
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > vllm/log33
|
||||
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > vllm/log34
|
||||
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > vllm/log35
|
||||
# python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1000 --request-rate 32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > vllm/log36
|
||||
# python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 1000 --request-rate 1 > vllm/log41
|
||||
# python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 1000 --request-rate 2 > vllm/log42
|
||||
# python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 1000 --request-rate 4 > vllm/log43
|
||||
# python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 1000 --request-rate 8 > vllm/log44
|
||||
# python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 1000 --request-rate 16 > vllm/log45
|
||||
# python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 1000 --request-rate 32 > vllm/log46
|
||||
|
||||
100
benchmark/blog_v0_2/config.md
Normal file
100
benchmark/blog_v0_2/config.md
Normal file
@@ -0,0 +1,100 @@
|
||||
### used for TensorRT LLM
|
||||
|
||||
```
|
||||
{
|
||||
"architecture": "LlamaForCausalLM",
|
||||
"dtype": "float16",
|
||||
"logits_dtype": "float32",
|
||||
"vocab_size": 128256,
|
||||
"max_position_embeddings": 8192,
|
||||
"hidden_size": 16384,
|
||||
"num_hidden_layers": 126,
|
||||
"num_attention_heads": 128,
|
||||
"num_key_value_heads": 16,
|
||||
"head_size": 128,
|
||||
"qk_layernorm": false,
|
||||
"hidden_act": "silu",
|
||||
"intermediate_size": 53248,
|
||||
"norm_epsilon": 1e-05,
|
||||
"position_embedding_type": "rope_gpt_neox",
|
||||
"use_parallel_embedding": false,
|
||||
"embedding_sharding_dim": 0,
|
||||
"share_embedding_table": false,
|
||||
"mapping": {
|
||||
"world_size": 8,
|
||||
"tp_size": 8,
|
||||
"pp_size": 1,
|
||||
"gpus_per_node": 8
|
||||
},
|
||||
"quantization": {
|
||||
"quant_algo": "FP8",
|
||||
"kv_cache_quant_algo": null,
|
||||
"group_size": 128,
|
||||
"smoothquant_val": null,
|
||||
"has_zero_point": false,
|
||||
"pre_quant_scale": false,
|
||||
"exclude_modules": [
|
||||
"lm_head"
|
||||
]
|
||||
},
|
||||
"kv_dtype": "float16",
|
||||
"rotary_scaling": null,
|
||||
"residual_mlp": false,
|
||||
"moe_normalization_mode": null,
|
||||
"rotary_base": 500000.0,
|
||||
"moe_num_experts": 0,
|
||||
"moe_top_k": 0,
|
||||
"moe_tp_mode": 2,
|
||||
"attn_bias": false,
|
||||
"disable_weight_only_quant_plugin": false,
|
||||
"mlp_bias": false
|
||||
}
|
||||
```
|
||||
|
||||
### used for vLLM and SGLang
|
||||
|
||||
```
|
||||
{
|
||||
"_name_or_path": "dummy_fp8",
|
||||
"architectures": [
|
||||
"LlamaForCausalLM"
|
||||
],
|
||||
"attention_bias": false,
|
||||
"attention_dropout": 0.0,
|
||||
"bos_token_id": 128000,
|
||||
"eos_token_id": 128009,
|
||||
"hidden_act": "silu",
|
||||
"hidden_size": 16384,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 53248,
|
||||
"mlp_bias": false,
|
||||
"model_type": "llama",
|
||||
"num_attention_heads": 128,
|
||||
"num_hidden_layers": 126,
|
||||
"num_key_value_heads": 8,
|
||||
"pretraining_tp": 1,
|
||||
"quantization_config": {
|
||||
"activation_scheme": "static",
|
||||
"ignored_layers": [
|
||||
"lm_head"
|
||||
],
|
||||
"quant_method": "fp8"
|
||||
},
|
||||
"rope_scaling": {
|
||||
"factor": 8.0,
|
||||
"low_freq_factor": 1.0,
|
||||
"high_freq_factor": 4.0,
|
||||
"original_max_position_embeddings": 8192,
|
||||
"rope_type": "llama3"
|
||||
},
|
||||
"max_position_embeddings": 131072,
|
||||
"rms_norm_eps": 1e-05,
|
||||
"rope_theta": 500000.0,
|
||||
"tie_word_embeddings": false,
|
||||
"torch_dtype": "bfloat16",
|
||||
"transformers_version": "4.41.1",
|
||||
"use_cache": true,
|
||||
"vocab_size": 128256
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user