diff --git a/benchmark/blog_v0_2/405b_sglang.sh b/benchmark/blog_v0_2/405b_sglang.sh index 63f906212..eff520c55 100644 --- a/benchmark/blog_v0_2/405b_sglang.sh +++ b/benchmark/blog_v0_2/405b_sglang.sh @@ -1,7 +1,7 @@ # create ~/llama-3.1-405b-fp8-dummy and create config.json and tokenizer: -# config.json from https://gist.github.com/zhyncs/748597c44d47b45fa15866a4ae2c2b29?permalink_comment_id=5128893 -# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json?download=true -# wget wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json?download=true +# config.json from ./config.md +# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json +# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json # Launch sglang # python -m sglang.launch_server --model ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --tp 8 --quant fp8 --disable-radix --mem-frac 0.88 @@ -19,10 +19,3 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > sglang/log33 python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > sglang/log34 python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > sglang/log35 -# python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1000 --request-rate 32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > sglang/log36 -# python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 1000 --request-rate 1 > sglang/log41 -# python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 1000 --request-rate 2 > sglang/log42 -# python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 1000 --request-rate 4 > sglang/log43 -# python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 1000 --request-rate 8 > sglang/log44 -# python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 1000 --request-rate 16 > sglang/log45 -# python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 1000 --request-rate 32 > sglang/log46 diff --git a/benchmark/blog_v0_2/405b_trt.sh b/benchmark/blog_v0_2/405b_trt.sh index f8252a800..972865c85 100644 --- a/benchmark/blog_v0_2/405b_trt.sh +++ b/benchmark/blog_v0_2/405b_trt.sh @@ -1,5 +1,5 @@ # Launch trtllm -# https://gist.github.com/zhyncs/748597c44d47b45fa15866a4ae2c2b29?permalink_comment_id=5129302 +# https://github.com/sgl-project/tensorrt-demo # offline python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 2500 --random-input 1024 --random-output 1024 --random-range-ratio 0.5 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log11 @@ -14,10 +14,3 @@ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log33 python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log34 python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log35 -# python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 1000 --request-rate 32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log36 -# python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 1000 --request-rate 1 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log41 -# python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 1000 --request-rate 2 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log42 -# python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 1000 --request-rate 4 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log43 -# python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 1000 --request-rate 8 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log44 -# python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 1000 --request-rate 16 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log45 -# python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 1000 --request-rate 32 --model meta-llama/Meta-Llama-3-8B-Instruct > trtllm/log46 diff --git a/benchmark/blog_v0_2/405b_vllm.sh b/benchmark/blog_v0_2/405b_vllm.sh index b212bba7a..190fb5fde 100644 --- a/benchmark/blog_v0_2/405b_vllm.sh +++ b/benchmark/blog_v0_2/405b_vllm.sh @@ -1,8 +1,8 @@ # create ~/llama-3.1-405b-fp8-dummy and create config.json and tokenizer: -# config.json from https://gist.github.com/zhyncs/748597c44d47b45fa15866a4ae2c2b29?permalink_comment_id=5128893 -# (remove the new llama3 rope_scaling entry to run with vLLM 0.5.2) -# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json?download=true -# wget wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json?download=true +# config.json from ./config.md +# remove the new llama3 rope_scaling entry to run with vLLM 0.5.2 +# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json +# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json # Launch vllm # python3 -m vllm.entrypoints.openai.api_server --model ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --disable-log-requests --tensor-parallel-size 8 --max-model-len 10000 @@ -20,10 +20,3 @@ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name rando python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > vllm/log33 python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > vllm/log34 python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > vllm/log35 -# python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1000 --request-rate 32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 > vllm/log36 -# python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 1000 --request-rate 1 > vllm/log41 -# python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 1000 --request-rate 2 > vllm/log42 -# python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 1000 --request-rate 4 > vllm/log43 -# python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 1000 --request-rate 8 > vllm/log44 -# python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 1000 --request-rate 16 > vllm/log45 -# python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 1000 --request-rate 32 > vllm/log46 diff --git a/benchmark/blog_v0_2/config.md b/benchmark/blog_v0_2/config.md new file mode 100644 index 000000000..3faf6009b --- /dev/null +++ b/benchmark/blog_v0_2/config.md @@ -0,0 +1,100 @@ +### used for TensorRT LLM + +``` +{ + "architecture": "LlamaForCausalLM", + "dtype": "float16", + "logits_dtype": "float32", + "vocab_size": 128256, + "max_position_embeddings": 8192, + "hidden_size": 16384, + "num_hidden_layers": 126, + "num_attention_heads": 128, + "num_key_value_heads": 16, + "head_size": 128, + "qk_layernorm": false, + "hidden_act": "silu", + "intermediate_size": 53248, + "norm_epsilon": 1e-05, + "position_embedding_type": "rope_gpt_neox", + "use_parallel_embedding": false, + "embedding_sharding_dim": 0, + "share_embedding_table": false, + "mapping": { + "world_size": 8, + "tp_size": 8, + "pp_size": 1, + "gpus_per_node": 8 + }, + "quantization": { + "quant_algo": "FP8", + "kv_cache_quant_algo": null, + "group_size": 128, + "smoothquant_val": null, + "has_zero_point": false, + "pre_quant_scale": false, + "exclude_modules": [ + "lm_head" + ] + }, + "kv_dtype": "float16", + "rotary_scaling": null, + "residual_mlp": false, + "moe_normalization_mode": null, + "rotary_base": 500000.0, + "moe_num_experts": 0, + "moe_top_k": 0, + "moe_tp_mode": 2, + "attn_bias": false, + "disable_weight_only_quant_plugin": false, + "mlp_bias": false +} +``` + +### used for vLLM and SGLang + +``` +{ + "_name_or_path": "dummy_fp8", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128009, + "hidden_act": "silu", + "hidden_size": 16384, + "initializer_range": 0.02, + "intermediate_size": 53248, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 128, + "num_hidden_layers": 126, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "quantization_config": { + "activation_scheme": "static", + "ignored_layers": [ + "lm_head" + ], + "quant_method": "fp8" + }, + "rope_scaling": { + "factor": 8.0, + "low_freq_factor": 1.0, + "high_freq_factor": 4.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "max_position_embeddings": 131072, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.41.1", + "use_cache": true, + "vocab_size": 128256 +} +```