"""Fail a build when any wheel under a directory exceeds the size limit.

Usage: python check-wheel-size.py <directory>
"""
import os
import sys
import zipfile

# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250))


def print_top_10_largest_files(zip_file):
    """Print the top 10 largest files in the given zip file.

    Sizes are the uncompressed member sizes recorded in the archive,
    printed largest first.
    """
    with zipfile.ZipFile(zip_file, 'r') as z:
        # infolist() already carries every member's size, so no per-name
        # getinfo() lookup is needed.
        members = sorted(z.infolist(),
                         key=lambda info: info.file_size,
                         reverse=True)
    for info in members[:10]:
        print(f"{info.filename}: {info.file_size / (1024 * 1024):.2f} "
              "MBs uncompressed.")


def check_wheel_size(directory, max_size_mb=None):
    """Check the size of .whl files in the given directory.

    Args:
        directory: Root directory; it is walked recursively for files whose
            name ends with ``.whl``.
        max_size_mb: Size limit in MB. ``None`` (the default) means
            ``VLLM_MAX_SIZE_MB``.

    Returns:
        1 if at least one wheel is larger than the limit, otherwise 0
        (including when no wheel is found at all).
    """
    if max_size_mb is None:
        max_size_mb = VLLM_MAX_SIZE_MB

    exit_code = 0
    for root, _, files in os.walk(directory):
        for file_name in files:
            if not file_name.endswith(".whl"):
                continue
            wheel_path = os.path.join(root, file_name)
            wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
            if wheel_size_mb > max_size_mb:
                print(f"Not allowed: Wheel {wheel_path} is larger "
                      f"({wheel_size_mb:.2f} MB) than the limit "
                      f"({max_size_mb} MB).")
                print_top_10_largest_files(wheel_path)
                # Keep scanning so that every oversized wheel is reported
                # in a single run instead of only the first one found.
                exit_code = 1
            else:
                print(f"Wheel {wheel_path} is within the allowed size "
                      f"({wheel_size_mb:.2f} MB).")
    return exit_code


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python check-wheel-size.py <directory>",
              file=sys.stderr)
        sys.exit(1)

    directory = sys.argv[1]
    sys.exit(check_wheel_size(directory))
b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml @@ -0,0 +1,12 @@ +# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2 +model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.671 + - name: "exact_match,flexible-extract" + value: 0.664 +limit: 1000 +num_fewshot: 5 +trust_remote_code: True \ No newline at end of file diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml new file mode 100644 index 0000000..4397eff --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 +model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.905 + - name: "exact_match,flexible-extract" + value: 0.905 +limit: 1000 +num_fewshot: 5 diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml new file mode 100644 index 0000000..fa6ea23 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 +model_name: "meta-llama/Meta-Llama-3-70B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.892 + - name: "exact_match,flexible-extract" + value: 0.892 +limit: 250 +num_fewshot: 5 diff --git 
a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml new file mode 100644 index 0000000..c513159 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.752 + - name: "exact_match,flexible-extract" + value: 0.754 +limit: 1000 +num_fewshot: 5 diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml new file mode 100644 index 0000000..5e57fcb --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.753 + - name: "exact_match,flexible-extract" + value: 0.753 +limit: 1000 +num_fewshot: 5 diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml new file mode 100644 index 0000000..374171f --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml 
@@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.755 + - name: "exact_match,flexible-extract" + value: 0.755 +limit: 1000 +num_fewshot: 5 diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml new file mode 100644 index 0000000..dc36b70 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1 +model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.753 + - name: "exact_match,flexible-extract" + value: 0.753 +limit: 1000 +num_fewshot: 5 diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml new file mode 100644 index 0000000..0ecfc01 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.764 + - name: "exact_match,flexible-extract" + value: 0.764 +limit: 250 +num_fewshot: 5 diff --git 
a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml new file mode 100644 index 0000000..bc29002 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.728 + - name: "exact_match,flexible-extract" + value: 0.728 +limit: 250 +num_fewshot: 5 diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml new file mode 100644 index 0000000..3964f3b --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.758 + - name: "exact_match,flexible-extract" + value: 0.759 +limit: 1000 +num_fewshot: 5 diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml new file mode 100644 index 0000000..fb4b491 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml @@ -0,0 +1,11 @@ +# bash 
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1 +model_name: "meta-llama/Meta-Llama-3-8B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.756 + - name: "exact_match,flexible-extract" + value: 0.752 +limit: 250 +num_fewshot: 5 diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml new file mode 100644 index 0000000..0424586 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1 +model_name: "HandH1998/QQQ-Llama-3-8b-g128" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.419 + - name: "exact_match,flexible-extract" + value: 0.416 +limit: 1000 +num_fewshot: 5 diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml new file mode 100644 index 0000000..78347f6 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 +model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.356 + - name: "exact_match,flexible-extract" + value: 0.358 +limit: 1000 +num_fewshot: 5 diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml new file mode 100644 index 0000000..3ea0b7b --- 
/dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1 +model_name: "mgoin/Minitron-4B-Base-FP8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.233 + - name: "exact_match,flexible-extract" + value: 0.236 +limit: 1000 +num_fewshot: 5 diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml new file mode 100644 index 0000000..75a24e4 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml @@ -0,0 +1,11 @@ +# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8 +model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.86 + - name: "exact_match,flexible-extract" + value: 0.86 +limit: 250 +num_fewshot: 5 diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml new file mode 100644 index 0000000..436ec21 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml @@ -0,0 +1,11 @@ +# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4 +model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.624 + - name: "exact_match,flexible-extract" + value: 0.624 +limit: 250 +num_fewshot: 5 diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml 
b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml new file mode 100644 index 0000000..dec9164 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4 +model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.616 + - name: "exact_match,flexible-extract" + value: 0.632 +limit: 250 +num_fewshot: 5 diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml new file mode 100644 index 0000000..42936fb --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1 +model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.578 + - name: "exact_match,flexible-extract" + value: 0.585 +limit: 1000 +num_fewshot: 5 diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml new file mode 100644 index 0000000..43ff2bc --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 +model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.593 + - name: 
"exact_match,flexible-extract" + value: 0.588 +limit: 1000 +num_fewshot: 5 diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml new file mode 100644 index 0000000..259799b --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1 +model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.595 + - name: "exact_match,flexible-extract" + value: 0.582 +limit: 1000 +num_fewshot: 5 diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml new file mode 100644 index 0000000..45d5efc --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml @@ -0,0 +1,11 @@ +# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4 +model_name: "Qwen/Qwen2-57B-A14B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.792 + - name: "exact_match,flexible-extract" + value: 0.824 +limit: 250 +num_fewshot: 5 diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/models-large.txt b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/models-large.txt new file mode 100644 index 0000000..37eeac8 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/models-large.txt @@ -0,0 +1,5 @@ +Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml +Meta-Llama-3-70B-Instruct.yaml +Mixtral-8x7B-Instruct-v0.1.yaml +Qwen2-57B-A14-Instruct.yaml +DeepSeek-V2-Lite-Chat.yaml diff --git 
a/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/models-small.txt b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/models-small.txt new file mode 100644 index 0000000..6057229 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/configs/models-small.txt @@ -0,0 +1,10 @@ +Meta-Llama-3-8B-Instruct.yaml +Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml +Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml +Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml +Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml +Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml +Minitron-4B-Base-FP8.yaml +Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml +Qwen2-1.5B-Instruct-FP8W8.yaml +Meta-Llama-3-8B-QQQ.yaml diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/vllm-v0.6.2/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh new file mode 100644 index 0000000..a67fc89 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on GSM for transformers. +# +# Make sure you have lm-eval-harness installed: +# pip install lm-eval==0.4.4 + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using huggingface transformers." + echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -b - batch size to run the evaluation at" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo +} + +while getopts "m:b:l:f:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + \? 
) + usage + exit 1 + ;; + esac +done + +lm_eval --model hf \ + --model_args "pretrained=$MODEL,parallelize=True" \ + --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ + --batch_size "$BATCH_SIZE" diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/vllm-v0.6.2/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh new file mode 100644 index 0000000..65be3c5 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on GSM for vllm. +# We use this for fp8, which HF does not support. +# +# Make sure you have lm-eval-harness installed: +# pip install lm-eval==0.4.4 + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using huggingface transformers." + echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -b - batch size to run the evaluation at" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo " -t - tensor parallel size to run at" + echo +} + +while getopts "m:b:l:f:t:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? 
) + usage + exit 1 + ;; + esac +done + +lm_eval --model vllm \ + --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \ + --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ + --batch_size "$BATCH_SIZE" diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/run-tests.sh b/vllm-v0.6.2/.buildkite/lm-eval-harness/run-tests.sh new file mode 100644 index 0000000..26f33b7 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/run-tests.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using vllm and compares to " + echo "precomputed baseline (measured by HF transformers.)" + echo + echo "usage: ${0} " + echo + echo " -c - path to the test data config (e.g. configs/small-models.txt)" + echo " -t - tensor parallel size" + echo +} + +SUCCESS=0 + +while getopts "c:t:" OPT; do + case ${OPT} in + c ) + CONFIG="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +# Parse list of configs. +IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG" + +for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" +do + LOCAL_SUCCESS=0 + + echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE===" + + export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG} + export LM_EVAL_TP_SIZE=$TP_SIZE + pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$? 
+ + if [[ $LOCAL_SUCCESS == 0 ]]; then + echo "=== PASSED MODEL: ${MODEL_CONFIG} ===" + else + echo "=== FAILED MODEL: ${MODEL_CONFIG} ===" + fi + + SUCCESS=$((SUCCESS + LOCAL_SUCCESS)) + +done + +if [ "${SUCCESS}" -eq "0" ]; then + exit 0 +else + exit 1 +fi diff --git a/vllm-v0.6.2/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/vllm-v0.6.2/.buildkite/lm-eval-harness/test_lm_eval_correctness.py new file mode 100644 index 0000000..afc935c --- /dev/null +++ b/vllm-v0.6.2/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -0,0 +1,63 @@ +""" +LM eval harness on model to compare vs HF baseline computed offline. +Configs are found in configs/$MODEL.yaml + +* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml +* export LM_EVAL_TP_SIZE=4 +* pytest -s test_lm_eval_correctness.py +""" + +import os +from pathlib import Path + +import lm_eval +import numpy +import yaml + +RTOL = 0.05 +TEST_DATA_FILE = os.environ.get( + "LM_EVAL_TEST_DATA_FILE", + ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml") + +TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1) + + +def launch_lm_eval(eval_config): + trust_remote_code = eval_config.get('trust_remote_code', False) + + model_args = f"pretrained={eval_config['model_name']}," \ + f"tensor_parallel_size={TP_SIZE}," \ + f"add_bos_token=true," \ + f"trust_remote_code={trust_remote_code}" + + results = lm_eval.simple_evaluate( + model="vllm", + model_args=model_args, + tasks=[task["name"] for task in eval_config["tasks"]], + num_fewshot=eval_config["num_fewshot"], + limit=eval_config["limit"], + batch_size="auto") + + return results + + +def test_lm_eval_correctness(): + eval_config = yaml.safe_load( + Path(TEST_DATA_FILE).read_text(encoding="utf-8")) + + # Launch eval requests. + results = launch_lm_eval(eval_config) + + # Confirm scores match ground truth. 
+ success = True + for task in eval_config["tasks"]: + for metric in task["metrics"]: + ground_truth = metric["value"] + measured_value = results["results"][task["name"]][metric["name"]] + print(f'{task["name"]} | {metric["name"]}: ' + f'ground_truth={ground_truth} | measured={measured_value}') + success = success and numpy.isclose( + ground_truth, measured_value, rtol=RTOL) + + # Assert at the end, print all scores even on failure for debugging. + assert success diff --git a/vllm-v0.6.2/.buildkite/nightly-benchmarks/README.md b/vllm-v0.6.2/.buildkite/nightly-benchmarks/README.md new file mode 100644 index 0000000..fbf41eb --- /dev/null +++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/README.md @@ -0,0 +1,153 @@ +# vLLM benchmark suite + + +## Introduction + +This directory contains two sets of benchmarks for vllm. +- Performance benchmark: benchmark vllm's performance under various workloads, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance +- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm. + + +See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for the latest nightly benchmark results. + + +## Performance benchmark quick overview + +**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models. + +**Benchmarking Duration**: about 1hr. + +**For benchmarking developers**: please try your best to constrain the duration of benchmarking to about 1 hr so that it won't take forever to run. + + +## Nightly benchmark quick overview + +**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B. 
+ +**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy. + +**Benchmarking Duration**: about 3.5hrs. + + + +## Trigger the benchmark + +Performance benchmark will be triggered when: +- A PR is merged into vllm. +- Every commit for those PRs with `perf-benchmarks` label AND `ready` label. + +Nightly benchmark will be triggered when: +- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label. + + + + +## Performance benchmark details + + +See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. + + +#### Latency test + +Here is an example of one test inside `latency-tests.json`: + +```json +[ + { + "test_name": "latency_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + } +] +``` + +In this example: +- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`. +- The `parameters` attribute controls the command line arguments to be used for `benchmark_latency.py`. Please use underscores `_` instead of dashes `-` when specifying the command line arguments; `run-performance-benchmarks.sh` will convert the underscores to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` + +Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly. + +WARNING: The benchmarking script will save json results by itself, so please do not configure the `--output-json` parameter in the json file. 
+ + +#### Throughput test +The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are passed to `benchmark_throughput.py`. + +The numbers reported by this test are also stable, so even a slight change in them may indicate a significant change in performance. + +#### Serving test +We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example: + +``` +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + } +] +``` + +Inside this example: +- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`. +- The `server_parameters` attribute includes the command line arguments for the vLLM server. +- The `client_parameters` attribute includes the command line arguments for `benchmark_serving.py`. +- The `qps_list` attribute controls the list of QPS values to test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py` + +The numbers reported by this test are less stable than those of the latency and throughput tests (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change in them (e.g. a 5% change) still indicates a significant change in performance. + +WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`. 
+ +#### Visualizing the results +The `convert-results-json-to-markdown.py` script helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results. +You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. +If you do not see the table, please wait until the benchmark finishes running. +The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file. +The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking job. + + + +## Nightly test details + +See [nightly-descriptions.md](nightly-descriptions.md) for a detailed description of the test workloads, models and docker containers used for benchmarking other LLM engines. + + +#### Workflow + +- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines. +- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container. +- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark. +- Finally, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and upload the results to buildkite. + +#### Nightly tests + +In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is very similar to that of the performance benchmark. + +#### Docker containers + +The docker containers for benchmarking are specified in `nightly-pipeline.yaml`. + +WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. 
The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`. + +WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git). + diff --git a/vllm-v0.6.2/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/vllm-v0.6.2/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml new file mode 100644 index 0000000..eec2a51 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -0,0 +1,60 @@ +steps: + - label: "Wait for container to be ready" + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + containers: + - image: badouralix/curl-jq + command: + - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh + - wait + - label: "A100" + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + command: + - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + # - label: "H100" + # agents: + # queue: H100 + # plugins: + # - docker#v5.11.0: + # image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + # command: + # - bash + # - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh + # mount-buildkite-agent: true + # propagate-environment: true + # ipc: host + # gpus: all + # environment: + # - VLLM_USAGE_SOURCE + # - HF_TOKEN + diff --git a/vllm-v0.6.2/.buildkite/nightly-benchmarks/nightly-annotation.md 
b/vllm-v0.6.2/.buildkite/nightly-benchmarks/nightly-annotation.md new file mode 100644 index 0000000..1e33793 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/nightly-annotation.md @@ -0,0 +1,28 @@ + +## Description + +This file contains the download links for the benchmarking results. + +- [benchmarking pipeline](artifact://nightly-pipeline.yaml) +- [benchmarking results](artifact://results.zip) +- [benchmarking code](artifact://nightly-benchmarks.zip) + +Please download the visualization scripts in the post. + + +## Results reproduction + +- Find the docker image we use in `benchmarking pipeline`. +- Deploy the docker container, and inside the container: + - Download `nightly-benchmarks.zip`. + - In the same folder, run the following code: +``` +export HF_TOKEN= +apt update +apt install -y git +unzip nightly-benchmarks.zip +VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +``` + +The results will be inside `./benchmarks/results`. + diff --git a/vllm-v0.6.2/.buildkite/nightly-benchmarks/nightly-descriptions.md b/vllm-v0.6.2/.buildkite/nightly-benchmarks/nightly-descriptions.md new file mode 100644 index 0000000..7dec7a0 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -0,0 +1,39 @@ + +# Nightly benchmark + +This benchmark aims to: +- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance under which workload. +- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker image by following the reproduction instructions. + +Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end. 
+ +Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176) + + +## Setup + +- Docker images: + - vLLM: `vllm/vllm-openai:v0.6.2` + - SGLang: `lmsysorg/sglang:v0.3.2-cu121` + - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12` + - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3` + - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.* + - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark. +- Hardware + - 8x Nvidia A100 GPUs +- Workload: + - Dataset + - ShareGPT dataset + - Prefill-heavy dataset (on average 462 input tokens, 16 output tokens) + - Decode-heavy dataset (on average 462 input tokens, 256 output tokens) + - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use. + - Models: llama-3 8B, llama-3 70B. + - We do not use llama 3.1 as it is incompatible with trt-llm r24.07 ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)). + - Average QPS (queries per second): 2, 4, 8, 16, 32 and inf. + - Queries are randomly sampled, and arrival patterns are determined via a Poisson process, all with a fixed random seed. + - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better). + +# Known issues + +- TRT-LLM crashes with Llama 3.1 8B ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)). +- TGI does not support the `ignore-eos` flag. 
\ No newline at end of file diff --git a/vllm-v0.6.2/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/vllm-v0.6.2/.buildkite/nightly-benchmarks/nightly-pipeline.yaml new file mode 100644 index 0000000..199517e --- /dev/null +++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -0,0 +1,196 @@ +common_pod_spec: &common_pod_spec + priorityClassName: perf-benchmark + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /root/.cache/huggingface + type: Directory + +common_container_settings: &common_container_settings + command: + - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + +steps: + - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours." 
+ + + + - label: "A100 vllm step 10" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: vllm/vllm-openai:v0.6.2 + <<: *common_container_settings + + + + - label: "A100 sglang benchmark" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: lmsysorg/sglang:v0.3.2-cu121 + <<: *common_container_settings + + - label: "A100 lmdeploy benchmark" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: openmmlab/lmdeploy:v0.6.1-cu12 + <<: *common_container_settings + + + + + - label: "A100 trt llama-8B" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 + <<: *common_container_settings + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + - name: TEST_SELECTOR + value: "llama8B" + + + - label: "A100 trt llama-70B" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 + <<: *common_container_settings + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + - name: TEST_SELECTOR + value: "llama70B" + + + # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image + # - label: "A100 trt benchmark" + # priority: 100 + # agents: + # queue: A100 + # 
plugins: + # - kubernetes: + # podSpec: + # <<: *common_pod_spec + # containers: + # - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 + # <<: *common_container_settings + + + # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`. + # - label: "A100 tgi benchmark" + # priority: 100 + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # <<: *common_pod_spec + # containers: + # - image: ghcr.io/huggingface/text-generation-inference:2.2.0 + # <<: *common_container_settings + + - wait + + - label: "Collect the results" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: vllm/vllm-openai:v0.5.0.post1 + command: + - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + + - block: ":rocket: check the results!" \ No newline at end of file diff --git a/vllm-v0.6.2/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md b/vllm-v0.6.2/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md new file mode 100644 index 0000000..da32d1f --- /dev/null +++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md @@ -0,0 +1,62 @@ + +## Latency tests + +- Input length: 32 tokens. +- Output length: 128 tokens. +- Batch size: fixed (8). +- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. +- Evaluation metrics: end-to-end latency (mean, median, p99). + + +{latency_tests_markdown_table} + + +## Throughput tests + +- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). +- Output length: the corresponding output length of these 200 prompts. 
+- Batch size: dynamically determined by vllm to achieve maximum throughput. +- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. +- Evaluation metrics: throughput. + + +{throughput_tests_markdown_table} + + +## Serving tests + +- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). +- Output length: the corresponding output length of these 200 prompts. +- Batch size: dynamically determined by vllm and the arrival pattern of the requests. +- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed). +- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. +- We also added a speculative decoding test for llama-3 70B, under QPS 2 +- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). + + +{serving_tests_markdown_table} + + +## json version of the benchmarking tables + +This section contains the data of the markdown tables above in JSON format. +You can load the benchmarking tables into pandas dataframes as follows: + +```python +import json +import pandas as pd + +benchmarking_results_json = """The json string""" +benchmarking_results = json.loads(benchmarking_results_json) +latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"]) +throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"]) +serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"]) +``` + +The json string for all benchmarking tables: +```json +{benchmarking_results_in_json_string} +``` + +You can also check the raw experiment data in the Artifact tab of the Buildkite page. 
+ diff --git a/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py new file mode 100644 index 0000000..7cf0561 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -0,0 +1,192 @@ +import json +import os +from pathlib import Path + +import pandas as pd +from tabulate import tabulate + +results_folder = Path("results/") + +# latency results and the keys that will be printed into markdown +latency_results = [] +latency_column_mapping = { + "test_name": "Test name", + "gpu_type": "GPU", + "avg_latency": "Mean latency (ms)", + # "P10": "P10 (s)", + # "P25": "P25 (s)", + "P50": "Median latency (ms)", + # "P75": "P75 (s)", + # "P90": "P90 (s)", + "P99": "P99 latency (ms)", +} + +# throughput tests and the keys that will be printed into markdown +throughput_results = [] +throughput_results_column_mapping = { + "test_name": "Test name", + "gpu_type": "GPU", + # "num_requests": "# of req.", + # "total_num_tokens": "Total # of tokens", + # "elapsed_time": "Elapsed time (s)", + "requests_per_second": "Tput (req/s)", + # "tokens_per_second": "Tput (tok/s)", +} + +# serving results and the keys that will be printed into markdown +serving_results = [] +serving_column_mapping = { + "test_name": "Test name", + "gpu_type": "GPU", + # "completed": "# of req.", + "request_throughput": "Tput (req/s)", + # "input_throughput": "Input Tput (tok/s)", + # "output_throughput": "Output Tput (tok/s)", + "mean_ttft_ms": "Mean TTFT (ms)", + "median_ttft_ms": "Median TTFT (ms)", + "p99_ttft_ms": "P99 TTFT (ms)", + # "mean_tpot_ms": "Mean TPOT (ms)", + # "median_tpot_ms": "Median", + # "p99_tpot_ms": "P99", + "mean_itl_ms": "Mean ITL (ms)", + "median_itl_ms": "Median ITL (ms)", + "p99_itl_ms": "P99 ITL (ms)", +} + + +def read_markdown(file): + if os.path.exists(file): + with open(file) as f: + return f.read() + "\n" + 
else: + return f"{file} not found.\n" + + +def results_to_json(latency, throughput, serving): + return json.dumps({ + 'latency': latency.to_dict(), + 'throughput': throughput.to_dict(), + 'serving': serving.to_dict() + }) + + +if __name__ == "__main__": + + # collect results + for test_file in results_folder.glob("*.json"): + + with open(test_file) as f: + raw_result = json.loads(f.read()) + + if "serving" in str(test_file): + # this result is generated via `benchmark_serving.py` + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands")) as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + serving_results.append(raw_result) + continue + + elif "latency" in f.name: + # this result is generated via `benchmark_latency.py` + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands")) as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # get different percentiles + for perc in [10, 25, 50, 75, 90, 99]: + # Multiply 1000 to convert the time unit from s to ms + raw_result.update( + {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}) + raw_result["avg_latency"] = raw_result["avg_latency"] * 1000 + + # add the result to raw_result + latency_results.append(raw_result) + continue + + elif "throughput" in f.name: + # this result is generated via `benchmark_throughput.py` + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands")) as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + throughput_results.append(raw_result) + continue + + print(f"Skipping 
{test_file}") + + latency_results = pd.DataFrame.from_dict(latency_results) + serving_results = pd.DataFrame.from_dict(serving_results) + throughput_results = pd.DataFrame.from_dict(throughput_results) + + raw_results_json = results_to_json(latency_results, throughput_results, + serving_results) + + # remapping the key, for visualization purpose + if not latency_results.empty: + latency_results = latency_results[list( + latency_column_mapping.keys())].rename( + columns=latency_column_mapping) + if not serving_results.empty: + serving_results = serving_results[list( + serving_column_mapping.keys())].rename( + columns=serving_column_mapping) + if not throughput_results.empty: + throughput_results = throughput_results[list( + throughput_results_column_mapping.keys())].rename( + columns=throughput_results_column_mapping) + + processed_results_json = results_to_json(latency_results, + throughput_results, + serving_results) + + # get markdown tables + latency_md_table = tabulate(latency_results, + headers='keys', + tablefmt='pipe', + showindex=False) + serving_md_table = tabulate(serving_results, + headers='keys', + tablefmt='pipe', + showindex=False) + throughput_md_table = tabulate(throughput_results, + headers='keys', + tablefmt='pipe', + showindex=False) + + # document the result + with open(results_folder / "benchmark_results.md", "w") as f: + + results = read_markdown("../.buildkite/nightly-benchmarks/" + + "performance-benchmarks-descriptions.md") + results = results.format( + latency_tests_markdown_table=latency_md_table, + throughput_tests_markdown_table=throughput_md_table, + serving_tests_markdown_table=serving_md_table, + benchmarking_results_in_json_string=processed_results_json) + f.write(results) + + # document benchmarking results in json + with open(results_folder / "benchmark_results.json", "w") as f: + + results = latency_results.to_dict( + orient='records') + throughput_results.to_dict( + orient='records') + serving_results.to_dict(orient='records') + 
f.write(json.dumps(results)) diff --git a/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py new file mode 100644 index 0000000..68ac590 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py @@ -0,0 +1,26 @@ +import argparse + +from transformers import AutoTokenizer + + +def main(model, cachedir): + # Load the tokenizer and save it to the specified directory + tokenizer = AutoTokenizer.from_pretrained(model) + tokenizer.save_pretrained(cachedir) + print(f"Tokenizer saved to {cachedir}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Download and save Hugging Face tokenizer") + parser.add_argument("--model", + type=str, + required=True, + help="Name of the model") + parser.add_argument("--cachedir", + type=str, + required=True, + help="Directory to save the tokenizer") + + args = parser.parse_args() + main(args.model, args.cachedir) diff --git a/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py new file mode 100644 index 0000000..052060c --- /dev/null +++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py @@ -0,0 +1,95 @@ +import argparse +import json +from pathlib import Path + +import numpy as np +import pandas as pd +from tabulate import tabulate + + +def parse_arguments(): + parser = argparse.ArgumentParser( + description= + 'Parse command line arguments for summary-nightly-results script.') + parser.add_argument('--results-folder', + type=str, + required=True, + help='The folder where the results are stored.') + parser.add_argument('--description', + type=str, + required=True, + help='Description of the results.') + + args = parser.parse_args() + return args + + +def get_perf(df, method, model, metric): + + means = [] + + for qps in [2, 4, 8, 16, "inf"]: + 
target = df['Test name'].str.contains(model) + target = target & df['Engine'].str.contains(method) + target = target & df['Test name'].str.contains("qps_" + str(qps)) + filtered_df = df[target] + + if filtered_df.empty: + means.append(0.) + else: + means.append(filtered_df[metric].values[0]) + + return np.array(means) + + +def get_perf_w_std(df, method, model, metric): + + if metric in ["TTFT", "ITL"]: + mean = get_perf(df, method, model, "Mean " + metric + " (ms)") + mean = mean.tolist() + std = get_perf(df, method, model, "Std " + metric + " (ms)") + if std.mean() == 0: + std = None + success = get_perf(df, method, model, "Successful req.") + if std is not None: + std = std / np.sqrt(success) + std = std.tolist() + + else: + assert metric == "Tput" + mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf( + df, method, model, "Output Tput (tok/s)") + mean = mean.tolist() + std = None + + return mean, std + + +def main(args): + results_folder = Path(args.results_folder) + + results = [] + + # collect results + for test_file in results_folder.glob("*_nightly_results.json"): + with open(test_file) as f: + results = results + json.loads(f.read()) + + # generate markdown table + df = pd.DataFrame.from_dict(results) + + md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False) + + with open(args.description) as f: + description = f.read() + + description = description.format( + nightly_results_benchmarking_table=md_table) + + with open("nightly_results.md", "w") as f: + f.write(description) + + +if __name__ == '__main__': + args = parse_arguments() + main(args) diff --git a/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py new file mode 100644 index 0000000..18bcc3a --- /dev/null +++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py @@ -0,0 +1,6 @@ +from lmdeploy.serve.openai.api_client import APIClient + 
+api_client = APIClient("http://localhost:8000") +model_name = api_client.available_models[0] + +print(model_name) diff --git a/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/launch-server.sh new file mode 100644 index 0000000..fb5063d --- /dev/null +++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/launch-server.sh @@ -0,0 +1,228 @@ +#!/bin/bash + +# Currently FP8 benchmark is NOT enabled. + +set -x +server_params=$1 +common_params=$2 + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +launch_trt_server() { + + model_path=$(echo "$common_params" | jq -r '.model') + model_name="${model_path#*/}" + model_type=$(echo "$server_params" | jq -r '.model_type') + model_dtype=$(echo "$server_params" | jq -r '.model_dtype') + model_tp_size=$(echo "$common_params" | jq -r '.tp') + max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size') + max_input_len=$(echo "$server_params" | jq -r '.max_input_len') + max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len') + max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens') + trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version') + + # create model caching directory + cd ~ + rm -rf models + mkdir -p models + cd models + models_dir=$(pwd) + trt_model_path=${models_dir}/${model_name}-trt-ckpt + trt_engine_path=${models_dir}/${model_name}-trt-engine + + # clone tensorrt backend + cd / + rm -rf tensorrtllm_backend + git clone https://github.com/triton-inference-server/tensorrtllm_backend.git + git lfs install + cd 
tensorrtllm_backend + git checkout "$trt_llm_version" + git submodule update --init --recursive + + # build trtllm engine + cd /tensorrtllm_backend + cd "./tensorrt_llm/examples/${model_type}" + python3 convert_checkpoint.py \ + --model_dir "${model_path}" \ + --dtype "${model_dtype}" \ + --tp_size "${model_tp_size}" \ + --output_dir "${trt_model_path}" + trtllm-build \ + --checkpoint_dir "${trt_model_path}" \ + --use_fused_mlp \ + --reduce_fusion disable \ + --workers 8 \ + --gpt_attention_plugin "${model_dtype}" \ + --gemm_plugin "${model_dtype}" \ + --tp_size "${model_tp_size}" \ + --max_batch_size "${max_batch_size}" \ + --max_input_len "${max_input_len}" \ + --max_seq_len "${max_seq_len}" \ + --max_num_tokens "${max_num_tokens}" \ + --output_dir "${trt_engine_path}" + + # handle triton protobuf files and launch triton server + cd /tensorrtllm_backend + mkdir triton_model_repo + cp -r all_models/inflight_batcher_llm/* triton_model_repo/ + cd triton_model_repo + rm -rf ./tensorrt_llm/1/* + cp -r "${trt_engine_path}"/* ./tensorrt_llm/1 + python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false + python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5" + python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false" + python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size" + python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt 
"triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1" + cd /tensorrtllm_backend + python3 scripts/launch_triton_server.py \ + --world_size="${model_tp_size}" \ + --model_repo=/tensorrtllm_backend/triton_model_repo & + +} + +launch_tgi_server() { + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + port=$(echo "$common_params" | jq -r '.port') + server_args=$(json2args "$server_params") + + if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then + echo "Key 'fp8' exists in common params." + server_command="/tgi-entrypoint.sh \ + --model-id $model \ + --num-shard $tp \ + --port $port \ + --quantize fp8 \ + $server_args" + else + echo "Key 'fp8' does not exist in common params." + server_command="/tgi-entrypoint.sh \ + --model-id $model \ + --num-shard $tp \ + --port $port \ + $server_args" + fi + + echo "Server command: $server_command" + eval "$server_command" & + +} + +launch_lmdeploy_server() { + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + port=$(echo "$common_params" | jq -r '.port') + server_args=$(json2args "$server_params") + + server_command="lmdeploy serve api_server $model \ + --tp $tp \ + --server-port $port \ + $server_args" + + # run the server + echo "Server command: $server_command" + bash -c "$server_command" & +} + +launch_sglang_server() { + + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + port=$(echo "$common_params" | jq -r '.port') + server_args=$(json2args "$server_params") + + if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then + echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." 
+ model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') + server_command="python3 \ + -m sglang.launch_server \ + --tp $tp \ + --model-path $model \ + --port $port \ + $server_args" + else + echo "Key 'fp8' does not exist in common params." + server_command="python3 \ + -m sglang.launch_server \ + --tp $tp \ + --model-path $model \ + --port $port \ + $server_args" + fi + + # run the server + echo "Server command: $server_command" + eval "$server_command" & +} + +launch_vllm_server() { + + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + port=$(echo "$common_params" | jq -r '.port') + server_args=$(json2args "$server_params") + + if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then + echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." + model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') + server_command="python3 \ + -m vllm.entrypoints.openai.api_server \ + -tp $tp \ + --model $model \ + --port $port \ + $server_args" + else + echo "Key 'fp8' does not exist in common params." 
+ server_command="python3 \ + -m vllm.entrypoints.openai.api_server \ + -tp $tp \ + --model $model \ + --port $port \ + $server_args" + fi + + # run the server + echo "Server command: $server_command" + eval "$server_command" & +} + +main() { + + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then + launch_trt_server + fi + + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then + launch_tgi_server + fi + + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then + launch_lmdeploy_server + fi + + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then + launch_sglang_server + fi + + if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then + launch_vllm_server + fi +} + +main diff --git a/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh new file mode 100644 index 0000000..686f70d --- /dev/null +++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +set -ex +set -o pipefail + + +main() { + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get update && apt-get -y install jq) + (which zip) || (apt-get install -y zip) + + if [ ! -f /workspace/buildkite-agent ]; then + echo "buildkite-agent binary not found. Skip plotting the results." 
+ exit 0 + fi + + # initial annotation + #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md" + + # download results + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" + mkdir -p results/ + /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/ + ls + ls results/ + + # upload benchmark results + zip -r results.zip results/ + /workspace/buildkite-agent artifact upload "results.zip" + + # upload benchmarking scripts + cd "$VLLM_SOURCE_CODE_LOC/" + zip -r nightly-benchmarks.zip .buildkite/ benchmarks/ + /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip" + + cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" + # upload benchmarking pipeline + /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml" + + cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" + /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md + + + + # The figures should be genereated by a separate process outside the CI/CD pipeline + + # # generate figures + # python3 -m pip install tabulate pandas matplotlib + + # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \ + # --description $description \ + # --results-folder results/ + + + # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ + # --description $description \ + # --results-folder results/ \ + # --dataset sharegpt + + # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ + # --description $description \ + # --results-folder results/ \ + # --dataset sonnet_2048_128 + + # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ + # --description $description \ + # --results-folder results/ \ + # --dataset sonnet_128_2048 + + # # upload results and figures + # /workspace/buildkite-agent artifact upload 
"nightly_results*.png" + # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml + # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json + # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md +} + +main "$@" diff --git a/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh new file mode 100644 index 0000000..3f38cf5 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -0,0 +1,355 @@ +#!/bin/bash + +set -o pipefail +set -x + +check_gpus() { + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + if [[ $gpu_count -gt 0 ]]; then + echo "GPU found." + else + echo "Need at least 1 GPU to run benchmarking." + exit 1 + fi + declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')" + echo "GPU type is $gpu_type" +} + +check_hf_token() { + # check if HF_TOKEN is available and valid + if [[ -z "$HF_TOKEN" ]]; then + echo "Error: HF_TOKEN is not set." + exit 1 + elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then + echo "Error: HF_TOKEN does not start with 'hf_'." + exit 1 + else + echo "HF_TOKEN is set and valid." + fi +} + + +upload_to_buildkite() { + # upload the benchmarking results to buildkite + + # if the agent binary is not found, skip uploading the results, exit 0 + if [ ! -f /workspace/buildkite-agent ]; then + echo "buildkite-agent binary not found. Skip uploading the results." 
+ return 0 + fi + # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" +} + + +get_current_llm_serving_engine() { + + if which lmdeploy >/dev/null; then + echo "Container: lmdeploy" + export CURRENT_LLM_SERVING_ENGINE=lmdeploy + return + fi + + if [ -e /tgi-entrypoint.sh ]; then + echo "Container: tgi" + export CURRENT_LLM_SERVING_ENGINE=tgi + return + fi + + if which trtllm-build >/dev/null; then + echo "Container: tensorrt-llm" + export CURRENT_LLM_SERVING_ENGINE=trt + return + fi + + if [ -e /sgl-workspace ]; then + echo "Container: sglang" + export CURRENT_LLM_SERVING_ENGINE=sglang + return + fi + + if [ -e /vllm-workspace ]; then + echo "Container: vllm" + # move to a completely irrelevant directory, to avoid import vllm from current folder + export CURRENT_LLM_SERVING_ENGINE=vllm + + return + fi +} + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +kill_gpu_processes() { + pkill -f python + pkill -f python3 + pkill -f tritonserver + pkill -f pt_main_thread + pkill -f text-generation + pkill -f lmdeploy + + while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do + sleep 1 + done +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + timeout 1200 bash -c ' + until curl -s localhost:8000/v1/completions > /dev/null; do + sleep 1 + done' && return 0 || return 1 +} + +ensure_installed() { + # Ensure 
that the given command is installed by apt-get + local cmd=$1 + if ! which "$cmd" >/dev/null; then + apt-get update && apt-get install -y "$cmd" + fi +} + +run_serving_tests() { + # run serving tests using `benchmark_serving.py` + # $1: a json file specifying serving test cases + + local serving_test_file + serving_test_file=$1 + + # Iterate over serving tests + jq -c '.[]' "$serving_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # prepend the current serving engine to the test name + test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} + + # get common parameters + common_params=$(echo "$params" | jq -r '.common_parameters') + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + reuse_server=$(echo "$common_params" | jq -r '.reuse_server') + + # get client and server arguments + server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") + client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters") + client_args=$(json2args "$client_params") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there is enough GPU to run the test + if [[ $gpu_count -lt $tp ]]; then + echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." 
+ continue + fi + + if [[ $reuse_server == "true" ]]; then + echo "Reuse previous server for test case $test_name" + else + kill_gpu_processes + bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ + "$server_params" "$common_params" + fi + + if wait_for_server; then + echo "" + echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." + else + echo "" + echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." + break + fi + + # prepare tokenizer + # this is required for lmdeploy. + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" + rm -rf /tokenizer_cache + mkdir /tokenizer_cache + python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ + --model "$model" \ + --cachedir /tokenizer_cache + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" + + + # change model name for lmdeploy (it will not follow standard hf name) + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then + model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py) + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps="inf" + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + + backend=$CURRENT_LLM_SERVING_ENGINE + + if [[ $backend = "trt" ]]; then + backend="tensorrt-llm" + fi + + if [[ "$backend" == *"vllm"* ]]; then + backend="vllm" + fi + + if [[ "$dataset_name" = "sharegpt" ]]; then + + client_command="python3 benchmark_serving.py \ + --backend $backend \ + --tokenizer /tokenizer_cache \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --num-prompts $num_prompts \ + --port $port \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + --ignore-eos \ + $client_args" + + elif [[ "$dataset_name" = "sonnet" ]]; then + + sonnet_input_len=$(echo "$common_params" | jq -r 
'.sonnet_input_len') + sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len') + sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len') + + client_command="python3 benchmark_serving.py \ + --backend $backend \ + --tokenizer /tokenizer_cache \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --num-prompts $num_prompts \ + --sonnet-input-len $sonnet_input_len \ + --sonnet-output-len $sonnet_output_len \ + --sonnet-prefix-len $sonnet_prefix_len \ + --port $port \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + --ignore-eos \ + $client_args" + + else + + echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name." + exit 1 + + fi + + + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + eval "$client_command" + + server_command="None" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + --arg engine "$CURRENT_LLM_SERVING_ENGINE" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu, + engine: $engine + }') + echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" + + done + + done + + kill_gpu_processes +} + + +prepare_dataset() { + + # download sharegpt dataset + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" + wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + + # duplicate sonnet by 4x, to allow benchmarking with input length 2048 + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" + echo "" > sonnet_4x.txt + for _ in {1..4} + do + cat sonnet.txt >> sonnet_4x.txt + done + +} + +main() { + + # check if the environment variable is successfully injected from yaml + + check_gpus + check_hf_token + get_current_llm_serving_engine + + pip install -U transformers + + # 
check storage + df -h + + ensure_installed wget + ensure_installed curl + ensure_installed jq + + prepare_dataset + + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" + declare -g RESULTS_FOLDER=results/ + mkdir -p $RESULTS_FOLDER + BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" + + # run the test + run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json" + + # upload benchmark results to buildkite + python3 -m pip install tabulate pandas + python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py" + upload_to_buildkite + +} + +main "$@" diff --git a/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh new file mode 100644 index 0000000..d397b05 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -0,0 +1,380 @@ +#!/bin/bash + +# This script should be run inside the CI process +# This script assumes that we are already inside the vllm/ directory +# Benchmarking results will be available inside vllm/benchmarks/results/ + +# Do not set -e, as the mixtral 8x22B model tends to crash occasionally +# and we still want to see other benchmarking results even when mixtral crashes. +set -o pipefail + +check_gpus() { + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + if [[ $gpu_count -gt 0 ]]; then + echo "GPU found." + else + echo "Need at least 1 GPU to run benchmarking." + exit 1 + fi + declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}') + echo "GPU type is $gpu_type" +} + +check_hf_token() { + # check if HF_TOKEN is available and valid + if [[ -z "$HF_TOKEN" ]]; then + echo "Error: HF_TOKEN is not set." + exit 1 + elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then + echo "Error: HF_TOKEN does not start with 'hf_'." + exit 1 + else + echo "HF_TOKEN is set and valid." 
+ fi +} + +ensure_sharegpt_downloaded() { + local FILE=ShareGPT_V3_unfiltered_cleaned_split.json + if [ ! -f "$FILE" ]; then + wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE + else + echo "$FILE already exists." + fi +} + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + timeout 1200 bash -c ' + until curl -X POST localhost:8000/v1/completions; do + sleep 1 + done' && return 0 || return 1 +} + +kill_processes_launched_by_current_bash() { + # Kill all python processes launched from current bash script + current_shell_pid=$$ + processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}') + if [ -n "$processes" ]; then + echo "Killing the following processes matching '$1':" + echo "$processes" + echo "$processes" | xargs kill -9 + else + echo "No processes found matching '$1'." 
+ fi +} + +kill_gpu_processes() { + + ps -aux + lsof -t -i:8000 | xargs -r kill -9 + pkill -f pt_main_thread + # this line doesn't work now + # ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9 + pkill -f python3 + pkill -f /usr/bin/python3 + + + # wait until GPU memory usage smaller than 1GB + while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do + sleep 1 + done + + # remove vllm config file + rm -rf ~/.config/vllm + +} + +upload_to_buildkite() { + # upload the benchmarking results to buildkite + + # if the agent binary is not found, skip uploading the results, exit 0 + # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent + if command -v buildkite-agent >/dev/null 2>&1; then + BUILDKITE_AGENT_COMMAND="buildkite-agent" + elif [ -f /workspace/buildkite-agent ]; then + BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent" + else + echo "buildkite-agent binary not found. Skip uploading the results." + return 0 + fi + + # Use the determined command to annotate and upload artifacts + $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md" + $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" +} + +run_latency_tests() { + # run latency tests using `benchmark_latency.py` + # $1: a json file specifying latency test cases + + local latency_test_file + latency_test_file=$1 + + # Iterate over latency tests + jq -c '.[]' "$latency_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + if [[ ! "$test_name" =~ ^latency_ ]]; then + echo "In latency-test.json, test_name must start with \"latency_\"." + exit 1 + fi + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! 
"$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # get arguments + latency_params=$(echo "$params" | jq -r '.parameters') + latency_args=$(json2args "$latency_params") + + # check if there is enough GPU to run the test + tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') + if [[ $gpu_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi + + latency_command="python3 benchmark_latency.py \ + --output-json $RESULTS_FOLDER/${test_name}.json \ + $latency_args" + + echo "Running test case $test_name" + echo "Latency command: $latency_command" + + # recoding benchmarking command ang GPU command + jq_output=$(jq -n \ + --arg latency "$latency_command" \ + --arg gpu "$gpu_type" \ + '{ + latency_command: $latency, + gpu_type: $gpu + }') + echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands" + + # run the benchmark + eval "$latency_command" + + kill_gpu_processes + + done +} + +run_throughput_tests() { + # run throughput tests using `benchmark_throughput.py` + # $1: a json file specifying throughput test cases + + local throughput_test_file + throughput_test_file=$1 + + # Iterate over throughput tests + jq -c '.[]' "$throughput_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + if [[ ! "$test_name" =~ ^throughput_ ]]; then + echo "In throughput-test.json, test_name must start with \"throughput_\"." + exit 1 + fi + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." 
+ continue + fi + + # get arguments + throughput_params=$(echo "$params" | jq -r '.parameters') + throughput_args=$(json2args "$throughput_params") + + # check if there is enough GPU to run the test + tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size') + if [[ $gpu_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi + + throughput_command="python3 benchmark_throughput.py \ + --output-json $RESULTS_FOLDER/${test_name}.json \ + $throughput_args" + + echo "Running test case $test_name" + echo "Throughput command: $throughput_command" + # recoding benchmarking command ang GPU command + jq_output=$(jq -n \ + --arg command "$throughput_command" \ + --arg gpu "$gpu_type" \ + '{ + throughput_command: $command, + gpu_type: $gpu + }') + echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands" + + # run the benchmark + eval "$throughput_command" + + kill_gpu_processes + + done +} + +run_serving_tests() { + # run serving tests using `benchmark_serving.py` + # $1: a json file specifying serving test cases + + local serving_test_file + serving_test_file=$1 + + # Iterate over serving tests + jq -c '.[]' "$serving_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + if [[ ! "$test_name" =~ ^serving_ ]]; then + echo "In serving-test.json, test_name must start with \"serving_\"." + exit 1 + fi + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." 
+ continue + fi + + # get client and server arguments + server_params=$(echo "$params" | jq -r '.server_parameters') + client_params=$(echo "$params" | jq -r '.client_parameters') + server_args=$(json2args "$server_params") + client_args=$(json2args "$client_params") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there is enough GPU to run the test + tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') + if [[ $gpu_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi + + # check if server model and client model is aligned + server_model=$(echo "$server_params" | jq -r '.model') + client_model=$(echo "$client_params" | jq -r '.model') + if [[ $server_model != "$client_model" ]]; then + echo "Server model and client model must be the same. Skip testcase $test_name." + continue + fi + + server_command="python3 \ + -m vllm.entrypoints.openai.api_server \ + $server_args" + + # run the server + echo "Running test case $test_name" + echo "Server command: $server_command" + eval "$server_command" & + server_pid=$! + + # wait until the server is alive + if wait_for_server; then + echo "" + echo "vllm server is up and running." + else + echo "" + echo "vllm failed to start within the timeout period." 
+ fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps="inf" + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + + client_command="python3 benchmark_serving.py \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + $client_args" + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + eval "$client_command" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu + }') + echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" + + done + + # clean up + kill -9 $server_pid + kill_gpu_processes + done +} + +main() { + check_gpus + check_hf_token + + # dependencies + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get update && apt-get -y install jq) + (which lsof) || (apt-get update && apt-get install -y lsof) + + # get the current IP address, required by benchmark_serving.py + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + # turn of the reporting of the status of each request, to clean up the terminal output + export VLLM_LOG_LEVEL="WARNING" + + # prepare for benchmarking + cd benchmarks || exit 1 + ensure_sharegpt_downloaded + declare -g RESULTS_FOLDER=results/ + mkdir -p $RESULTS_FOLDER + QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + + # benchmarking + run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json + run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json + run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json + + # postprocess benchmarking results + pip install tabulate pandas + python3 
$QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py + + upload_to_buildkite +} + +main "$@" diff --git a/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py new file mode 100644 index 0000000..92d6fad --- /dev/null +++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -0,0 +1,83 @@ +import datetime +import json +import os +from pathlib import Path + +import pandas as pd +from tabulate import tabulate + +results_folder = Path("results/") + +# serving results and the keys that will be printed into markdown +serving_results = [] +serving_column_mapping = { + "test_name": "Test name", + "gpu_type": "GPU", + "completed": "Successful req.", + "request_throughput": "Tput (req/s)", + "mean_ttft_ms": "Mean TTFT (ms)", + "std_ttft_ms": "Std TTFT (ms)", + "median_ttft_ms": "Median TTFT (ms)", + "mean_itl_ms": "Mean ITL (ms)", + "std_itl_ms": "Std ITL (ms)", + "median_itl_ms": "Median ITL (ms)", + "mean_tpot_ms": "Mean TPOT (ms)", + "std_tpot_ms": "Std TPOT (ms)", + "median_tpot_ms": "Median TPOT (ms)", + "total_token_throughput": "Total Token Tput (tok/s)", + "output_throughput": "Output Tput (tok/s)", + "total_input_tokens": "Total input tokens", + "total_output_tokens": "Total output tokens", + "engine": "Engine", +} + +if __name__ == "__main__": + + # collect results + for test_file in results_folder.glob("*.json"): + + with open(test_file) as f: + raw_result = json.loads(f.read()) + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands")) as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + serving_results.append(raw_result) + continue + + serving_results = pd.DataFrame.from_dict(serving_results) + + if not serving_results.empty: + 
serving_results = serving_results[list( + serving_column_mapping.keys())].rename( + columns=serving_column_mapping) + + serving_md_table_with_headers = tabulate(serving_results, + headers='keys', + tablefmt='pipe', + showindex=False) + # remove the first line of header + serving_md_table_lines = serving_md_table_with_headers.split('\n') + serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:]) + + prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE") + + # document benchmarking results in markdown + with open(results_folder / f"{prefix}_nightly_results.md", "w") as f: + # document results with header. + # for those who wants to reproduce our benchmark. + f.write(serving_md_table_with_headers) + f.write('\n') + + # document benchmarking results in json + with open(results_folder / f"{prefix}_nightly_results.json", "w") as f: + + results = serving_results.to_dict(orient='records') + f.write(json.dumps(results)) diff --git a/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh new file mode 100644 index 0000000..19f7160 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh @@ -0,0 +1,19 @@ +#!/bin/sh +TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token) +URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" + +TIMEOUT_SECONDS=10 + +retries=0 +while [ $retries -lt 1000 ]; do + if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then + exit 0 + fi + + echo "Waiting for image to be available..." 
+ + retries=$((retries + 1)) + sleep 5 +done + +exit 1 diff --git a/vllm-v0.6.2/.buildkite/nightly-benchmarks/tests/latency-tests.json b/vllm-v0.6.2/.buildkite/nightly-benchmarks/tests/latency-tests.json new file mode 100644 index 0000000..1841186 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/tests/latency-tests.json @@ -0,0 +1,32 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + }, + { + "test_name": "latency_llama70B_tp4", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15 + } + }, + { + "test_name": "latency_mixtral8x7B_tp2", + "parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15 + } + } +] \ No newline at end of file diff --git a/vllm-v0.6.2/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/vllm-v0.6.2/.buildkite/nightly-benchmarks/tests/nightly-tests.json new file mode 100644 index 0000000..fda1a7a --- /dev/null +++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -0,0 +1,323 @@ +[ + { + "test_name": "llama8B_tp1_sharegpt", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "tp": 1, + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 500, + "port": 8000, + "reuse_server": false + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + 
"max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "enable_torch_compile": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama8B_tp1_sonnet_512_16", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "tp": 1, + "dataset_name": "sonnet", + "dataset_path": "./sonnet_4x.txt", + "num_prompts": 500, + "port": 8000, + "sonnet_input_len": 512, + "sonnet_output_len": 16, + "sonnet_prefix_len": 50, + "reuse_server": true + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "enable_torch_compile": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama8B_tp1_sonnet_512_256", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": 
"meta-llama/Meta-Llama-3-8B-Instruct", + "tp": 1, + "dataset_name": "sonnet", + "dataset_path": "./sonnet_4x.txt", + "num_prompts": 500, + "port": 8000, + "sonnet_input_len": 512, + "sonnet_output_len": 256, + "sonnet_prefix_len": 50, + "reuse_server": true + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "enable_torch_compile": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama70B_tp4_sharegpt", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tp": 4, + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 500, + "port": 8000, + "reuse_server": false + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + 
"vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama70B_tp4_sonnet_512_16", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tp": 4, + "dataset_name": "sonnet", + "dataset_path": "./sonnet_4x.txt", + "num_prompts": 500, + "port": 8000, + "sonnet_input_len": 512, + "sonnet_output_len": 16, + "sonnet_prefix_len": 50, + "reuse_server": true + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama70B_tp4_sonnet_512_256", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tp": 4, + "dataset_name": "sonnet", + "dataset_path": "./sonnet_4x.txt", + "num_prompts": 500, + "port": 8000, + "sonnet_input_len": 512, + "sonnet_output_len": 256, + "sonnet_prefix_len": 50, + "reuse_server": true + }, + 
"lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + } +] \ No newline at end of file diff --git a/vllm-v0.6.2/.buildkite/nightly-benchmarks/tests/serving-tests.json b/vllm-v0.6.2/.buildkite/nightly-benchmarks/tests/serving-tests.json new file mode 100644 index 0000000..facb0ea --- /dev/null +++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/tests/serving-tests.json @@ -0,0 +1,80 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama70B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + 
"load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_mixtral8x7B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama70B_tp4_sharegpt_specdecode", + "qps_list": [2], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "disable_log_requests": "", + "tensor_parallel_size": 4, + "swap_space": 16, + "speculative_model": "turboderp/Qwama-0.5B-Instruct", + "num_speculative_tokens": 4, + "speculative_draft_tensor_parallel_size": 1, + "use_v2_block_manager": "" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + } +] diff --git a/vllm-v0.6.2/.buildkite/nightly-benchmarks/tests/throughput-tests.json b/vllm-v0.6.2/.buildkite/nightly-benchmarks/tests/throughput-tests.json new file mode 100644 index 0000000..91ef6d1 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/tests/throughput-tests.json @@ -0,0 +1,35 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + 
{ + "test_name": "throughput_llama70B_tp4", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_mixtral8x7B_tp2", + "parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + } +] \ No newline at end of file diff --git a/vllm-v0.6.2/.buildkite/release-pipeline.yaml b/vllm-v0.6.2/.buildkite/release-pipeline.yaml new file mode 100644 index 0000000..f78e360 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/release-pipeline.yaml @@ -0,0 +1,28 @@ +steps: + - label: "Build wheel - CUDA 12.1" + agents: + queue: cpu_queue + commands: + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ." + - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + - "bash .buildkite/upload-wheels.sh" + env: + DOCKER_BUILDKIT: "1" + + # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working. + # However, this block can be uncommented to save some compute hours. + # - block: "Build CUDA 11.8 wheel" + # key: block-build-cu118-wheel + + - label: "Build wheel - CUDA 11.8" + # depends_on: block-build-cu118-wheel + agents: + queue: cpu_queue + commands: + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ." 
+ - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + - "bash .buildkite/upload-wheels.sh" + env: + DOCKER_BUILDKIT: "1" diff --git a/vllm-v0.6.2/.buildkite/run-amd-test.sh b/vllm-v0.6.2/.buildkite/run-amd-test.sh new file mode 100755 index 0000000..902e162 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/run-amd-test.sh @@ -0,0 +1,157 @@ +#!/bin/bash + +# This script runs test inside the corresponding ROCm docker container. +set -o pipefail + +# Print ROCm version +echo "--- Confirming Clean Initial State" +while true; do + sleep 3 + if grep -q clean /opt/amdgpu/etc/gpu_state; then + echo "GPUs state is \"clean\"" + break + fi +done + +echo "--- ROCm info" +rocminfo + +# cleanup older docker images +cleanup_docker() { + # Get Docker's root directory + docker_root=$(docker info -f '{{.DockerRootDir}}') + if [ -z "$docker_root" ]; then + echo "Failed to determine Docker root directory." + exit 1 + fi + echo "Docker root directory: $docker_root" + # Check disk usage of the filesystem where Docker's root directory is located + disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') + # Define the threshold + threshold=70 + if [ "$disk_usage" -gt "$threshold" ]; then + echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." + # Remove dangling images (those that are not tagged and not used by any container) + docker image prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune --force --filter "until=72h" --all + echo "Docker images and volumes cleanup completed." + else + echo "Disk usage is below $threshold%. No cleanup needed." 
+ fi +} + +# Call the cleanup docker function +cleanup_docker + +echo "--- Resetting GPUs" + +echo "reset" > /opt/amdgpu/etc/gpu_state + +while true; do + sleep 3 + if grep -q clean /opt/amdgpu/etc/gpu_state; then + echo "GPUs state is \"clean\"" + break + fi +done + +echo "--- Pulling container" +image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" +container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" +docker pull "${image_name}" + +remove_docker_container() { + docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true +} +trap remove_docker_container EXIT + +echo "--- Running container" + +HF_CACHE="$(realpath ~)/huggingface" +mkdir -p "${HF_CACHE}" +HF_MOUNT="/root/.cache/huggingface" + +commands=$@ +echo "Commands:$commands" +#ignore certain kernels tests +if [[ $commands == *" kernels "* ]]; then + commands="${commands} \ + --ignore=kernels/test_attention.py \ + --ignore=kernels/test_attention_selector.py \ + --ignore=kernels/test_blocksparse_attention.py \ + --ignore=kernels/test_causal_conv1d.py \ + --ignore=kernels/test_cutlass.py \ + --ignore=kernels/test_encoder_decoder_attn.py \ + --ignore=kernels/test_flash_attn.py \ + --ignore=kernels/test_flashinfer.py \ + --ignore=kernels/test_gguf.py \ + --ignore=kernels/test_int8_quant.py \ + --ignore=kernels/test_machete_gemm.py \ + --ignore=kernels/test_mamba_ssm.py \ + --ignore=kernels/test_marlin_gemm.py \ + --ignore=kernels/test_moe.py \ + --ignore=kernels/test_prefix_prefill.py \ + --ignore=kernels/test_rand.py \ + --ignore=kernels/test_sampler.py" +fi + +#ignore certain Entrypoints tests +if [[ $commands == *" entrypoints/openai "* ]]; then + commands=${commands//" entrypoints/openai "/" entrypoints/openai \ + --ignore=entrypoints/openai/test_accuracy.py \ + --ignore=entrypoints/openai/test_audio.py \ + --ignore=entrypoints/openai/test_encoder_decoder.py \ + --ignore=entrypoints/openai/test_embedding.py \ + 
--ignore=entrypoints/openai/test_oot_registration.py "} +fi + +PARALLEL_JOB_COUNT=8 +# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. +if [[ $commands == *"--shard-id="* ]]; then + # assign job count as the number of shards used + commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} + for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do + # assign shard-id for each shard + commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "} + echo "Shard ${GPU} commands:$commands_gpu" + docker run \ + --device /dev/kfd --device /dev/dri \ + --network host \ + --shm-size=16gb \ + --rm \ + -e HIP_VISIBLE_DEVICES="${GPU}" \ + -e HF_TOKEN \ + -v "${HF_CACHE}:${HF_MOUNT}" \ + -e "HF_HOME=${HF_MOUNT}" \ + --name "${container_name}_${GPU}" \ + "${image_name}" \ + /bin/bash -c "${commands_gpu}" \ + |& while read -r line; do echo ">>Shard $GPU: $line"; done & + PIDS+=($!) + done + #wait for all processes to finish and collect exit codes + for pid in "${PIDS[@]}"; do + wait "${pid}" + STATUS+=($?) + done + for st in "${STATUS[@]}"; do + if [[ ${st} -ne 0 ]]; then + echo "One of the processes failed with $st" + exit "${st}" + fi + done +else + docker run \ + --device /dev/kfd --device /dev/dri \ + --network host \ + --shm-size=16gb \ + --rm \ + -e HIP_VISIBLE_DEVICES=0 \ + -e HF_TOKEN \ + -v "${HF_CACHE}:${HF_MOUNT}" \ + -e "HF_HOME=${HF_MOUNT}" \ + --name "${container_name}" \ + "${image_name}" \ + /bin/bash -c "${commands}" +fi diff --git a/vllm-v0.6.2/.buildkite/run-benchmarks.sh b/vllm-v0.6.2/.buildkite/run-benchmarks.sh new file mode 100644 index 0000000..1641c1f --- /dev/null +++ b/vllm-v0.6.2/.buildkite/run-benchmarks.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# This script is run by buildkite to run the benchmarks and upload the results to buildkite + +set -ex +set -o pipefail + +# cd into parent directory of this file +cd "$(dirname "${BASH_SOURCE[0]}")/.." 
+ +(which wget && which curl) || (apt-get update && apt-get install -y wget curl) + +# run python-based benchmarks and upload the result to buildkite +python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt +bench_latency_exit_code=$? + +python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt +bench_throughput_exit_code=$? + +# run server-based benchmarks and upload the result to buildkite +python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf & +server_pid=$! +wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + +# wait for server to start, timeout after 600 seconds +timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 +python3 benchmarks/benchmark_serving.py \ + --backend vllm \ + --dataset-name sharegpt \ + --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \ + --model meta-llama/Llama-2-7b-chat-hf \ + --num-prompts 20 \ + --endpoint /v1/completions \ + --tokenizer meta-llama/Llama-2-7b-chat-hf \ + --save-result \ + 2>&1 | tee benchmark_serving.txt +bench_serving_exit_code=$? 
+kill $server_pid + +# write the results into a markdown file +echo "### Latency Benchmarks" >> benchmark_results.md +sed -n '1p' benchmark_latency.txt >> benchmark_results.md # first line +echo "" >> benchmark_results.md +sed -n '$p' benchmark_latency.txt >> benchmark_results.md # last line + +echo "### Throughput Benchmarks" >> benchmark_results.md +sed -n '1p' benchmark_throughput.txt >> benchmark_results.md # first line +echo "" >> benchmark_results.md +sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line + +echo "### Serving Benchmarks" >> benchmark_results.md +sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line +echo "" >> benchmark_results.md +echo '```' >> benchmark_results.md +tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines +echo '```' >> benchmark_results.md + +# if the agent binary is not found, skip uploading the results, exit 0 +if [ ! -f /usr/bin/buildkite-agent ]; then + exit 0 +fi + +# upload the results to buildkite +buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md + +# exit with the exit code of the benchmarks +if [ $bench_latency_exit_code -ne 0 ]; then + exit $bench_latency_exit_code +fi + +if [ $bench_throughput_exit_code -ne 0 ]; then + exit $bench_throughput_exit_code +fi + +if [ $bench_serving_exit_code -ne 0 ]; then + exit $bench_serving_exit_code +fi + +rm ShareGPT_V3_unfiltered_cleaned_split.json +buildkite-agent artifact upload "*.json" diff --git a/vllm-v0.6.2/.buildkite/run-cpu-test-ppc64le.sh b/vllm-v0.6.2/.buildkite/run-cpu-test-ppc64le.sh new file mode 100755 index 0000000..5d7a0bf --- /dev/null +++ b/vllm-v0.6.2/.buildkite/run-cpu-test-ppc64le.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +# This script build the CPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. 
+set -ex + +# Try building the docker image +docker build -t cpu-test -f Dockerfile.ppc64le . + +# Setup cleanup +remove_docker_container() { docker rm -f cpu-test || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image, setting --shm-size=4g for tensor parallel. +source /etc/environment +#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test + +function cpu_tests() { + set -e + + # Run basic model test + docker exec cpu-test bash -c " + set -e + pip install pytest pytest-asyncio \ + decord einops librosa peft Pillow sentence-transformers soundfile \ + transformers_stream_generator matplotlib datamodel_code_generator + pip install torchvision --index-url https://download.pytorch.org/whl/cpu + pytest -v -s tests/models/decoder_only/language -m cpu_model + pytest -v -s tests/models/embedding/language -m cpu_model + pytest -v -s tests/models/encoder_decoder/language -m cpu_model + pytest -v -s tests/models/decoder_only/audio_language -m cpu_model + pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" + + # online inference + docker exec cpu-test bash -c " + set -e + python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & + timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 + python3 benchmarks/benchmark_serving.py \ + --backend vllm \ + --dataset-name random \ + --model facebook/opt-125m \ + --num-prompts 20 \ + --endpoint /v1/completions \ + --tokenizer facebook/opt-125m" +} + +# All of CPU tests are expected to be finished less than 25 mins. 
+export -f cpu_tests +timeout 25m bash -c "cpu_tests" diff --git a/vllm-v0.6.2/.buildkite/run-cpu-test.sh b/vllm-v0.6.2/.buildkite/run-cpu-test.sh new file mode 100644 index 0000000..14756b5 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/run-cpu-test.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +# This script build the CPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# allow to bind to different cores +CORE_RANGE=${CORE_RANGE:-48-95} +NUMA_NODE=${NUMA_NODE:-1} + +# Try building the docker image +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . + +# Setup cleanup +remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image, setting --shm-size=4g for tensor parallel. 
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ + --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ + --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 + +function cpu_tests() { + set -e + + # offline inference + docker exec cpu-test-avx2 bash -c " + set -e + python3 examples/offline_inference.py" + + # Run basic model test + docker exec cpu-test bash -c " + set -e + pip install pytest pytest-asyncio \ + decord einops librosa peft Pillow sentence-transformers soundfile \ + transformers_stream_generator matplotlib datamodel_code_generator + pip install torchvision --index-url https://download.pytorch.org/whl/cpu + pytest -v -s tests/models/decoder_only/language -m cpu_model + pytest -v -s tests/models/embedding/language -m cpu_model + pytest -v -s tests/models/encoder_decoder/language -m cpu_model + pytest -v -s tests/models/decoder_only/audio_language -m cpu_model + pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" + + # Run compressed-tensor test + docker exec cpu-test bash -c " + set -e + pytest -s -v \ + tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ + tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" + + # Run AWQ test + docker exec cpu-test bash -c " + set -e + pytest -s -v \ + tests/quantization/test_ipex_quant.py" + + # online inference + docker exec cpu-test bash -c " + set -e + export VLLM_CPU_KVCACHE_SPACE=10 + export VLLM_CPU_OMP_THREADS_BIND=$1 + python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & + timeout 600 bash -c 'until curl 
localhost:8000/v1/models; do sleep 1; done' || exit 1 + python3 benchmarks/benchmark_serving.py \ + --backend vllm \ + --dataset-name random \ + --model facebook/opt-125m \ + --num-prompts 20 \ + --endpoint /v1/completions \ + --tokenizer facebook/opt-125m" +} + +# All of CPU tests are expected to be finished less than 25 mins. +export -f cpu_tests +timeout 25m bash -c "cpu_tests $CORE_RANGE" diff --git a/vllm-v0.6.2/.buildkite/run-hpu-test.sh b/vllm-v0.6.2/.buildkite/run-hpu-test.sh new file mode 100644 index 0000000..4505dc7 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/run-hpu-test.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# This script build the CPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# Try building the docker image +docker build -t hpu-test-env -f Dockerfile.hpu . + +# Setup cleanup +remove_docker_container() { docker rm -f hpu-test || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image and launch offline inference +docker run --runtime=habana --name=hpu-test --network=host -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py \ No newline at end of file diff --git a/vllm-v0.6.2/.buildkite/run-multi-node-test.sh b/vllm-v0.6.2/.buildkite/run-multi-node-test.sh new file mode 100755 index 0000000..530bf90 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/run-multi-node-test.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +set -euox pipefail + +if [[ $# -lt 4 ]]; then + echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN" + exit 1 +fi + +WORKING_DIR=$1 +NUM_NODES=$2 +NUM_GPUS=$3 +DOCKER_IMAGE=$4 + +shift 4 +COMMANDS=("$@") +if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then + echo "The number of commands must be equal to the number of nodes." 
+ echo "Number of nodes: $NUM_NODES" + echo "Number of commands: ${#COMMANDS[@]}" + exit 1 +fi + +echo "List of commands" +for command in "${COMMANDS[@]}"; do + echo "$command" +done + +start_network() { + docker network create --subnet=192.168.10.0/24 docker-net +} + +start_nodes() { + for node in $(seq 0 $(($NUM_NODES-1))); do + GPU_DEVICES='"device=' + for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do + DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) + GPU_DEVICES+=$(($DEVICE_NUM)) + if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then + GPU_DEVICES+=',' + fi + done + GPU_DEVICES+='"' + + # start the container in detached mode + # things to note: + # 1. --shm-size=10.24gb is required. don't use --ipc=host + # 2. pass HF_TOKEN to the container + # 3. map the huggingface cache directory to the container + # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes: + # starting from 192.168.10.11) + docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \ + -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \ + --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \ + /bin/bash -c "tail -f /dev/null" + + # organize containers into a ray cluster + if [ "$node" -eq 0 ]; then + # start the ray head node + docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block" + # wait for the head node to be ready + sleep 10 + else + # start the ray worker nodes, and connect them to the head node + docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block" + fi + done + + # wait for the cluster to be ready + sleep 10 + + # print the cluster status + docker exec node0 /bin/bash -c "ray status" +} + +run_nodes() { + # important: iterate in reverse order to start the head node last + # we start the worker nodes first, in detached mode, and then start the head node + # in the foreground, so that the output of the head node is visible in the buildkite logs + for node in 
$(seq $(($NUM_NODES - 1)) -1 0); do + GPU_DEVICES='"device=' + for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do + DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) + GPU_DEVICES+=$(($DEVICE_NUM)) + if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then + GPU_DEVICES+=',' + fi + done + GPU_DEVICES+='"' + echo "Running node$node with GPU devices: $GPU_DEVICES" + if [ "$node" -ne 0 ]; then + docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" + else + docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" + fi + done +} +cleanup() { + for node in $(seq 0 $(($NUM_NODES-1))); do + docker stop "node$node" + done + docker network rm docker-net +} +trap cleanup EXIT +start_network +start_nodes +run_nodes + diff --git a/vllm-v0.6.2/.buildkite/run-neuron-test.sh b/vllm-v0.6.2/.buildkite/run-neuron-test.sh new file mode 100644 index 0000000..9259391 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/run-neuron-test.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# This script build the Neuron docker image and run the API server inside the container. +# It serves a sanity check for compilation and basic model usage. +set -e + +# Try building the docker image +aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com + +# prune old image and containers to save disk space, and only once a day +# by using a timestamp file in tmp. +if [ -f /tmp/neuron-docker-build-timestamp ]; then + last_build=$(cat /tmp/neuron-docker-build-timestamp) + current_time=$(date +%s) + if [ $((current_time - last_build)) -gt 86400 ]; then + docker system prune -f + echo "$current_time" > /tmp/neuron-docker-build-timestamp + fi +else + date "+%s" > /tmp/neuron-docker-build-timestamp +fi + +docker build -t neuron -f Dockerfile.neuron . 
+ +# Setup cleanup +remove_docker_container() { docker rm -f neuron || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image +docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \ + --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 & + +# Wait for the server to start +wait_for_server_to_start() { + timeout=300 + counter=0 + + while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do + sleep 1 + counter=$((counter + 1)) + if [ $counter -ge $timeout ]; then + echo "Timeout after $timeout seconds" + break + fi + done +} +wait_for_server_to_start + +# Test a simple prompt +curl -X POST -H "Content-Type: application/json" \ + localhost:8000/generate \ + -d '{"prompt": "San Francisco is a"}' diff --git a/vllm-v0.6.2/.buildkite/run-openvino-test.sh b/vllm-v0.6.2/.buildkite/run-openvino-test.sh new file mode 100755 index 0000000..6b12f42 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/run-openvino-test.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# This script build the OpenVINO docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# Try building the docker image +docker build -t openvino-test -f Dockerfile.openvino . 
+ +# Setup cleanup +remove_docker_container() { docker rm -f openvino-test || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image and launch offline inference +docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py diff --git a/vllm-v0.6.2/.buildkite/run-tpu-test.sh b/vllm-v0.6.2/.buildkite/run-tpu-test.sh new file mode 100644 index 0000000..770dad6 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/run-tpu-test.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -e + +# Build the docker image. +docker build -f Dockerfile.tpu -t vllm-tpu . + +# Set up cleanup. +remove_docker_container() { docker rm -f tpu-test || true; } +trap remove_docker_container EXIT +# Remove the container that might not be cleaned up in the previous run. +remove_docker_container + +# For HF_TOKEN. +source /etc/environment +# Run a simple end-to-end example. +docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" diff --git a/vllm-v0.6.2/.buildkite/run-xpu-test.sh b/vllm-v0.6.2/.buildkite/run-xpu-test.sh new file mode 100644 index 0000000..faeac8e --- /dev/null +++ b/vllm-v0.6.2/.buildkite/run-xpu-test.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# This script build the CPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# Try building the docker image +docker build -t xpu-test -f Dockerfile.xpu . 
+ +# Setup cleanup +remove_docker_container() { docker rm -f xpu-test || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image and launch offline inference +docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py diff --git a/vllm-v0.6.2/.buildkite/test-pipeline.yaml b/vllm-v0.6.2/.buildkite/test-pipeline.yaml new file mode 100644 index 0000000..24bf223 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/test-pipeline.yaml @@ -0,0 +1,536 @@ +# In this file, you can add more tests to run either by adding a new step or +# adding a new command to an existing step. See different options here for examples. + +# This script will be feed into Jinja template in `test-template-aws.j2` at +# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 +# to generate the final pipeline yaml file. + +# Documentation +# label(str): the name of the test. emoji allowed. +# fast_check(bool): whether to run this on each commit on fastcheck pipeline. +# fast_check_only(bool): run this test on fastcheck pipeline only +# nightly(bool): run this test in nightly pipeline only +# optional(bool): never run this test by default (i.e. need to unblock manually) +# command(str): the single command to run for tests. incompatible with commands. +# commands(list): the list of commands to run for test. incompatbile with command. +# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd] +# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100 +# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4. +# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host, +# in this case, commands must be specified. 
the first command runs on first host, the second +# command runs on the second host. +# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests +# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run. + +# When adding a test +# - If the test belong to an existing group, add it there +# - If the test is short, add to any existing step +# - If the test takes more than 10min, then it is okay to create a new step. +# Note that all steps execute in parallel. + +steps: +##### fast check tests ##### + +- label: Documentation Build # 2min + working_dir: "/vllm-workspace/test_docs/docs" + fast_check: true + no_gpu: True + commands: + - pip install -r requirements-docs.txt + - SPHINXOPTS=\"-W\" make html + # Check API reference (if it fails, you may have missing mock imports) + - grep \"sig sig-object py\" build/html/dev/sampling_params.html + +- label: Async Engine, Inputs, Utils, Worker Test # 24min + fast_check: true + source_file_dependencies: + - vllm/ + - tests/mq_llm_engine + - tests/async_engine + - tests/test_inputs + - tests/multimodal + - tests/test_utils + - tests/worker + commands: + - pytest -v -s mq_llm_engine # MQLLMEngine + - pytest -v -s async_engine # AsyncLLMEngine + - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py + - pytest -v -s test_inputs.py + - pytest -v -s multimodal + - pytest -v -s test_utils.py # Utils + - pytest -v -s worker # Worker + +- label: Basic Correctness Test # 30min + #mirror_hardwares: [amd] + fast_check: true + source_file_dependencies: + - vllm/ + - tests/basic_correctness/test_basic_correctness + - tests/basic_correctness/test_cpu_offload + - tests/basic_correctness/test_preemption + commands: + - pytest -v -s basic_correctness/test_basic_correctness.py + - pytest -v -s basic_correctness/test_cpu_offload.py + - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py + +- 
label: Chunked Prefill Test + source_file_dependencies: + - vllm/ + - tests/basic_correctness/test_chunked_prefill + commands: + - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py + - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py + +- label: Core Test # 10min + mirror_hardwares: [amd] + fast_check: true + source_file_dependencies: + - vllm/core + - vllm/distributed + - tests/core + commands: + - pytest -v -s core + +- label: Entrypoints Test # 40min + working_dir: "/vllm-workspace/tests" + fast_check: true + mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + commands: + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py + - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process + - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process + - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process + - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py + - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process + - pytest -v -s entrypoints/test_chat_utils.py + - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + +- label: Distributed Tests (4 GPUs) # 10min + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + fast_check: true + source_file_dependencies: + - vllm/distributed/ + - vllm/core/ + - tests/distributed + - tests/spec_decode/e2e/test_integration_dist_tp4 + - tests/compile + commands: + - pytest -v -s distributed/test_utils.py + - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s 
distributed/test_pynccl.py + - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py + +- label: Metrics, Tracing Test # 10min + num_gpus: 2 + fast_check: true + source_file_dependencies: + - vllm/ + - tests/metrics + - tests/tracing + commands: + - pytest -v -s metrics + - "pip install \ + 'opentelemetry-sdk>=1.26.0,<1.27.0' \ + 'opentelemetry-api>=1.26.0,<1.27.0' \ + 'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \ + 'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'" + - pytest -v -s tracing + +##### fast check tests ##### +##### 1 GPU test ##### + +- label: Regression Test # 5min + mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/test_regression + commands: + - pip install modelscope + - pytest -v -s test_regression.py + working_dir: "/vllm-workspace/tests" # optional + +- label: Engine Test # 10min + mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/engine + - tests/tokenization + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py + # OOM in the CI unless we run this separately + - pytest -v -s tokenization + +- label: V1 Test + #mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1 + +- label: Examples Test # 15min + working_dir: "/vllm-workspace/examples" + #mirror_hardwares: [amd] + source_file_dependencies: + - vllm/entrypoints + - examples/ + commands: + - pip install awscli tensorizer # for llava example and tensorizer test + - python3 offline_inference.py + - python3 cpu_offload.py + - python3 offline_inference_chat.py + - python3 offline_inference_with_prefix.py + - python3 llm_engine_example.py + - python3 offline_inference_vision_language.py + - python3 offline_inference_vision_language_multi_image.py + - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors 
/tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference_encoder_decoder.py + - python3 offline_profile.py --model facebook/opt-125m + +- label: Prefix Caching Test # 9min + #mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/prefix_caching + commands: + - pytest -v -s prefix_caching + +- label: Samplers Test # 36min + source_file_dependencies: + - vllm/model_executor/layers + - vllm/sampling_metadata.py + - tests/samplers + commands: + - pytest -v -s samplers + - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers + +- label: LogitsProcessor Test # 5min + mirror_hardwares: [amd] + source_file_dependencies: + - vllm/model_executor/layers + - tests/test_logits_processor + command: pytest -v -s test_logits_processor.py + +- label: Speculative decoding tests # 30min + source_file_dependencies: + - vllm/spec_decode + - tests/spec_decode + commands: + - pytest -v -s spec_decode/e2e/test_multistep_correctness.py + - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py + +- label: LoRA Test %N # 15min each + mirror_hardwares: [amd] + source_file_dependencies: + - vllm/lora + - tests/lora + command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py + parallelism: 4 + +- label: "PyTorch Fullgraph Smoke Test" # 9min + fast_check: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_basic_correctness.py + # these tests need to be separated, cannot combine + - pytest -v -s compile/piecewise/test_simple.py + - pytest -v -s compile/piecewise/test_toy_llama.py + +- label: "PyTorch Fullgraph Test" # 18min + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_full_graph.py + +- label: Kernels Test %N # 1h each + mirror_hardwares: [amd] + source_file_dependencies: + - csrc/ + - vllm/attention + - tests/kernels 
+ commands: + - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 4 + +- label: Tensorizer Test # 11min + mirror_hardwares: [amd] + soft_fail: true + source_file_dependencies: + - vllm/model_executor/model_loader + - tests/tensorizer_loader + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s tensorizer_loader + +- label: Benchmarks # 9min + working_dir: "/vllm-workspace/.buildkite" + mirror_hardwares: [amd] + source_file_dependencies: + - benchmarks/ + commands: + - bash run-benchmarks.sh + +- label: Quantization Test # 33min + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - tests/quantization + command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization + +- label: LM Eval Small Models # 53min + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - bash ./run-tests.sh -c configs/models-small.txt -t 1 + +- label: Encoder Decoder tests # 5min + source_file_dependencies: + - vllm/ + - tests/encoder_decoder + commands: + - pytest -v -s encoder_decoder + +- label: OpenAI-Compatible Tool Use # 20 min + fast_check: false + mirror_hardwares: [ amd ] + source_file_dependencies: + - vllm/ + - tests/tool_use + commands: + - pytest -v -s tool_use + +##### models test ##### + +- label: Basic Models Test # 30min + source_file_dependencies: + - vllm/ + - tests/models + commands: + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s models/test_oot_registration.py # it needs a clean process + - pytest -v -s models/test_registry.py + - pytest -v -s models/test_initialization.py + +- label: Language Models Test (Standard) # 42min + #mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/language + - 
tests/models/embedding/language + - tests/models/encoder_decoder/language + commands: + - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' + - pytest -v -s models/embedding/language -m core_model + - pytest -v -s models/embedding/vision_language -m core_model + +- label: Language Models Test (Extended) # 50min + nightly: true + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/language + - tests/models/embedding/language + - tests/models/encoder_decoder/language + commands: + - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' + - pytest -v -s models/embedding/language -m 'not core_model' + - pytest -v -s models/embedding/vision_language -m 'not core_model' + +- label: Multi-Modal Models Test (Standard) # 26min + #mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/audio_language + - tests/models/decoder_only/vision_language + - tests/models/embedding/vision_language + - tests/models/encoder_decoder/vision_language + commands: + - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' + - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model' + - pytest -v -s models/encoder_decoder/language -m core_model + - pytest -v -s models/encoder_decoder/vision_language -m core_model + +- label: Multi-Modal Models Test (Extended) # 1h15m + nightly: true + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/audio_language + - tests/models/decoder_only/vision_language + - tests/models/embedding/vision_language + - tests/models/encoder_decoder/vision_language + commands: + - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' + # HACK - run phi3v tests separately to sidestep this transformers bug + # https://github.com/huggingface/transformers/issues/34307 + - pytest -v -s 
models/decoder_only/vision_language/test_phi3v.py + - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' + - pytest -v -s models/encoder_decoder/language -m 'not core_model' + - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model' + +# This test is used only in PR development phase to test individual models and should never run on main +- label: Custom Models Test + optional: true + commands: + - echo 'Testing custom models...' + # PR authors can temporarily add commands below to test individual models + # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py + # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* + +##### 1 GPU test ##### +##### multi gpus test ##### + +- label: Distributed Comm Ops Test # 7min + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/distributed + - tests/distributed + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py + +- label: 2 Node Tests (4 GPUs in total) # 16min + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + num_nodes: 2 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + commands: + - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed' + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py + - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 
--nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed' + +- label: Distributed Tests (2 GPUs) # 40min + #mirror_hardwares: [amd] + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + - vllm/compilation + commands: + - pytest -v -s ./compile/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed' + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus + # Avoid importing model tests that cause CUDA reinitialization error + - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus + - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus + - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus + - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s distributed/test_distributed_oot.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py + +- label: Multi-step Tests (4 GPUs) # 36min + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/model_executor/layers/sampler.py + - vllm/sequence.py + - vllm/worker/worker_base.py + - vllm/worker/worker.py + - vllm/worker/multi_step_worker.py + - vllm/worker/model_runner_base.py + - vllm/worker/model_runner.py + - vllm/worker/multi_step_model_runner.py + - vllm/engine + - tests/multi_step + commands: + - pytest -v -s multi_step/test_correctness_async_llm.py + - pytest -v -s multi_step/test_correctness_llm.py + +- label: Pipeline Parallelism Test # 45min + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - 
vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + commands: + - pytest -v -s distributed/test_pp_cudagraph.py + - pytest -v -s distributed/test_pipeline_parallel.py + +- label: LoRA Long Context (Distributed) # 11min + # This test runs llama 13B, so it is required to run on 4 GPUs. + num_gpus: 4 + soft_fail: true + source_file_dependencies: + - vllm/lora + - tests/lora/test_long_context + commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s -x lora/test_long_context.py + +- label: Weight Loading Multiple GPU Test # 33min + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + +- label: Weight Loading Multiple GPU Test - Large Models # optional + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + gpu: a100 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt + + +##### multi gpus test ##### +##### A100 test ##### + +- label: Distributed Tests (A100) # optional + gpu: a100 + num_gpus: 4 + source_file_dependencies: + - vllm/ + commands: + # NOTE: don't test llama model here, it seems hf implementation is buggy + # see https://github.com/vllm-project/vllm/pull/5689 for details + - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus + - pytest -v -s -x lora/test_mixtral.py + +- label: LM Eval Large Models # optional + gpu: a100 + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - 
csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - bash ./run-tests.sh -c configs/models-large.txt -t 4 diff --git a/vllm-v0.6.2/.buildkite/upload-wheels.sh b/vllm-v0.6.2/.buildkite/upload-wheels.sh new file mode 100644 index 0000000..541b395 --- /dev/null +++ b/vllm-v0.6.2/.buildkite/upload-wheels.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +# Assume wheels are in artifacts/dist/*.whl +wheel_files=(artifacts/dist/*.whl) + +# Check that exactly one wheel is found +if [[ ${#wheel_files[@]} -ne 1 ]]; then + echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}" + exit 1 +fi + +# Get the single wheel file +wheel="${wheel_files[0]}" + +# Rename 'linux' to 'manylinux1' in the wheel filename +new_wheel="${wheel/linux/manylinux1}" +mv -- "$wheel" "$new_wheel" +wheel="$new_wheel" + +# Extract the version from the wheel +version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) +echo "Version: $version" + +# If the version contains "dev", rename it to v1.0.0.dev for consistency +if [[ $version == *dev* ]]; then + new_version="1.0.0.dev" + new_wheel="${wheel/$version/$new_version}" + mv -- "$wheel" "$new_wheel" + wheel="$new_wheel" + version="$new_version" +fi + +# Upload the wheel to S3 +aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" +aws s3 cp "$wheel" "s3://vllm-wheels/nightly/" +aws s3 cp "$wheel" "s3://vllm-wheels/$version/" \ No newline at end of file diff --git a/vllm-v0.6.2/.clang-format b/vllm-v0.6.2/.clang-format new file mode 100644 index 0000000..7f9e6d7 --- /dev/null +++ b/vllm-v0.6.2/.clang-format @@ -0,0 +1,26 @@ +BasedOnStyle: Google +UseTab: Never +IndentWidth: 2 +ColumnLimit: 80 + +# Force pointers to the type for C++. 
+DerivePointerAlignment: false +PointerAlignment: Left + +# Reordering #include statements can (and currently will) introduce errors +SortIncludes: false + +# Style choices +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +IndentPPDirectives: BeforeHash + +IncludeCategories: + - Regex: '^<' + Priority: 4 + - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/' + Priority: 3 + - Regex: '^"(qoda|\.\.)/' + Priority: 2 + - Regex: '.*' + Priority: 1 diff --git a/vllm-v0.6.2/.dockerignore b/vllm-v0.6.2/.dockerignore new file mode 100644 index 0000000..3863656 --- /dev/null +++ b/vllm-v0.6.2/.dockerignore @@ -0,0 +1,33 @@ +/.venv +/build +dist +vllm/*.so + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +.mypy_cache + +# Distribution / packaging +.Python +/build/ +cmake-build-*/ +CMakeUserPresets.json +develop-eggs/ +/dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST diff --git a/vllm-v0.6.2/.github/CODEOWNERS b/vllm-v0.6.2/.github/CODEOWNERS new file mode 100644 index 0000000..cd72197 --- /dev/null +++ b/vllm-v0.6.2/.github/CODEOWNERS @@ -0,0 +1,30 @@ +# See https://help.github.com/articles/about-codeowners/ +# for more info about CODEOWNERS file + +# This list covers the "core" components of vLLM that require careful review +/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +CMakeLists.txt @tlrmchlsmth @WoosukKwon + +# Test ownership +/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo +/tests/test_inputs.py @DarkLight1337 @ywang96 +/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo +/tests/models @DarkLight1337 @ywang96 +/tests/multimodal @DarkLight1337 @ywang96 +/tests/prefix_caching @comaniac @KuntaiDu +/tests/spec_decode @njhill @LiuXiaoxuanPKU +/tests/kernels @tlrmchlsmth @WoosukKwon +/tests/quantization @mgoin @robertgshaw2-neuralmagic +/.buildkite/lm-eval-harness @mgoin @simon-mo +/tests/distributed/test_multi_node_assignment.py @youkaichao +/tests/distributed/test_pipeline_parallel.py @youkaichao +/tests/distributed/test_same_node.py @youkaichao +/tests/multi_step @alexm-neuralmagic @comaniac +/tests/weight_loading @mgoin @youkaichao +/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac diff --git a/vllm-v0.6.2/.github/FUNDING.yml b/vllm-v0.6.2/.github/FUNDING.yml new file mode 100644 index 0000000..71f4e52 --- /dev/null +++ b/vllm-v0.6.2/.github/FUNDING.yml @@ -0,0 +1,2 @@ +github: [vllm-project] +open_collective: [vllm] diff --git a/vllm-v0.6.2/.github/ISSUE_TEMPLATE/100-documentation.yml b/vllm-v0.6.2/.github/ISSUE_TEMPLATE/100-documentation.yml new file mode 100644 index 0000000..74d397b --- /dev/null +++ b/vllm-v0.6.2/.github/ISSUE_TEMPLATE/100-documentation.yml @@ -0,0 +1,29 @@ +name: 📚 Documentation +description: Report an issue related to https://docs.vllm.ai/ +title: "[Doc]: " +labels: ["documentation"] + +body: +- type: textarea + attributes: + label: 📚 The doc issue + description: > + A clear and concise description of what content in https://docs.vllm.ai/ is an issue. + validations: + required: true +- type: textarea + attributes: + label: Suggest a potential alternative/fix + description: > + Tell us how we could improve the documentation in this regard. 
+- type: markdown + attributes: + value: > + Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. + required: true diff --git a/vllm-v0.6.2/.github/ISSUE_TEMPLATE/200-installation.yml b/vllm-v0.6.2/.github/ISSUE_TEMPLATE/200-installation.yml new file mode 100644 index 0000000..590e56c --- /dev/null +++ b/vllm-v0.6.2/.github/ISSUE_TEMPLATE/200-installation.yml @@ -0,0 +1,47 @@ +name: 🛠️ Installation +description: Report an issue here when you hit errors during installation. +title: "[Installation]: " +labels: ["installation"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Your current environment + description: | + Please run the following and paste the output below. + ```sh + wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + # For security purposes, please feel free to check the contents of collect_env.py before running it. + python collect_env.py + ``` + It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues. + value: | + ```text + The output of `python collect_env.py` + ``` + validations: + required: true +- type: textarea + attributes: + label: How you are installing vllm + description: | + Paste the full command you are trying to execute. 
+ value: | + ```sh + pip install -vvv vllm + ``` +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. + required: true diff --git a/vllm-v0.6.2/.github/ISSUE_TEMPLATE/300-usage.yml b/vllm-v0.6.2/.github/ISSUE_TEMPLATE/300-usage.yml new file mode 100644 index 0000000..004798a --- /dev/null +++ b/vllm-v0.6.2/.github/ISSUE_TEMPLATE/300-usage.yml @@ -0,0 +1,45 @@ +name: 💻 Usage +description: Raise an issue here if you don't know how to use vllm. +title: "[Usage]: " +labels: ["usage"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Your current environment + description: | + Please run the following and paste the output below. + ```sh + wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + # For security purposes, please feel free to check the contents of collect_env.py before running it. + python collect_env.py + ``` + It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues. + value: | + ```text + The output of `python collect_env.py` + ``` + validations: + required: true +- type: textarea + attributes: + label: How would you like to use vllm + description: | + A detailed description of how you want to use vllm. + value: | + I want to run inference of a [specific model](put link here). 
I don't know how to integrate it with vllm. +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. + required: true diff --git a/vllm-v0.6.2/.github/ISSUE_TEMPLATE/400-bug report.yml b/vllm-v0.6.2/.github/ISSUE_TEMPLATE/400-bug report.yml new file mode 100644 index 0000000..30db172 --- /dev/null +++ b/vllm-v0.6.2/.github/ISSUE_TEMPLATE/400-bug report.yml @@ -0,0 +1,107 @@ +name: 🐛 Bug report +description: Raise an issue here if you find a bug. +title: "[Bug]: " +labels: ["bug"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Your current environment + description: | + Please run the following and paste the output below. + ```sh + wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + # For security purposes, please feel free to check the contents of collect_env.py before running it. + python collect_env.py + ``` + It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues. + value: | +
+ The output of `python collect_env.py` + + ```text + Your output of `python collect_env.py` here + ``` + +
+ validations: + required: true +- type: textarea + attributes: + label: Model Input Dumps + description: | + If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process. + placeholder: | + Upload the dumped input file. + validations: + required: false +- type: textarea + attributes: + label: 🐛 Describe the bug + description: | + Please provide a clear and concise description of what the bug is. + + If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example: + + ```python + from vllm import LLM, SamplingParams + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + llm = LLM(model="facebook/opt-125m") + + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + ``` + + If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com. + + Please also paste or describe the results you observe instead of the expected results. 
If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````. + + Please set the environment variable `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging to help debugging potential issues. + + If you experienced crashes or hangs, it would be helpful to run vllm with `export VLLM_TRACE_FUNCTION=1` . All the function calls in vllm will be recorded. Inspect these log files, and tell which function crashes or hangs. + placeholder: | + A clear and concise description of what the bug is. + + ```python + # Sample code to reproduce the problem + ``` + + ``` + The error message you got, with the full traceback. + ``` + validations: + required: true +- type: markdown + attributes: + value: > + ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output: + + - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc). + + - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect. + + Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. 
+ required: true diff --git a/vllm-v0.6.2/.github/ISSUE_TEMPLATE/500-feature request.yml b/vllm-v0.6.2/.github/ISSUE_TEMPLATE/500-feature request.yml new file mode 100644 index 0000000..097d88f --- /dev/null +++ b/vllm-v0.6.2/.github/ISSUE_TEMPLATE/500-feature request.yml @@ -0,0 +1,38 @@ +name: 🚀 Feature request +description: Submit a proposal/request for a new vllm feature +title: "[Feature]: " +labels: ["feature request"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: 🚀 The feature, motivation and pitch + description: > + A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. + validations: + required: true +- type: textarea + attributes: + label: Alternatives + description: > + A description of any alternative solutions or features you've considered, if any. +- type: textarea + attributes: + label: Additional context + description: > + Add any other context or screenshots about the feature request. +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. 
+ required: true diff --git a/vllm-v0.6.2/.github/ISSUE_TEMPLATE/600-new model.yml b/vllm-v0.6.2/.github/ISSUE_TEMPLATE/600-new model.yml new file mode 100644 index 0000000..794617a --- /dev/null +++ b/vllm-v0.6.2/.github/ISSUE_TEMPLATE/600-new model.yml @@ -0,0 +1,40 @@ +name: 🤗 Support request for a new model from huggingface +description: Submit a proposal/request for a new model from huggingface +title: "[New Model]: " +labels: ["new model"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). + + #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model. +- type: textarea + attributes: + label: The model to consider. + description: > + A huggingface url, pointing to the model, e.g. https://huggingface.co/openai-community/gpt2 . + validations: + required: true +- type: textarea + attributes: + label: The closest model vllm already supports. + description: > + Here is the list of models already supported by vllm: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models . Which model is the most similar to the model you want to add support for? +- type: textarea + attributes: + label: What's your difficulty of supporting the model you want? + description: > + For example, any new operators or new architecture? +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. 
+ required: true diff --git a/vllm-v0.6.2/.github/ISSUE_TEMPLATE/700-performance discussion.yml b/vllm-v0.6.2/.github/ISSUE_TEMPLATE/700-performance discussion.yml new file mode 100644 index 0000000..273f50d --- /dev/null +++ b/vllm-v0.6.2/.github/ISSUE_TEMPLATE/700-performance discussion.yml @@ -0,0 +1,59 @@ +name: ⚡ Discussion on the performance of vllm +description: Submit a proposal/discussion about the performance of vllm +title: "[Performance]: " +labels: ["performance"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Proposal to improve performance + description: > + How do you plan to improve vllm's performance? + validations: + required: false +- type: textarea + attributes: + label: Report of performance regression + description: > + Please provide detailed description of performance comparison to confirm the regression. You may want to run the benchmark script at https://github.com/vllm-project/vllm/tree/main/benchmarks . + validations: + required: false +- type: textarea + attributes: + label: Misc discussion on performance + description: > + Anything about the performance. + validations: + required: false +- type: textarea + attributes: + label: Your current environment (if you think it is necessary) + description: | + Please run the following and paste the output below. + ```sh + wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + # For security purposes, please feel free to check the contents of collect_env.py before running it. + python collect_env.py + ``` + It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues. 
+ value: | + ```text + The output of `python collect_env.py` + ``` + validations: + required: false +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. + required: true diff --git a/vllm-v0.6.2/.github/ISSUE_TEMPLATE/750-RFC.yml b/vllm-v0.6.2/.github/ISSUE_TEMPLATE/750-RFC.yml new file mode 100644 index 0000000..e447c07 --- /dev/null +++ b/vllm-v0.6.2/.github/ISSUE_TEMPLATE/750-RFC.yml @@ -0,0 +1,56 @@ +name: 💬 Request for comments (RFC). +description: Ask for feedback on major architectural changes or design choices. +title: "[RFC]: " +labels: ["RFC"] + +body: +- type: markdown + attributes: + value: > + #### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference. +- type: textarea + attributes: + label: Motivation. + description: > + The motivation of the RFC. + validations: + required: true +- type: textarea + attributes: + label: Proposed Change. + description: > + The proposed change of the RFC. + validations: + required: true +- type: textarea + attributes: + label: Feedback Period. + description: > + The feedback period of the RFC. Usually at least one week. + validations: + required: false +- type: textarea + attributes: + label: CC List. + description: > + The list of people you want to CC. + validations: + required: false +- type: textarea + attributes: + label: Any Other Things. + description: > + Any other things you would like to mention. + validations: + required: false +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... 
+ options: + - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. + required: true diff --git a/vllm-v0.6.2/.github/ISSUE_TEMPLATE/800-misc discussion.yml b/vllm-v0.6.2/.github/ISSUE_TEMPLATE/800-misc discussion.yml new file mode 100644 index 0000000..79e6e90 --- /dev/null +++ b/vllm-v0.6.2/.github/ISSUE_TEMPLATE/800-misc discussion.yml @@ -0,0 +1,28 @@ +name: 🎲 Misc/random discussions that do not fit into the above categories. +description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues. +title: "[Misc]: " +labels: ["misc"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Anything you want to discuss about vllm. + description: > + Anything you want to discuss about vllm. + validations: + required: true +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. 
+ required: true diff --git a/vllm-v0.6.2/.github/ISSUE_TEMPLATE/config.yml b/vllm-v0.6.2/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..3ba13e0 --- /dev/null +++ b/vllm-v0.6.2/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: false diff --git a/vllm-v0.6.2/.github/PULL_REQUEST_TEMPLATE.md b/vllm-v0.6.2/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..be0afc6 --- /dev/null +++ b/vllm-v0.6.2/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,74 @@ +FILL IN THE PR DESCRIPTION HERE + +FIX #xxxx (*link existing issues this PR will resolve*) + +**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE** + +--- + +
+ + PR Checklist (Click to Expand) + +

Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.

+ +

PR Title and Classification

+

Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:

+
    +
  • [Bugfix] for bug fixes.
  • +
  • [CI/Build] for build or continuous integration improvements.
  • +
  • [Doc] for documentation fixes and improvements.
  • +
  • [Model] for adding a new model or improving an existing model. Model name should appear in the title.
  • +
  • [Frontend] For changes on the vLLM frontend (e.g., OpenAI API server, LLM class, etc.)
  • +
  • [Kernel] for changes affecting CUDA kernels or other compute kernels.
  • +
  • [Core] for changes in the core vLLM logic (e.g., LLMEngine, AsyncLLMEngine, Scheduler, etc.)
  • +
  • [Hardware][Vendor] for hardware-specific changes. Vendor name should appear in the prefix (e.g., [Hardware][AMD]).
  • +
  • [Misc] for PRs that do not fit the above categories. Please use this sparingly.
  • +
+

Note: If the PR spans more than one category, please include all relevant prefixes.

+ +

Code Quality

+ +

The PR needs to meet the following code quality standards:

+ +
    +
  • We adhere to Google Python style guide and Google C++ style guide.
  • +
  • Pass all linter checks. Please use format.sh to format your code.
  • +
  • The code needs to be well-documented to ensure future contributors can easily understand the code.
  • +
  • Include sufficient tests to ensure the project stays correct and robust. This includes both unit tests and integration tests.
  • +
  • Please add documentation to docs/source/ if the PR modifies the user-facing behaviors of vLLM. It helps vLLM users understand and utilize the new features or changes.
  • +
+ +

Adding or changing kernels

+

Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.

+
    +
  • Make sure custom ops are registered following PyTorch guidelines: Custom C++ and CUDA Operators and The Custom Operators Manual
  • +
  • Custom operations that return Tensors require meta-functions. Meta-functions should be implemented and registered in python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions.
  • +
  • Use torch.library.opcheck() to test the function registration and meta-function for any registered ops. See tests/kernels for examples.
  • +
  • When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.
  • +
  • If a new custom type is needed, see the following document: Custom Class Support in PT2. +
+ +

Notes for Large Changes

+

Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with rfc-required and might not review the PR.

+ +

What to Expect for the Reviews

+ +

The goal of the vLLM team is to be a transparent reviewing machine. We would like to make the review process transparent and efficient and make sure no contributor feels confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process:

+ +
    +
  • After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.
  • +
  • After the PR is assigned, the reviewer will provide a status update every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.
  • +
  • After the review, the reviewer will put an action-required label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.
  • +
  • Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion. +
  • +
+ +

Thank You

+ +

Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone!

+ + +
+ + diff --git a/vllm-v0.6.2/.github/dependabot.yml b/vllm-v0.6.2/.github/dependabot.yml new file mode 100644 index 0000000..4f54eea --- /dev/null +++ b/vllm-v0.6.2/.github/dependabot.yml @@ -0,0 +1,32 @@ +version: 2 +updates: + # Maintain dependencies for GitHub Actions + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + labels: ["dependencies"] + open-pull-requests-limit: 5 + reviewers: ["khluu", "simon-mo"] + allow: + - dependency-type: "all" + ignore: + - dependency-name: "torch" + - dependency-name: "torchvision" + - dependency-name: "xformers" + - dependency-name: "lm-format-enforcer" + - dependency-name: "gguf" + - dependency-name: "compressed-tensors" + - dependency-name: "ray[adag]" + - dependency-name: "lm-eval" + groups: + patch-update: + applies-to: version-updates + update-types: ["patch"] + minor-update: + applies-to: version-updates + update-types: ["minor"] diff --git a/vllm-v0.6.2/.github/mergify.yml b/vllm-v0.6.2/.github/mergify.yml new file mode 100644 index 0000000..ca4bd7e --- /dev/null +++ b/vllm-v0.6.2/.github/mergify.yml @@ -0,0 +1,60 @@ +pull_request_rules: +- name: label-documentation + description: Automatically apply documentation label + conditions: + - or: + - files~=^[^/]+\.md$ + - files~=^docs/ + actions: + label: + add: + - documentation + +- name: label-ci-build + description: Automatically apply ci/build label + conditions: + - or: + - files~=^\.github/ + - files~=\.buildkite/ + - files~=^cmake/ + - files=CMakeLists.txt + - files~=^Dockerfile + - files~=^requirements.*\.txt + - files=setup.py + actions: + label: + add: + - ci/build + +- name: label-frontend + description: Automatically apply frontend label + conditions: + - files~=^vllm/entrypoints/ + actions: + label: + add: + - frontend + +- name: ping author on conflicts and add 'needs-rebase' label + conditions: + - conflict + - -closed + actions: + label: + 
add: + - needs-rebase + comment: + message: | + This pull request has merge conflicts that must be resolved before it can be + merged. Please rebase the PR, @{{author}}. + + https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork + +- name: remove 'needs-rebase' label when conflict is resolved + conditions: + - -conflict + - -closed + actions: + label: + remove: + - needs-rebase diff --git a/vllm-v0.6.2/.github/scripts/cleanup_pr_body.sh b/vllm-v0.6.2/.github/scripts/cleanup_pr_body.sh new file mode 100755 index 0000000..3b2da7b --- /dev/null +++ b/vllm-v0.6.2/.github/scripts/cleanup_pr_body.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +set -eu + +# ensure 1 argument is passed +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +PR_NUMBER=$1 +OLD=/tmp/orig_pr_body.txt +NEW=/tmp/new_pr_body.txt + +gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}" +cp "${OLD}" "${NEW}" + +# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**" +sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE\*\*/,$d' "${NEW}" + +# Remove "FIX #xxxx (*link existing issues this PR will resolve*)" +sed -i '/FIX #xxxx.*$/d' "${NEW}" + +# Remove "FILL IN THE PR DESCRIPTION HERE" +sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}" + +# Run this only if ${NEW} is different than ${OLD} +if ! 
cmp -s "${OLD}" "${NEW}"; then + echo "Updating PR body" + gh pr edit --body-file "${NEW}" "${PR_NUMBER}" +else + echo "No changes needed" +fi diff --git a/vllm-v0.6.2/.github/workflows/actionlint.yml b/vllm-v0.6.2/.github/workflows/actionlint.yml new file mode 100644 index 0000000..0226cf0 --- /dev/null +++ b/vllm-v0.6.2/.github/workflows/actionlint.yml @@ -0,0 +1,40 @@ +name: Lint GitHub Actions workflows +on: + push: + branches: + - "main" + paths: + - '.github/workflows/*.ya?ml' + - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' + pull_request: + branches: + - "main" + paths: + - '.github/workflows/*.ya?ml' + - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + actionlint: + runs-on: ubuntu-latest + steps: + - name: "Checkout" + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + + - name: "Run actionlint" + run: | + echo "::add-matcher::.github/workflows/matchers/actionlint.json" + tools/actionlint.sh -color diff --git a/vllm-v0.6.2/.github/workflows/add_label_automerge.yml b/vllm-v0.6.2/.github/workflows/add_label_automerge.yml new file mode 100644 index 0000000..c9d6d42 --- /dev/null +++ b/vllm-v0.6.2/.github/workflows/add_label_automerge.yml @@ -0,0 +1,21 @@ +name: Add label on auto-merge enabled +on: + pull_request_target: + types: + - auto_merge_enabled +jobs: + add-label-on-auto-merge: + runs-on: ubuntu-latest + steps: + - name: Add label + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + with: + script: | + github.rest.issues.addLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + labels: ['ready'] + }) + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/vllm-v0.6.2/.github/workflows/clang-format.yml 
b/vllm-v0.6.2/.github/workflows/clang-format.yml new file mode 100644 index 0000000..68149d2 --- /dev/null +++ b/vllm-v0.6.2/.github/workflows/clang-format.yml @@ -0,0 +1,53 @@ +name: clang-format + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - main + paths: + - '**/*.h' + - '**/*.cpp' + - '**/*.cu' + - '**/*.cuh' + - '.github/workflows/clang-format.yml' + pull_request: + branches: + - main + paths: + - '**/*.h' + - '**/*.cpp' + - '**/*.cu' + - '**/*.cuh' + - '.github/workflows/clang-format.yml' + +jobs: + clang-format: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.11"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install clang-format==18.1.5 + - name: Running clang-format + run: | + EXCLUDES=( + 'csrc/moe/topk_softmax_kernels.cu' + 'csrc/quantization/gguf/ggml-common.h' + 'csrc/quantization/gguf/dequantize.cuh' + 'csrc/quantization/gguf/vecdotq.cuh' + 'csrc/quantization/gguf/mmq.cuh' + 'csrc/quantization/gguf/mmvq.cuh' + ) + find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ + | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \ + | xargs clang-format --dry-run --Werror diff --git a/vllm-v0.6.2/.github/workflows/cleanup_pr_body.yml b/vllm-v0.6.2/.github/workflows/cleanup_pr_body.yml new file mode 100644 index 0000000..0085a1c --- /dev/null +++ b/vllm-v0.6.2/.github/workflows/cleanup_pr_body.yml @@ -0,0 +1,26 @@ +name: Cleanup PR Body + +on: + pull_request_target: + types: [opened, reopened, edited] + +permissions: + pull-requests: write + +jobs: + update-description: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: 
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Set up Python + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: '3.12' + + - name: Update PR description + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}" diff --git a/vllm-v0.6.2/.github/workflows/codespell.yml b/vllm-v0.6.2/.github/workflows/codespell.yml new file mode 100644 index 0000000..68887ad --- /dev/null +++ b/vllm-v0.6.2/.github/workflows/codespell.yml @@ -0,0 +1,45 @@ +name: codespell + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - main + paths: + - "**/*.py" + - "**/*.md" + - "**/*.rst" + - pyproject.toml + - requirements-lint.txt + - .github/workflows/codespell.yml + pull_request: + branches: + - main + paths: + - "**/*.py" + - "**/*.md" + - "**/*.rst" + - pyproject.toml + - requirements-lint.txt + - .github/workflows/codespell.yml + +jobs: + codespell: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Spelling check with codespell + run: | + codespell --toml pyproject.toml diff --git a/vllm-v0.6.2/.github/workflows/matchers/actionlint.json b/vllm-v0.6.2/.github/workflows/matchers/actionlint.json new file mode 100644 index 0000000..4613e16 --- /dev/null +++ b/vllm-v0.6.2/.github/workflows/matchers/actionlint.json @@ -0,0 +1,17 @@ +{ + "problemMatcher": [ + { + "owner": "actionlint", + "pattern": [ + { + "regexp": 
"^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$", + "file": 1, + "line": 2, + "column": 3, + "message": 4, + "code": 5 + } + ] + } + ] +} diff --git a/vllm-v0.6.2/.github/workflows/matchers/mypy.json b/vllm-v0.6.2/.github/workflows/matchers/mypy.json new file mode 100644 index 0000000..f048fce --- /dev/null +++ b/vllm-v0.6.2/.github/workflows/matchers/mypy.json @@ -0,0 +1,16 @@ +{ + "problemMatcher": [ + { + "owner": "mypy", + "pattern": [ + { + "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$", + "file": 1, + "line": 2, + "severity": 3, + "message": 4 + } + ] + } + ] +} diff --git a/vllm-v0.6.2/.github/workflows/matchers/ruff.json b/vllm-v0.6.2/.github/workflows/matchers/ruff.json new file mode 100644 index 0000000..f6d4479 --- /dev/null +++ b/vllm-v0.6.2/.github/workflows/matchers/ruff.json @@ -0,0 +1,17 @@ +{ + "problemMatcher": [ + { + "owner": "ruff", + "pattern": [ + { + "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$", + "file": 1, + "line": 2, + "column": 3, + "code": 4, + "message": 5 + } + ] + } + ] + } diff --git a/vllm-v0.6.2/.github/workflows/mypy.yaml b/vllm-v0.6.2/.github/workflows/mypy.yaml new file mode 100644 index 0000000..73eeacf --- /dev/null +++ b/vllm-v0.6.2/.github/workflows/mypy.yaml @@ -0,0 +1,51 @@ +name: mypy + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - main + paths: + - '**/*.py' + - '.github/workflows/mypy.yaml' + - 'tools/mypy.sh' + - 'pyproject.toml' + pull_request: + branches: + - main + # This workflow is only relevant when one of the following files changes. + # However, we have github configured to expect and require this workflow + # to run and pass before github with auto-merge a pull request. Until github + # allows more flexible auto-merge policy, we can just run this on every PR. 
+ # It doesn't take that long to run, anyway. + #paths: + # - '**/*.py' + # - '.github/workflows/mypy.yaml' + # - 'tools/mypy.sh' + # - 'pyproject.toml' + +jobs: + mypy: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install mypy==1.11.1 + pip install types-setuptools + pip install types-PyYAML + pip install types-requests + pip install types-setuptools + - name: Mypy + run: | + echo "::add-matcher::.github/workflows/matchers/mypy.json" + tools/mypy.sh 1 ${{ matrix.python-version }} diff --git a/vllm-v0.6.2/.github/workflows/publish.yml b/vllm-v0.6.2/.github/workflows/publish.yml new file mode 100644 index 0000000..c1051d1 --- /dev/null +++ b/vllm-v0.6.2/.github/workflows/publish.yml @@ -0,0 +1,110 @@ +# This workflow will upload a Python Package to Release asset +# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions + +name: Create Release + +on: + push: + tags: + - v* + +# Needed to create release and upload assets +permissions: + contents: write + +jobs: + release: + # Retrieve tag and create release + name: Create Release + runs-on: ubuntu-latest + outputs: + upload_url: ${{ steps.create_release.outputs.upload_url }} + steps: + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Extract branch info + shell: bash + run: | + echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV" + + - name: Create Release + id: create_release + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + env: + RELEASE_TAG: ${{ 
env.release_tag }} + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + script: | + const script = require('.github/workflows/scripts/create_release.js') + await script(github, context, core) + + wheel: + name: Build Wheel + runs-on: ${{ matrix.os }} + needs: release + + strategy: + fail-fast: false + matrix: + os: ['ubuntu-20.04'] + python-version: ['3.9', '3.10', '3.11', '3.12'] + pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt. + cuda-version: ['11.8', '12.1'] + + steps: + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Setup ccache + uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14 + with: + create-symlink: true + key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }} + + - name: Set up Linux Env + if: ${{ runner.os == 'Linux' }} + run: | + bash -x .github/workflows/scripts/env.sh + + - name: Set up Python + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + + - name: Install CUDA ${{ matrix.cuda-version }} + run: | + bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }} + + - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }} + run: | + bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }} + + - name: Build wheel + shell: bash + env: + CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size + run: | + bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }} + wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename) + asset_name=${wheel_name//"linux"/"manylinux1"} + echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV" + echo "asset_name=${asset_name}" >> "$GITHUB_ENV" + + - name: 
Upload Release Asset + uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.release.outputs.upload_url }} + asset_path: ./dist/${{ env.wheel_name }} + asset_name: ${{ env.asset_name }} + asset_content_type: application/* + + # (Danielkinz): This last step will publish the .whl to pypi. Warning: untested + # - name: Publish package + # uses: pypa/gh-action-pypi-publish@release/v1.8 + # with: + # repository-url: https://test.pypi.org/legacy/ + # password: ${{ secrets.PYPI_API_TOKEN }} + # skip-existing: true diff --git a/vllm-v0.6.2/.github/workflows/reminder_comment.yml b/vllm-v0.6.2/.github/workflows/reminder_comment.yml new file mode 100644 index 0000000..df62539 --- /dev/null +++ b/vllm-v0.6.2/.github/workflows/reminder_comment.yml @@ -0,0 +1,21 @@ +name: PR Reminder Comment Bot +on: + pull_request_target: + types: [opened] + +jobs: + pr_reminder: + runs-on: ubuntu-latest + steps: + - name: Remind to run full CI on PR + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + with: + script: | + github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. 
\n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀' + }) + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/vllm-v0.6.2/.github/workflows/ruff.yml b/vllm-v0.6.2/.github/workflows/ruff.yml new file mode 100644 index 0000000..7266cc3 --- /dev/null +++ b/vllm-v0.6.2/.github/workflows/ruff.yml @@ -0,0 +1,52 @@ +name: ruff + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - main + paths: + - "**/*.py" + - pyproject.toml + - requirements-lint.txt + - .github/workflows/matchers/ruff.json + - .github/workflows/ruff.yml + pull_request: + branches: + - main + # This workflow is only relevant when one of the following files changes. + # However, we have github configured to expect and require this workflow + # to run and pass before github with auto-merge a pull request. Until github + # allows more flexible auto-merge policy, we can just run this on every PR. + # It doesn't take that long to run, anyway. + #paths: + # - "**/*.py" + # - pyproject.toml + # - requirements-lint.txt + # - .github/workflows/matchers/ruff.json + # - .github/workflows/ruff.yml + +jobs: + ruff: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Analysing the code with ruff + run: | + echo "::add-matcher::.github/workflows/matchers/ruff.json" + ruff check --output-format github . + - name: Run isort + run: | + isort . 
--check-only diff --git a/vllm-v0.6.2/.github/workflows/scripts/build.sh b/vllm-v0.6.2/.github/workflows/scripts/build.sh new file mode 100644 index 0000000..122e4e1 --- /dev/null +++ b/vllm-v0.6.2/.github/workflows/scripts/build.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -eux + +python_executable=python$1 +cuda_home=/usr/local/cuda-$2 + +# Update paths +PATH=${cuda_home}/bin:$PATH +LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH + +# Install requirements +$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt + +# Limit the number of parallel jobs to avoid OOM +export MAX_JOBS=1 +# Make sure release wheels are built for the following architectures +export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" +export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real" + +bash tools/check_repo.sh + +# Build +$python_executable setup.py bdist_wheel --dist-dir=dist diff --git a/vllm-v0.6.2/.github/workflows/scripts/create_release.js b/vllm-v0.6.2/.github/workflows/scripts/create_release.js new file mode 100644 index 0000000..d48cc06 --- /dev/null +++ b/vllm-v0.6.2/.github/workflows/scripts/create_release.js @@ -0,0 +1,20 @@ +// Uses Github's API to create the release and wait for result. +// We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately. 
+ +module.exports = async (github, context, core) => { + try { + const response = await github.rest.repos.createRelease({ + draft: false, + generate_release_notes: true, + name: process.env.RELEASE_TAG, + owner: context.repo.owner, + prerelease: true, + repo: context.repo.repo, + tag_name: process.env.RELEASE_TAG, + }); + + core.setOutput('upload_url', response.data.upload_url); + } catch (error) { + core.setFailed(error.message); + } +} diff --git a/vllm-v0.6.2/.github/workflows/scripts/cuda-install.sh b/vllm-v0.6.2/.github/workflows/scripts/cuda-install.sh new file mode 100644 index 0000000..3d0b7a1 --- /dev/null +++ b/vllm-v0.6.2/.github/workflows/scripts/cuda-install.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Replace '.' with '-' ex: 11.8 -> 11-8 +cuda_version=$(echo "$1" | tr "." "-") +# Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004 +OS=$(echo "$2" | tr -d ".\-") + +# Installs CUDA +wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb" +sudo dpkg -i cuda-keyring_1.1-1_all.deb +rm cuda-keyring_1.1-1_all.deb +sudo apt -qq update +sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}" +sudo apt clean + +# Test nvcc +PATH=/usr/local/cuda-$1/bin:${PATH} +nvcc --version + +# Log gcc, g++, c++ versions +gcc --version +g++ --version +c++ --version diff --git a/vllm-v0.6.2/.github/workflows/scripts/env.sh b/vllm-v0.6.2/.github/workflows/scripts/env.sh new file mode 100644 index 0000000..d7baaec --- /dev/null +++ b/vllm-v0.6.2/.github/workflows/scripts/env.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# This file installs common linux environment tools + +export LANG C.UTF-8 + +# python_version=$1 + +sudo apt-get update && \ +sudo apt-get install -y --no-install-recommends \ + software-properties-common \ + +sudo apt-get install -y --no-install-recommends \ + build-essential \ + apt-utils \ + ca-certificates \ + wget \ + git \ + vim \ + libssl-dev \ + curl \ + unzip \ + 
unrar \ + cmake \ + net-tools \ + sudo \ + autotools-dev \ + rsync \ + jq \ + openssh-server \ + tmux \ + screen \ + htop \ + pdsh \ + openssh-client \ + lshw \ + dmidecode \ + util-linux \ + automake \ + autoconf \ + libtool \ + net-tools \ + pciutils \ + libpci-dev \ + libaio-dev \ + libcap2 \ + libtinfo5 \ + fakeroot \ + devscripts \ + debhelper \ + nfs-common + +# Remove github bloat files to free up disk space +sudo rm -rf "/usr/local/share/boost" +sudo rm -rf "$AGENT_TOOLSDIRECTORY" +sudo rm -rf "/usr/share/dotnet" diff --git a/vllm-v0.6.2/.github/workflows/scripts/pytorch-install.sh b/vllm-v0.6.2/.github/workflows/scripts/pytorch-install.sh new file mode 100644 index 0000000..e3cda7d --- /dev/null +++ b/vllm-v0.6.2/.github/workflows/scripts/pytorch-install.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +python_executable=python$1 +pytorch_version=$2 +cuda_version=$3 + +# Install torch +$python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya +$python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}" + +# Print version information +$python_executable --version +$python_executable -c "import torch; print('PyTorch:', torch.__version__)" +$python_executable -c "import torch; print('CUDA:', torch.version.cuda)" +$python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" diff --git a/vllm-v0.6.2/.github/workflows/shellcheck.yml b/vllm-v0.6.2/.github/workflows/shellcheck.yml new file mode 100644 index 0000000..4b1587e --- /dev/null +++ b/vllm-v0.6.2/.github/workflows/shellcheck.yml @@ -0,0 +1,37 @@ +name: Lint shell scripts +on: + push: + branches: + - "main" + paths: + - '**/*.sh' + - '.github/workflows/shellcheck.yml' + pull_request: + branches: + - "main" + paths: + - '**/*.sh' + - '.github/workflows/shellcheck.yml' + +env: + LC_ALL: 
en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + shellcheck: + runs-on: ubuntu-latest + steps: + - name: "Checkout" + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + + - name: "Check shell scripts" + run: | + tools/shellcheck.sh diff --git a/vllm-v0.6.2/.github/workflows/stale.yml b/vllm-v0.6.2/.github/workflows/stale.yml new file mode 100644 index 0000000..81e7c9b --- /dev/null +++ b/vllm-v0.6.2/.github/workflows/stale.yml @@ -0,0 +1,52 @@ +name: 'Close inactive issues and PRs' + +on: + schedule: + # Daily at 1:30 AM UTC + - cron: '30 1 * * *' + +jobs: + close-issues-and-pull-requests: + permissions: + issues: write + pull-requests: write + actions: write + runs-on: ubuntu-latest + steps: + - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0 + with: + # Increasing this value ensures that changes to this workflow + # propagate to all issues and PRs in days rather than months + operations-per-run: 1000 + + exempt-draft-pr: true + exempt-issue-labels: 'keep-open' + exempt-pr-labels: 'keep-open' + + labels-to-add-when-unstale: 'unstale' + labels-to-remove-when-stale: 'unstale' + + days-before-issue-stale: 90 + days-before-issue-close: 30 + stale-issue-label: 'stale' + stale-issue-message: > + This issue has been automatically marked as stale because it has not + had any activity within 90 days. It will be automatically closed if no + further activity occurs within 30 days. Leave a comment if + you feel this issue should remain open. Thank you! + close-issue-message: > + This issue has been automatically closed due to inactivity. Please + feel free to reopen if you feel it is still relevant. Thank you! + + days-before-pr-stale: 90 + days-before-pr-close: 30 + stale-pr-label: 'stale' + stale-pr-message: > + This pull request has been automatically marked as stale because it + has not had any activity within 90 days. 
It will be automatically + closed if no further activity occurs within 30 days. Leave a comment + if you feel this pull request should remain open. Thank you! + close-pr-message: > + This pull request has been automatically closed due to inactivity. + Please feel free to reopen if you intend to continue working on it. + Thank you! diff --git a/vllm-v0.6.2/.github/workflows/yapf.yml b/vllm-v0.6.2/.github/workflows/yapf.yml new file mode 100644 index 0000000..ff441f9 --- /dev/null +++ b/vllm-v0.6.2/.github/workflows/yapf.yml @@ -0,0 +1,38 @@ +name: yapf + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - main + paths: + - "**/*.py" + - .github/workflows/yapf.yml + pull_request: + branches: + - main + paths: + - "**/*.py" + - .github/workflows/yapf.yml + +jobs: + yapf: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install yapf==0.32.0 + pip install toml==0.10.2 + - name: Running yapf + run: | + yapf --diff --recursive . 
diff --git a/vllm-v0.6.2/.readthedocs.yaml b/vllm-v0.6.2/.readthedocs.yaml new file mode 100644 index 0000000..284196b --- /dev/null +++ b/vllm-v0.6.2/.readthedocs.yaml @@ -0,0 +1,21 @@ +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.12" + +sphinx: + configuration: docs/source/conf.py + fail_on_warning: true + +# If using Sphinx, optionally build your docs in additional formats such as PDF +formats: [] + +# Optionally declare the Python requirements required to build your docs +python: + install: + - requirements: docs/requirements-docs.txt diff --git a/vllm-v0.6.2/.shellcheckrc b/vllm-v0.6.2/.shellcheckrc new file mode 100644 index 0000000..f3b6eed --- /dev/null +++ b/vllm-v0.6.2/.shellcheckrc @@ -0,0 +1,9 @@ +# rules currently disabled: +# +# SC1091 (info): Not following: was not specified as input (see shellcheck -x) +# SC2004 (style): $/${} is unnecessary on arithmetic variables. +# SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects. +# SC2155 (warning): Declare and assign separately to avoid masking return values. +# SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails. +# +disable=SC1091,SC2004,SC2129,SC2155,SC2164 diff --git a/vllm-v0.6.2/.yapfignore b/vllm-v0.6.2/.yapfignore new file mode 100644 index 0000000..2d6dcf8 --- /dev/null +++ b/vllm-v0.6.2/.yapfignore @@ -0,0 +1 @@ +collect_env.py diff --git a/vllm-v0.6.2/CMakeLists.txt b/vllm-v0.6.2/CMakeLists.txt new file mode 100644 index 0000000..5acbd76 --- /dev/null +++ b/vllm-v0.6.2/CMakeLists.txt @@ -0,0 +1,546 @@ +cmake_minimum_required(VERSION 3.26) + +# When building directly using CMake, make sure you run the install step +# (it places the .so files in the correct location). +# +# Example: +# mkdir build && cd build +# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_INSTALL_PREFIX=.. .. 
+# cmake --build . --target install +# +# If you want to only build one target, make sure to install it manually: +# cmake --build . --target _C +# cmake --install . --component _C +project(vllm_extensions LANGUAGES CXX) + +# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py) +set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM") + +message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") +message(STATUS "Target device: ${VLLM_TARGET_DEVICE}") + +include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) + +# Suppress potential warnings about unused manually-specified variables +set(ignoreMe "${VLLM_PYTHON_PATH}") + +# Prevent installation of dependencies (cutlass) by default. +install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) + +# +# Supported python versions. These versions will be searched in order, the +# first match will be selected. These should be kept in sync with setup.py. +# +set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") + +# Supported NVIDIA architectures. +set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") + +# Supported AMD GPU architectures. +set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101") + +# +# Supported/expected torch versions for CUDA/ROCm. +# +# Currently, having an incorrect pytorch version results in a warning +# rather than an error. +# +# Note: the CUDA torch version is derived from pyproject.toml and various +# requirements.txt files and should be kept consistent. The ROCm torch +# versions are derived from Dockerfile.rocm +# +set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1") +set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1") + +# +# Try to find python package with an executable that exactly matches +# `VLLM_PYTHON_EXECUTABLE` and is one of the supported versions. 
+# +if (VLLM_PYTHON_EXECUTABLE) + find_python_from_executable(${VLLM_PYTHON_EXECUTABLE} "${PYTHON_SUPPORTED_VERSIONS}") +else() + message(FATAL_ERROR + "Please set VLLM_PYTHON_EXECUTABLE to the path of the desired python version" + " before running cmake configure.") +endif() + +# +# Update cmake's `CMAKE_PREFIX_PATH` with torch location. +# +append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") + +# Ensure the 'nvcc' command is in the PATH +find_program(NVCC_EXECUTABLE nvcc) +if (CUDA_FOUND AND NOT NVCC_EXECUTABLE) + message(FATAL_ERROR "nvcc not found") +endif() + +# +# Import torch cmake configuration. +# Torch also imports CUDA (and partially HIP) languages with some customizations, +# so there is no need to do this explicitly with check_language/enable_language, +# etc. +# +find_package(Torch REQUIRED) + +# +# Forward the non-CUDA device extensions to external CMake scripts. +# +if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND + NOT VLLM_TARGET_DEVICE STREQUAL "rocm") + if (VLLM_TARGET_DEVICE STREQUAL "cpu") + include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake) + else() + return() + endif() + return() +endif() + +# +# Set up GPU language and check the torch version and warn if it isn't +# what is expected. +# +if (NOT HIP_FOUND AND CUDA_FOUND) + set(VLLM_GPU_LANG "CUDA") + + if (NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_CUDA}) + message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_CUDA} " + "expected for CUDA build, saw ${Torch_VERSION} instead.") + endif() +elseif(HIP_FOUND) + set(VLLM_GPU_LANG "HIP") + + # Importing torch recognizes and sets up some HIP/ROCm configuration but does + # not let cmake recognize .hip files. In order to get cmake to understand the + # .hip extension automatically, HIP must be enabled explicitly. 
+ enable_language(HIP) + + # ROCm 5.X and 6.X + if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND + NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM}) + message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} " + "expected for ROCm build, saw ${Torch_VERSION} instead.") + endif() +else() + message(FATAL_ERROR "Can't find CUDA or HIP installation.") +endif() + + +if(VLLM_GPU_LANG STREQUAL "CUDA") + # + # For cuda we want to be able to control which architectures we compile for on + # a per-file basis in order to cut down on compile time. So here we extract + # the set of architectures we want to compile for and remove them from the + # CMAKE_CUDA_FLAGS so that they are not applied globally. + # + clear_cuda_arches(CUDA_ARCH_FLAGS) + extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}") + message(STATUS "CUDA target architectures: ${CUDA_ARCHS}") + # Filter the target architectures by the supported archs + # since for some files we will build for all CUDA_ARCHS. + cuda_archs_loose_intersection(CUDA_ARCHS + "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}") + message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}") +else() + # + # For other GPU targets override the GPU architectures detected by cmake/torch + # and filter them by the supported versions for the current language. + # The final set of arches is stored in `VLLM_GPU_ARCHES`. + # + override_gpu_arches(VLLM_GPU_ARCHES + ${VLLM_GPU_LANG} + "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}") +endif() + +# +# Query torch for additional GPU compilation flags for the given +# `VLLM_GPU_LANG`. +# The final set of flags is stored in `VLLM_GPU_FLAGS`. +# +get_torch_gpu_compiler_flags(VLLM_GPU_FLAGS ${VLLM_GPU_LANG}) + +# +# Set nvcc parallelism. 
+# +if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA") + list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") +endif() + + +# +# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process. 
+# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache. +# Each dependency that produces build artifacts should override its BINARY_DIR to avoid +# conflicts between build types. It should instead be set to ${CMAKE_BINARY_DIR}/. +# +include(FetchContent) +file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists +message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") + +# +# Define other extension targets +# + +# +# _C extension +# + +set(VLLM_EXT_SRC + "csrc/cache_kernels.cu" + "csrc/attention/paged_attention_v1.cu" + "csrc/attention/paged_attention_v2.cu" + "csrc/pos_encoding_kernels.cu" + "csrc/activation_kernels.cu" + "csrc/layernorm_kernels.cu" + "csrc/layernorm_quant_kernels.cu" + "csrc/quantization/gptq/q_gemm.cu" + "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" + "csrc/quantization/fp8/common.cu" + "csrc/cuda_utils_kernels.cu" + "csrc/prepare_inputs/advance_step.cu" + "csrc/torch_bindings.cpp") + +if(VLLM_GPU_LANG STREQUAL "CUDA") + SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") + + # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case. + set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use") + + FetchContent_Declare( + cutlass + GIT_REPOSITORY https://github.com/nvidia/cutlass.git + GIT_TAG v3.5.1 + GIT_PROGRESS TRUE + + # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history. + # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags. 
+ # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE + GIT_SHALLOW TRUE + ) + FetchContent_MakeAvailable(cutlass) + + list(APPEND VLLM_EXT_SRC + "csrc/mamba/mamba_ssm/selective_scan_fwd.cu" + "csrc/mamba/causal_conv1d/causal_conv1d.cu" + "csrc/quantization/aqlm/gemm_kernels.cu" + "csrc/quantization/awq/gemm_kernels.cu" + "csrc/quantization/gguf/gguf_kernel.cu" + "csrc/custom_all_reduce.cu" + "csrc/permute_cols.cu" + "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu") + + set_gencode_flags_for_srcs( + SRCS "${VLLM_EXT_SRC}" + CUDA_ARCHS "${CUDA_ARCHS}") + + # Only build Marlin kernels if we are building for at least some compatible archs. + # Keep building Marlin for 9.0 as there are some group sizes and shapes that + # are not supported by Machete yet. + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS}) + if (MARLIN_ARCHS) + set(MARLIN_SRCS + "csrc/quantization/fp8/fp8_marlin.cu" + "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" + "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" + "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu" + "csrc/quantization/gptq_marlin/gptq_marlin.cu" + "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" + "csrc/quantization/gptq_marlin/awq_marlin_repack.cu") + set_gencode_flags_for_srcs( + SRCS "${MARLIN_SRCS}" + CUDA_ARCHS "${MARLIN_ARCHS}") + list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}") + message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}") + else() + message(STATUS "Not building Marlin kernels as no compatible archs found" + " in CUDA target architectures") + endif() + + # + # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require + # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now). 
+ cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) + set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1") + message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) + message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is " + "not >= 12.0, we recommend upgrading to CUDA 12.0 or " + "later if you intend on running FP8 quantized models on " + "Hopper.") + else() + message(STATUS "Not building scaled_mm_c3x as no compatible archs found " + "in CUDA target architectures") + endif() + + # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't + # build any 3x kernels + set(SCALED_MM_3X_ARCHS) + endif() + + # + # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) + # kernels for the remaining archs that are not already built for 3x. 
+ cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS + "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}") + # subtract out the archs that are already built for 3x + list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) + if (SCALED_MM_2X_ARCHS) + set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_2X_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1") + message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}") + else() + if (SCALED_MM_3X_ARCHS) + message(STATUS "Not building scaled_mm_c2x as all archs are already built" + " for and covered by scaled_mm_c3x") + else() + message(STATUS "Not building scaled_mm_c2x as no compatible archs found " + "in CUDA target architectures") + endif() + endif() + + + # + # Machete kernels + + # The machete kernels only work on hopper and require CUDA 12.0 or later. + # Only build Machete kernels if we are building for something compatible with sm90a + cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS) + # + # For the Machete kernels we automatically generate sources for various + # preselected input type pairs and schedules. 
+ # Generate sources: + set(MACHETE_GEN_SCRIPT + ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py) + file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH) + + message(STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH}") + message(STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH}") + + if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH} + OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH}) + execute_process( + COMMAND ${CMAKE_COMMAND} -E env + PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH + ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT} + RESULT_VARIABLE machete_generation_result + OUTPUT_VARIABLE machete_generation_output + OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log + ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log + ) + + if (NOT machete_generation_result EQUAL 0) + message(FATAL_ERROR "Machete generation failed." 
+ " Result: \"${machete_generation_result}\"" + "\nCheck the log for details: " + "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log") + else() + set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH} + CACHE STRING "Last run machete generate script hash" FORCE) + message(STATUS "Machete generation completed successfully.") + endif() + else() + message(STATUS "Machete generation script has not changed, skipping generation.") + endif() + + # Add machete generated sources + file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu") + list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES}) + + # forward compatible + set_gencode_flags_for_srcs( + SRCS "${MACHETE_GEN_SOURCES}" + CUDA_ARCHS "${MACHETE_ARCHS}") + + list(APPEND VLLM_EXT_SRC + csrc/quantization/machete/machete_pytorch.cu) + + message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 + AND MACHETE_ARCHS) + message(STATUS "Not building Machete kernels as CUDA Compiler version is " + "not >= 12.0, we recommend upgrading to CUDA 12.0 or " + "later if you intend on running w4a16 quantized models on " + "Hopper.") + else() + message(STATUS "Not building Machete kernels as no compatible archs " + "found in CUDA target architectures") + endif() + endif() +# if CUDA endif +endif() + +message(STATUS "Enabling C extension.") +define_gpu_extension_target( + _C + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${VLLM_EXT_SRC} + COMPILE_FLAGS ${VLLM_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} + INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR} + USE_SABI 3 + WITH_SOABI) + +# If CUTLASS is compiled on NVCC >= 12.5, it by default uses +# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the +# driver API. This causes problems when linking with earlier versions of CUDA. +# Setting this variable sidesteps the issue by calling the driver directly. 
+target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) + +# +# _moe_C extension +# + +set(VLLM_MOE_EXT_SRC + "csrc/moe/torch_bindings.cpp" + "csrc/moe/moe_align_sum_kernels.cu" + "csrc/moe/topk_softmax_kernels.cu") + +set_gencode_flags_for_srcs( + SRCS "${VLLM_MOE_EXT_SRC}" + CUDA_ARCHS "${CUDA_ARCHS}") + +if(VLLM_GPU_LANG STREQUAL "CUDA") + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}") + if (MARLIN_MOE_ARCHS) + set(MARLIN_MOE_SRC + "csrc/moe/marlin_kernels/marlin_moe_kernel.h" + "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h" + "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu" + "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h" + "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu" + "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h" + "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu" + "csrc/moe/marlin_moe_ops.cu") + + set_gencode_flags_for_srcs( + SRCS "${MARLIN_MOE_SRC}" + CUDA_ARCHS "${MARLIN_MOE_ARCHS}") + + list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}") + message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") + else() + message(STATUS "Not building Marlin MOE kernels as no compatible archs found" + " in CUDA target architectures") + endif() +endif() + +message(STATUS "Enabling moe extension.") +define_gpu_extension_target( + _moe_C + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${VLLM_MOE_EXT_SRC} + COMPILE_FLAGS ${VLLM_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} + USE_SABI 3 + WITH_SOABI) + +if(VLLM_GPU_LANG STREQUAL "HIP") + # + # _rocm_C extension + # + set(VLLM_ROCM_EXT_SRC + "csrc/rocm/torch_bindings.cpp" + "csrc/rocm/attention.cu") + + define_gpu_extension_target( + _rocm_C + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${VLLM_ROCM_EXT_SRC} + COMPILE_FLAGS ${VLLM_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} + USE_SABI 3 + WITH_SOABI) +endif() + +# vllm-flash-attn currently only supported on CUDA +if (NOT VLLM_TARGET_DEVICE STREQUAL 
"cuda") + return() +endif () + +# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target +# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the +# arches in the CUDA case (and instead set the gencodes on a per file basis) +# we need to manually set VLLM_GPU_ARCHES here. +if(VLLM_GPU_LANG STREQUAL "CUDA") + foreach(_ARCH ${CUDA_ARCHS}) + string(REPLACE "." "" _ARCH "${_ARCH}") + list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real") + endforeach() +endif() + +# +# Build vLLM flash attention from source +# +# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM. +# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs. +# They should be identical but if they aren't, this is a massive footgun. +# +# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place. +# To only install vllm-flash-attn, use --component vllm_flash_attn_c. +# If no component is specified, vllm-flash-attn is still installed. + +# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading. +# This is to enable local development of vllm-flash-attn within vLLM. +# It can be set as an environment variable or passed as a cmake argument. +# The environment variable takes precedence. 
+if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR}) + set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR}) +endif() + +if(VLLM_FLASH_ATTN_SRC_DIR) + FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR}) +else() + FetchContent_Declare( + vllm-flash-attn + GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git + GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9 + GIT_PROGRESS TRUE + # Don't share the vllm-flash-attn build between build types + BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn + ) +endif() + +# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization. +set(VLLM_PARENT_BUILD ON) + +# Ensure the vllm/vllm_flash_attn directory exists before installation +install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c) + +# Make sure vllm-flash-attn install rules are nested under vllm/ +install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c) +install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c) +install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c) + +# Fetch the vllm-flash-attn library +FetchContent_MakeAvailable(vllm-flash-attn) +message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}") + +# Restore the install prefix +install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c) +install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c) + +# Copy over the vllm-flash-attn python files +install( + DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ + DESTINATION vllm/vllm_flash_attn + COMPONENT vllm_flash_attn_c + FILES_MATCHING PATTERN "*.py" +) + +# Nothing after vllm-flash-attn, see comment about macros above diff --git a/vllm-v0.6.2/CODE_OF_CONDUCT.md b/vllm-v0.6.2/CODE_OF_CONDUCT.md new file mode 100644 index 
0000000..f801b5f --- /dev/null +++ b/vllm-v0.6.2/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ + +# vLLM Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socioeconomic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. 
+ +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official email address, +posting via an official social media account, or acting as an appointed +representative at an online or offline/IRL event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement in the #code-of-conduct +channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g). +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. 
This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), +version 2.1, available at +[v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html). + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion). + +For answers to common questions about this code of conduct, see the +[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at +[Contributor Covenant translations](https://www.contributor-covenant.org/translations). + diff --git a/vllm-v0.6.2/CONTRIBUTING.md b/vllm-v0.6.2/CONTRIBUTING.md new file mode 100644 index 0000000..6d46a6d --- /dev/null +++ b/vllm-v0.6.2/CONTRIBUTING.md @@ -0,0 +1,3 @@ +# Contributing to vLLM + +You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html). 
diff --git a/vllm-v0.6.2/DCO b/vllm-v0.6.2/DCO new file mode 100644 index 0000000..49b8cb0 --- /dev/null +++ b/vllm-v0.6.2/DCO @@ -0,0 +1,34 @@ +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. diff --git a/vllm-v0.6.2/Dockerfile b/vllm-v0.6.2/Dockerfile new file mode 100644 index 0000000..220dbe2 --- /dev/null +++ b/vllm-v0.6.2/Dockerfile @@ -0,0 +1,222 @@ +# The vLLM Dockerfile is used to construct vLLM image that can be directly used +# to run the OpenAI compatible server. 
+ +# Please update any changes made here to +# docs/source/dev/dockerfile/dockerfile.rst and +# docs/source/assets/dev/dockerfile-stages-dependency.png + +ARG CUDA_VERSION=12.4.1 +#################### BASE BUILD IMAGE #################### +# prepare basic build environment +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base +ARG CUDA_VERSION=12.4.1 +ARG PYTHON_VERSION=3.12 +ENV DEBIAN_FRONTEND=noninteractive + +# Install Python and other dependencies +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ + && apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl sudo \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip --version + +# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 +# as it was causing spam when compiling the CUTLASS kernels +RUN apt-get install -y gcc-10 g++-10 +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 +RUN <> /etc/environment + +# Install Python and other dependencies +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ + && apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \ + && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ + && add-apt-repository 
ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip --version + +# Workaround for https://github.com/openai/triton/issues/2507 and +# https://github.com/pytorch/pytorch/issues/107960 -- hopefully +# this won't be needed for future versions of this docker image +# or future versions of triton. +RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ + +# install vllm wheel first, so that torch etc will be installed +RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ + --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install dist/*.whl --verbose + +RUN --mount=type=cache,target=/root/.cache/pip \ + . /etc/environment && \ + python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl +COPY examples examples +#################### vLLM installation IMAGE #################### + + +#################### TEST IMAGE #################### +# image to run unit testing suite +# note that this uses vllm installed by `pip` +FROM vllm-base AS test + +ADD . 
/vllm-workspace/ + +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install -r requirements-dev.txt + +# enable fast downloads from hf (for testing) +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install hf_transfer +ENV HF_HUB_ENABLE_HF_TRANSFER 1 + +# Copy in the v1 package for testing (it isn't distributed yet) +COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1 + +# doc requires source code +# we hide them inside `test_docs/` , so that this source code +# will not be imported by other tests +RUN mkdir test_docs +RUN mv docs test_docs/ +RUN mv vllm test_docs/ + +#################### TEST IMAGE #################### + +#################### OPENAI API SERVER #################### +# openai api server alternative +FROM vllm-base AS vllm-openai + +# install additional dependencies for openai api server +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10 + +ENV VLLM_USAGE_SOURCE production-docker-image + +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] +#################### OPENAI API SERVER #################### diff --git a/vllm-v0.6.2/Dockerfile.cpu b/vllm-v0.6.2/Dockerfile.cpu new file mode 100644 index 0000000..287b495 --- /dev/null +++ b/vllm-v0.6.2/Dockerfile.cpu @@ -0,0 +1,65 @@ +# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform. 
+ +FROM ubuntu:22.04 AS cpu-test-1 + +ENV CCACHE_DIR=/root/.cache/ccache + +ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache + +RUN --mount=type=cache,target=/var/cache/apt \ + apt-get update -y \ + && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ + && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ + && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 + +# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html +# intel-openmp provides additional performance improvement vs. openmp +# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects. +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install intel-openmp + +ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so" + +RUN echo 'ulimit -c 0' >> ~/.bashrc + +RUN pip install intel_extension_for_pytorch==2.5.0 + +WORKDIR /workspace + +ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" +ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ + pip install --upgrade pip && \ + pip install -r requirements-build.txt + +FROM cpu-test-1 AS build + +WORKDIR /workspace/vllm + +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \ + --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ + pip install -v -r requirements-cpu.txt + +COPY . . +ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi + +# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
+ARG VLLM_CPU_DISABLE_AVX512 +ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} + +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=bind,source=.git,target=.git \ + VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \ + pip install dist/*.whl && \ + rm -rf dist + +WORKDIR /workspace/ + +RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks + +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/vllm-v0.6.2/Dockerfile.hpu b/vllm-v0.6.2/Dockerfile.hpu new file mode 100644 index 0000000..d18fc01 --- /dev/null +++ b/vllm-v0.6.2/Dockerfile.hpu @@ -0,0 +1,18 @@ +FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + +COPY ./ /workspace/vllm + +WORKDIR /workspace/vllm + +RUN pip install -v -r requirements-hpu.txt + +ENV no_proxy=localhost,127.0.0.1 +ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true + +RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install + +WORKDIR /workspace/ + +RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks + +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/vllm-v0.6.2/Dockerfile.neuron b/vllm-v0.6.2/Dockerfile.neuron new file mode 100644 index 0000000..2143315 --- /dev/null +++ b/vllm-v0.6.2/Dockerfile.neuron @@ -0,0 +1,41 @@ +# default base image +ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04" + +FROM $BASE_IMAGE + +RUN echo "Base image is $BASE_IMAGE" + +# Install some basic utilities +RUN apt-get update && \ + apt-get install -y \ + git \ + python3 \ + python3-pip \ + ffmpeg libsm6 libxext6 libgl1 + +### Mount Point ### +# When launching the container, mount the code directory to /app +ARG APP_MOUNT=/app +VOLUME [ ${APP_MOUNT} ] +WORKDIR ${APP_MOUNT}/vllm + +RUN python3 -m pip install --upgrade pip +RUN python3 -m pip install 
--no-cache-dir fastapi ninja tokenizers pandas +RUN python3 -m pip install sentencepiece transformers==4.36.2 -U +RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U +RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U + +COPY . . +ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi + +RUN python3 -m pip install -U \ + 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ + -r requirements-neuron.txt + +ENV VLLM_TARGET_DEVICE neuron +RUN --mount=type=bind,source=.git,target=.git \ + pip install --no-build-isolation -v -e . + +CMD ["/bin/bash"] diff --git a/vllm-v0.6.2/Dockerfile.openvino b/vllm-v0.6.2/Dockerfile.openvino new file mode 100644 index 0000000..a05ff45 --- /dev/null +++ b/vllm-v0.6.2/Dockerfile.openvino @@ -0,0 +1,25 @@ +# The vLLM Dockerfile is used to construct vLLM image that can be directly used +# to run the OpenAI compatible server. + +FROM ubuntu:22.04 AS dev + +RUN apt-get update -y && \ + apt-get install -y \ + git python3-pip \ + ffmpeg libsm6 libxext6 libgl1 +WORKDIR /workspace + +COPY . . 
+ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi + +# install build requirements +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt +# build vLLM with OpenVINO backend +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace + +COPY examples/ /workspace/examples +COPY benchmarks/ /workspace/benchmarks + +CMD ["/bin/bash"] diff --git a/vllm-v0.6.2/Dockerfile.ppc64le b/vllm-v0.6.2/Dockerfile.ppc64le new file mode 100644 index 0000000..b19c6dd --- /dev/null +++ b/vllm-v0.6.2/Dockerfile.ppc64le @@ -0,0 +1,36 @@ +FROM mambaorg/micromamba +ARG MAMBA_DOCKERFILE_ACTIVATE=1 +USER root + +ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/" + +RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 + +# Some packages in requirements-cpu are installed here +# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba +# Currently these may not be available for venv or pip directly +RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes + +COPY ./ /workspace/vllm + +WORKDIR /workspace/vllm +ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi + +# These packages will be in rocketce eventually +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ + 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ + torch==2.3.1 \ + -r requirements-cpu.txt \ + xformers uvloop==0.20.0 + +RUN --mount=type=bind,source=.git,target=.git \ + 
VLLM_TARGET_DEVICE=cpu python3 setup.py install + +WORKDIR /workspace/ + +RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks + +ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/vllm-v0.6.2/Dockerfile.rocm b/vllm-v0.6.2/Dockerfile.rocm new file mode 100644 index 0000000..8fb79af --- /dev/null +++ b/vllm-v0.6.2/Dockerfile.rocm @@ -0,0 +1,171 @@ +# Default ROCm 6.2 base image +ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0" + +# Default ROCm ARCHes to build vLLM for. +ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100" + +# Whether to install CK-based flash-attention +# If 0, will not install flash-attention +ARG BUILD_FA="1" +ARG FA_GFX_ARCHS="gfx90a;gfx942" +ARG FA_BRANCH="3cea2fb" + +# Whether to build triton on rocm +ARG BUILD_TRITON="1" +ARG TRITON_BRANCH="e192dba" + +### Base image build stage +FROM $BASE_IMAGE AS base + +# Import arg(s) defined before this build stage +ARG PYTORCH_ROCM_ARCH + +# Install some basic utilities +RUN apt-get update && apt-get install python3 python3-pip -y +RUN apt-get update && apt-get install -y \ + curl \ + ca-certificates \ + sudo \ + git \ + bzip2 \ + libx11-6 \ + build-essential \ + wget \ + unzip \ + tmux \ + ccache \ + && rm -rf /var/lib/apt/lists/* + +# When launching the container, mount the code directory to /vllm-workspace +ARG APP_MOUNT=/vllm-workspace +WORKDIR ${APP_MOUNT} + +RUN python3 -m pip install --upgrade pip +# Remove sccache so it doesn't interfere with ccache +# TODO: implement sccache support across components +RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" + +# Install torch == 2.6.0 on ROCm +RUN --mount=type=cache,target=/root/.cache/pip \ + case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ + *"rocm-6.2"*) \ + python3 -m pip uninstall -y torch torchvision \ + && python3 -m pip install --pre \ + torch==2.6.0.dev20240918 \ + 
'setuptools-scm>=8' \ + torchvision==0.20.0.dev20240918 \ + --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \ + *) ;; esac + +ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer +ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin: +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib: +ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/: + +ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} +ENV CCACHE_DIR=/root/.cache/ccache + + +### AMD-SMI build stage +FROM base AS build_amdsmi +# Build amdsmi wheel always +RUN cd /opt/rocm/share/amd_smi \ + && python3 -m pip wheel . --wheel-dir=/install + + +### Flash-Attention wheel build stage +FROM base AS build_fa +ARG BUILD_FA +ARG FA_GFX_ARCHS +ARG FA_BRANCH +# Build ROCm flash-attention wheel if `BUILD_FA = 1` +RUN --mount=type=cache,target=${CCACHE_DIR} \ + if [ "$BUILD_FA" = "1" ]; then \ + mkdir -p libs \ + && cd libs \ + && git clone https://github.com/ROCm/flash-attention.git \ + && cd flash-attention \ + && git checkout "${FA_BRANCH}" \ + && git submodule update --init \ + && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \ + # Create an empty directory otherwise as later build stages expect one + else mkdir -p /install; \ + fi + + +### Triton wheel build stage +FROM base AS build_triton +ARG BUILD_TRITON +ARG TRITON_BRANCH +# Build triton wheel if `BUILD_TRITON = 1` +RUN --mount=type=cache,target=${CCACHE_DIR} \ + if [ "$BUILD_TRITON" = "1" ]; then \ + mkdir -p libs \ + && cd libs \ + && python3 -m pip install ninja cmake wheel pybind11 \ + && git clone https://github.com/OpenAI/triton.git \ + && cd triton \ + && git checkout "${TRITON_BRANCH}" \ + && cd python \ + && python3 setup.py bdist_wheel --dist-dir=/install; \ + # Create an empty directory otherwise as later build stages expect one + else mkdir -p /install; \ + fi + + +### Final vLLM build stage +FROM base AS final +# Import the 
vLLM development directory from the build context +COPY . . +ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi + +RUN python3 -m pip install --upgrade pip + +# Package upgrades for useful functionality or to avoid dependency issues +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard + + +# Workaround for ray >= 2.10.0 +ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 +# Silences the HF Tokenizers warning +ENV TOKENIZERS_PARALLELISM=false + +RUN --mount=type=cache,target=${CCACHE_DIR} \ + --mount=type=bind,source=.git,target=.git \ + --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install -Ur requirements-rocm.txt \ + && python3 setup.py clean --all \ + && python3 setup.py develop + +# Copy amdsmi wheel into final image +RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \ + mkdir -p libs \ + && cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && python3 -m pip uninstall -y amdsmi; + +# Copy triton wheel(s) into final image if they were built +RUN --mount=type=bind,from=build_triton,src=/install,target=/install \ + mkdir -p libs \ + && if ls /install/*.whl; then \ + cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && python3 -m pip uninstall -y triton; fi + +# Copy flash-attn wheel(s) into final image if they were built +RUN --mount=type=bind,from=build_fa,src=/install,target=/install \ + mkdir -p libs \ + && if ls /install/*.whl; then \ + cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && python3 -m pip uninstall -y flash-attn; fi + +# Install wheels that were built to the final image +RUN --mount=type=cache,target=/root/.cache/pip \ + if ls libs/*.whl; then \ + python3 -m pip install libs/*.whl; fi + +CMD ["/bin/bash"] diff --git 
a/vllm-v0.6.2/Dockerfile.tpu b/vllm-v0.6.2/Dockerfile.tpu new file mode 100644 index 0000000..0a507b6 --- /dev/null +++ b/vllm-v0.6.2/Dockerfile.tpu @@ -0,0 +1,25 @@ +ARG NIGHTLY_DATE="20241017" +ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" + +FROM $BASE_IMAGE +WORKDIR /workspace/vllm + +# Install some basic utilities +RUN apt-get update && apt-get install -y \ + git \ + ffmpeg libsm6 libxext6 libgl1 + +# Build vLLM. +COPY . . +ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi + +ENV VLLM_TARGET_DEVICE="tpu" +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,source=.git,target=.git \ + python3 -m pip install \ + -r requirements-tpu.txt +RUN python3 setup.py develop + +CMD ["/bin/bash"] diff --git a/vllm-v0.6.2/Dockerfile.xpu b/vllm-v0.6.2/Dockerfile.xpu new file mode 100644 index 0000000..63bc682 --- /dev/null +++ b/vllm-v0.6.2/Dockerfile.xpu @@ -0,0 +1,68 @@ +FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base + +RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \ + echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \ + chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \ + wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \ + echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \ + chmod 644 /usr/share/keyrings/intel-graphics.gpg + +RUN apt-get update -y && \ + apt-get install -y 
--no-install-recommends --fix-missing \ + curl \ + ffmpeg \ + git \ + libsndfile1 \ + libsm6 \ + libxext6 \ + libgl1 \ + lsb-release \ + numactl \ + python3 \ + python3-dev \ + python3-pip \ + # vim \ + wget + +WORKDIR /workspace/vllm +COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt +COPY requirements-common.txt /workspace/vllm/requirements-common.txt + +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install --no-cache-dir \ + -r requirements-xpu.txt + +RUN git clone https://github.com/intel/pti-gpu && \ + cd pti-gpu/sdk && \ + git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \ + mkdir build && \ + cd build && \ + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \ + make -j && \ + cmake --install . --config Release --prefix "/usr/local" + +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/" + +COPY . . +ARG GIT_REPO_CHECK +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi + +ENV VLLM_TARGET_DEVICE=xpu + +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,source=.git,target=.git \ + python3 setup.py install + +CMD ["/bin/bash"] + +FROM vllm-base AS vllm-openai + +# install additional dependencies for openai api server +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install accelerate hf_transfer 'modelscope!=1.15.0' + +ENV VLLM_USAGE_SOURCE production-docker-image \ + TRITON_XPU_PROFILE 1 + +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/vllm-v0.6.2/LICENSE b/vllm-v0.6.2/LICENSE new file mode 100644 index 0000000..2a047d6 --- /dev/null +++ b/vllm-v0.6.2/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Modifications made by Cambricon Technologies Corporation Limited. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vllm-v0.6.2/MANIFEST.in b/vllm-v0.6.2/MANIFEST.in new file mode 100644 index 0000000..82be639 --- /dev/null +++ b/vllm-v0.6.2/MANIFEST.in @@ -0,0 +1,10 @@ +include LICENSE +include requirements-common.txt +include requirements-cuda.txt +include requirements-rocm.txt +include requirements-neuron.txt +include requirements-cpu.txt +include CMakeLists.txt + +recursive-include cmake * +recursive-include csrc * diff --git a/vllm-v0.6.2/README.md b/vllm-v0.6.2/README.md new file mode 100644 index 0000000..53749cb --- /dev/null +++ b/vllm-v0.6.2/README.md @@ -0,0 +1,140 @@ +

+ + + vLLM + +

+ +

+Easy, fast, and cheap LLM serving for everyone +

+ +

+| Documentation | Blog | Paper | Discord | Twitter/X | + +

+ + +--- + +**vLLM, AMD, Anyscale Meet & Greet at [Ray Summit 2024](http://raysummit.anyscale.com) (Monday, Sept 30th, 5-7pm PT) at Marriott Marquis San Francisco** + +We are excited to announce our special vLLM event in collaboration with AMD and Anyscale. +Join us to learn more about recent advancements of vLLM on MI300X. +Register [here](https://lu.ma/db5ld9n5) and be a part of the event! + +--- + +*Latest News* 🔥 +- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing). +- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing). +- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html). +- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing). +- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing). +- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing). +- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! 
Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing). +- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM. +- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai). + +--- +## About +vLLM is a fast and easy-to-use library for LLM inference and serving. + +vLLM is fast with: + +- State-of-the-art serving throughput +- Efficient management of attention key and value memory with **PagedAttention** +- Continuous batching of incoming requests +- Fast model execution with CUDA/HIP graph +- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8. +- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. +- Speculative decoding +- Chunked prefill + +**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)). 
+ +vLLM is flexible and easy to use with: + +- Seamless integration with popular Hugging Face models +- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more +- Tensor parallelism and pipeline parallelism support for distributed inference +- Streaming outputs +- OpenAI-compatible API server +- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron. +- Prefix caching support +- Multi-LoRA support + +vLLM seamlessly supports most popular open-source models on Hugging Face, including: +- Transformer-like LLMs (e.g., Llama) +- Mixture-of-Experts LLMs (e.g., Mixtral) +- Embedding Models (e.g., E5-Mistral) +- Multi-modal LLMs (e.g., LLaVA) + +Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html). + +## Getting Started + +Install vLLM with `pip` or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source): + +```bash +pip install vllm +``` + +Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more. +- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html) +- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html) +- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html) + +## Contributing + +We welcome and value any contributions and collaborations. +Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved. + +## Sponsors + +vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
+ + + + +- a16z +- AMD +- Anyscale +- AWS +- Crusoe Cloud +- Databricks +- DeepInfra +- Dropbox +- Google Cloud +- Lambda Lab +- NVIDIA +- Replicate +- Roblox +- RunPod +- Sequoia Capital +- Skywork AI +- Trainy +- UC Berkeley +- UC San Diego +- ZhenFund + +We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. + +## Citation + +If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180): +```bibtex +@inproceedings{kwon2023efficient, + title={Efficient Memory Management for Large Language Model Serving with PagedAttention}, + author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica}, + booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles}, + year={2023} +} +``` + +## Contact Us + +* For technical questions and feature requests, please use GitHub issues or discussions. +* For discussing with fellow users, please use Discord. +* For security disclosures, please use GitHub's security advisory feature. +* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu. \ No newline at end of file diff --git a/vllm-v0.6.2/SECURITY.md b/vllm-v0.6.2/SECURITY.md new file mode 100644 index 0000000..ad3f1f1 --- /dev/null +++ b/vllm-v0.6.2/SECURITY.md @@ -0,0 +1,11 @@ +# Security Policy + +## Reporting a Vulnerability + +If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. + +Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
+ +--- + +Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. diff --git a/vllm-v0.6.2/benchmarks/README.md b/vllm-v0.6.2/benchmarks/README.md new file mode 100644 index 0000000..2aa4a28 --- /dev/null +++ b/vllm-v0.6.2/benchmarks/README.md @@ -0,0 +1,19 @@ +# Benchmarking vLLM + +## Downloading the ShareGPT dataset + +You can download the dataset by running: +```bash +wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json +``` + +## Downloading the ShareGPT4V dataset + +The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts +will ignore a datapoint if the referred image is missing. +```bash +wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json +mkdir coco -p +wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip +unzip coco/train2017.zip -d coco/ +``` diff --git a/vllm-v0.6.2/benchmarks/backend_request_func.py b/vllm-v0.6.2/benchmarks/backend_request_func.py new file mode 100644 index 0000000..25c8b1b --- /dev/null +++ b/vllm-v0.6.2/benchmarks/backend_request_func.py @@ -0,0 +1,433 @@ +import json +import os +import sys +import time +import traceback +from dataclasses import dataclass, field +from typing import List, Optional, Union + +import aiohttp +import huggingface_hub.constants +from tqdm.asyncio import tqdm +from transformers import (AutoTokenizer, PreTrainedTokenizer, + PreTrainedTokenizerFast) + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) + + +@dataclass +class RequestFuncInput: + prompt: str + api_url: str + prompt_len: int + output_len: int + model: str + best_of: int = 1 + logprobs: Optional[int] = None + multi_modal_content: Optional[dict] = None + ignore_eos: bool = False + + +@dataclass +class RequestFuncOutput: + 
generated_text: str = "" + success: bool = False + latency: float = 0.0 + ttft: float = 0.0 # Time to first token + itl: List[float] = field( + default_factory=list) # List of inter-token latencies + prompt_len: int = 0 + error: str = "" + + +async def async_request_tgi( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith("generate_stream") + + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + params = { + "best_of": request_func_input.best_of, + "max_new_tokens": request_func_input.output_len, + "do_sample": True, + "temperature": 0.01, # TGI does not accept 0.0 temperature. + "top_p": 0.99, # TGI does not accept 1.0 top_p. + # TGI does not accept ignore_eos flag. + } + payload = { + "inputs": request_func_input.prompt, + "parameters": params, + } + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + chunk_bytes = chunk_bytes.decode("utf-8") + + #NOTE: Sometimes TGI returns a ping response without + # any data, we should skip it. 
+ if chunk_bytes.startswith(":"): + continue + chunk = chunk_bytes.removeprefix("data:") + + data = json.loads(chunk) + timestamp = time.perf_counter() + # First token + if ttft == 0.0: + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + most_recent_timestamp = timestamp + + output.latency = most_recent_timestamp - st + output.success = True + output.generated_text = data["generated_text"] + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_trt_llm( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith("generate_stream") + + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + assert request_func_input.best_of == 1 + payload = { + "accumulate_tokens": True, + "text_input": request_func_input.prompt, + "temperature": 0.0, + "top_p": 1.0, + "max_tokens": request_func_input.output_len, + "stream": True, + } + if request_func_input.ignore_eos: + payload["min_length"] = request_func_input.output_len + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data:") + + data = json.loads(chunk) + output.generated_text += data["text_output"] + timestamp = time.perf_counter() + # First token + if ttft == 0.0: + ttft = time.perf_counter() - st + output.ttft = ttft + + # 
Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + most_recent_timestamp = timestamp + + output.latency = most_recent_timestamp - st + output.success = True + + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_deepspeed_mii( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + assert request_func_input.best_of == 1 + + payload = { + "prompt": request_func_input.prompt, + "max_tokens": request_func_input.output_len, + "temperature": 0.01, # deepspeed-mii does not accept 0.0 temp. + "top_p": 1.0, + } + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024, + # will use 0 as placeholder. 
+ # See https://github.com/microsoft/DeepSpeed-MII/pull/311 + output.ttft = 0 + + st = time.perf_counter() + try: + async with session.post(url=request_func_input.api_url, + json=payload) as response: + if response.status == 200: + parsed_resp = await response.json() + output.latency = time.perf_counter() - st + output.generated_text = parsed_resp["text"][0] + output.success = True + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_openai_completions( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith( + ("completions", "profile") + ), "OpenAI Completions API URL must end with 'completions' or 'profile'." + + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + payload = { + "model": request_func_input.model, + "prompt": request_func_input.prompt, + "temperature": 0.0, + "best_of": request_func_input.best_of, + "max_tokens": request_func_input.output_len, + "logprobs": request_func_input.logprobs, + "stream": True, + "ignore_eos": request_func_input.ignore_eos, + } + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + } + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload, + headers=headers) as response: + if response.status == 200: + first_chunk_received = False + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") + if chunk == "[DONE]": + latency = time.perf_counter() - st + 
else: + data = json.loads(chunk) + + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated + if data["choices"][0]["text"]: + timestamp = time.perf_counter() + # First token + if not first_chunk_received: + first_chunk_received = True + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + most_recent_timestamp = timestamp + generated_text += data["choices"][0]["text"] + if first_chunk_received: + output.success = True + else: + output.success = False + output.error = ( + "Never received a valid chunk to calculate TTFT." + "This response will be marked as failed!") + output.generated_text = generated_text + output.latency = latency + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_openai_chat_completions( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith( + "chat/completions" + ), "OpenAI Chat Completions API URL must end with 'chat/completions'." 
+ + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + content = [{"type": "text", "text": request_func_input.prompt}] + if request_func_input.multi_modal_content: + content.append(request_func_input.multi_modal_content) + payload = { + "model": request_func_input.model, + "messages": [ + { + "role": "user", + "content": content + }, + ], + "temperature": 0.0, + "max_completion_tokens": request_func_input.output_len, + "stream": True, + "ignore_eos": request_func_input.ignore_eos, + } + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload, + headers=headers) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") + if chunk == "[DONE]": + latency = time.perf_counter() - st + else: + timestamp = time.perf_counter() + data = json.loads(chunk) + + delta = data["choices"][0]["delta"] + if delta.get("content", None): + # First token + if ttft == 0.0: + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + generated_text += delta["content"] + + most_recent_timestamp = timestamp + + output.generated_text = generated_text + output.success = True + output.latency = latency + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +def get_model(pretrained_model_name_or_path: str) -> str: + if 
os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': + from modelscope import snapshot_download + + model_path = snapshot_download( + model_id=pretrained_model_name_or_path, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"]) + + return model_path + return pretrained_model_name_or_path + + +def get_tokenizer( + pretrained_model_name_or_path: str, trust_remote_code: bool +) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + if pretrained_model_name_or_path is not None and not os.path.exists( + pretrained_model_name_or_path): + pretrained_model_name_or_path = get_model( + pretrained_model_name_or_path) + return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, + trust_remote_code=trust_remote_code) + + +ASYNC_REQUEST_FUNCS = { + "tgi": async_request_tgi, + "vllm": async_request_openai_completions, + "lmdeploy": async_request_openai_completions, + "deepspeed-mii": async_request_deepspeed_mii, + "openai": async_request_openai_completions, + "openai-chat": async_request_openai_chat_completions, + "tensorrt-llm": async_request_trt_llm, + "scalellm": async_request_openai_completions, + "sglang": async_request_openai_completions, +} diff --git a/vllm-v0.6.2/benchmarks/benchmark_latency.py b/vllm-v0.6.2/benchmarks/benchmark_latency.py new file mode 100644 index 0000000..8300fdf --- /dev/null +++ b/vllm-v0.6.2/benchmarks/benchmark_latency.py @@ -0,0 +1,217 @@ +"""Benchmark the latency of processing a single batch of requests.""" +import argparse +import dataclasses +import json +import time +from pathlib import Path +from typing import List, Optional +import math +import os +os.environ['CN_NOTIFIER_POOL_MAX'] = "1000" + +import numpy as np +import torch +from tqdm import tqdm +from common import init_logger +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.inputs import PromptType +from vllm.utils import FlexibleArgumentParser +from 
vllm_mlu._mlu_utils import USE_PAGED + +logger = init_logger(__name__) + +def main(args: argparse.Namespace): + print(args) + + engine_args = EngineArgs.from_cli_args(args) + + # NOTE(woosuk): If the request cannot be processed in a single batch, + # the engine will automatically process the request in multiple batches. + engine_args_dict_org = dataclasses.asdict(engine_args) + engine_args_dict = { + **engine_args_dict_org, + **{ + k: v + for k, v in engine_args.__dict__.items() if k not in engine_args_dict_org + } + } + + llm = LLM(**engine_args_dict, + enable_context_mlugraph=True, + context_batch_size_to_capture=args.batch_size, + context_seq_len_to_capture=args.input_len) + + num_gpu_block = llm.llm_engine.cache_config.num_gpu_blocks + block_size = llm.llm_engine.cache_config.block_size + max_num_batched_tokens = llm.llm_engine.scheduler_config.max_num_batched_tokens + batched_input_tokens = args.input_len * args.batch_size + batched_tokens_align = math.ceil((args.input_len + args.output_len) / \ + block_size) * block_size * args.batch_size + if not args.enable_chunked_prefill : + if max_num_batched_tokens < batched_input_tokens : + logger.error(f"The batch({args.batch_size}) * input length({args.input_len}) =" + f" ({batched_input_tokens}) is larger than " + f" max_num_batched_tokens({max_num_batched_tokens})") + logger.info(f"Try --max-num-batched-tokens ({batched_input_tokens})") + return + elif num_gpu_block * block_size < batched_tokens_align : + logger.error(f"Ceil of batch({args.batch_size}) * (input length" + f" ({args.input_len}) + output length({args.output_len})) =" + f" ({batched_tokens_align}) is larger than" + f" mlu blocks({num_gpu_block}) * block_size({block_size}) =" + f" ({num_gpu_block * block_size}) can hold max tokens.") + if not USE_PAGED : + logger.info("Try reduce block_size to make mlu blocks greater than batch," + " or try increase -tp to get more mlu blocks.") + else : + logger.info("Try increase -tp to get more mlu blocks.") + return 
+ # Generate a warning if the sum of the input length and output length + # is less than the maximum model length, as only the first + # `max_model_len` will be processed. + max_length = args.input_len + args.output_len + max_model_len = llm.llm_engine.model_config.max_model_len + if max_length > max_model_len: + logger.warning( + f"The sum of input length({args.input_len}) and output" + f" length({args.output_len}) is larger than max model" + f" length({max_model_len})") + + sampling_params = SamplingParams( + n=args.n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=args.output_len, + ) + print(sampling_params) + dummy_prompt_token_ids = np.random.randint(10000, + size=(args.batch_size, + args.input_len)) + dummy_prompts: List[PromptType] = [{ + "prompt_token_ids": batch + } for batch in dummy_prompt_token_ids.tolist()] + + def run_to_completion(profile_dir: Optional[str] = None): + if profile_dir: + with torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + on_trace_ready=torch.profiler.tensorboard_trace_handler( + str(profile_dir))) as p: + llm.generate(dummy_prompts, + sampling_params=sampling_params, + use_tqdm=False) + print(p.key_averages()) + else: + start_time = time.perf_counter() + llm.generate(dummy_prompts, + sampling_params=sampling_params, + use_tqdm=False) + end_time = time.perf_counter() + latency = end_time - start_time + return latency + + print("Warming up...") + for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): + run_to_completion(profile_dir=None) + + if args.profile: + profile_dir = args.profile_result_dir + if not profile_dir: + profile_dir = Path( + "." + ) / "vllm_benchmark_result" / f"latency_result_{time.time()}" + print(f"Profiling (results will be saved to '{profile_dir}')...") + run_to_completion(profile_dir=profile_dir) + return + + # Benchmark. 
+ latencies = [] + for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): + latencies.append(run_to_completion(profile_dir=None)) + if args.show_per_iter: + llm.get_metrics(args.num_iters_warmup, + args.only_average, + args.input_len, + args.output_len, + args.tensor_parallel_size, + args.quantization, + llm.dump_info, + show_per_iter=args.show_per_iter) + latencies = np.array(latencies) + percentages = [10, 25, 50, 75, 90, 99] + percentiles = np.percentile(latencies, percentages) + print(f'Avg latency: {np.mean(latencies)} seconds') + for percentage, percentile in zip(percentages, percentiles): + print(f'{percentage}% percentile latency: {percentile} seconds') + + # Output JSON results if specified + if args.output_json: + results = { + "avg_latency": np.mean(latencies), + "latencies": latencies.tolist(), + "percentiles": dict(zip(percentages, percentiles.tolist())), + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + + llm.get_metrics(args.num_iters_warmup, + args.only_average, + args.input_len, + args.output_len, + args.tensor_parallel_size, + args.quantization, + llm.dump_info) + + +if __name__ == '__main__': + parser = FlexibleArgumentParser( + description='Benchmark the latency of processing a single batch of ' + 'requests till completion.') + parser.add_argument('--input-len', type=int, default=32) + parser.add_argument('--output-len', type=int, default=128) + parser.add_argument('--batch-size', type=int, default=8) + parser.add_argument('--n', + type=int, + default=1, + help='Number of generated sequences per prompt.') + parser.add_argument('--use-beam-search', action='store_true') + parser.add_argument('--num-iters-warmup', + type=int, + default=10, + help='Number of iterations to run for warmup.') + parser.add_argument('--num-iters', + type=int, + default=30, + help='Number of iterations to run.') + parser.add_argument( + '--profile', + action='store_true', + help='profile the generation process of a single batch') 
+ parser.add_argument( + '--profile-result-dir', + type=str, + default=None, + help=('path to save the pytorch profiler output. Can be visualized ' + 'with ui.perfetto.dev or Tensorboard.')) + parser.add_argument( + '--output-json', + type=str, + default=None, + help='Path to save the latency results in JSON format.') + parser.add_argument('--only-average', + action='store_true', + default=False, + help=( + 'Show all iteration metrics or average metrics.' + )) + parser.add_argument("--show-per-iter", + action='store_true', + help='If true, show metrics data per iteration.') + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + main(args) diff --git a/vllm-v0.6.2/benchmarks/benchmark_latency_multiple.py b/vllm-v0.6.2/benchmarks/benchmark_latency_multiple.py new file mode 100644 index 0000000..ca834ae --- /dev/null +++ b/vllm-v0.6.2/benchmarks/benchmark_latency_multiple.py @@ -0,0 +1,266 @@ +"""Benchmark the latency of processing a single batch of requests.""" +import argparse +import dataclasses +import json +import time +from pathlib import Path +from typing import List, Optional +import math +import os +os.environ['CN_NOTIFIER_POOL_MAX'] = "1000" + +import numpy as np +import torch +from tqdm import tqdm +from common import init_logger +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.inputs import PromptType +from vllm.utils import FlexibleArgumentParser +from vllm_mlu._mlu_utils import USE_PAGED, VLLM_DUMP_MLU_INFO_EN +from vllm_mlu.dump_info import LLMDumpInfo + +logger = init_logger(__name__) + +def main(args: argparse.Namespace): + print(args) + + # Only support input case list + assert len(args.input_case_list) > 0, "Only support input case list." 
+ + new_case_list = [] + max_model_len = 0 + max_num_batched_tokens = 0 + for case in args.input_case_list: + case_info = case.split(",") + assert len(case_info) == 3 + batch_size, input_len, output_len = [int(v) for v in case_info] + new_case_list.append((batch_size, input_len, output_len)) + + cur_max_model_len = input_len + output_len + if cur_max_model_len > max_model_len: + max_model_len = cur_max_model_len + cur_max_num_batched_tokens = batch_size * input_len + if cur_max_num_batched_tokens > max_num_batched_tokens: + max_num_batched_tokens = cur_max_num_batched_tokens + + if max_num_batched_tokens < max_model_len: + max_num_batched_tokens = max_model_len + + args.max_model_len = max_model_len + args.max_num_batched_tokens = max_num_batched_tokens + args.max_seq_len_to_capture = max_model_len + if not USE_PAGED: + args.block_size = max_model_len + logger.warning(f"For unpaged mode, we must choose the max-scale to set block_size, " + + f"which may decreases the concurrency of small-scale.") + + engine_args = EngineArgs.from_cli_args(args) + + # NOTE(woosuk): If the request cannot be processed in a single batch, + # the engine will automatically process the request in multiple batches. 
+ llm = LLM(**dataclasses.asdict(engine_args), + enable_context_mlugraph=True, + context_batch_size_to_capture=new_case_list[0][0], + context_seq_len_to_capture=new_case_list[0][1]) + + if VLLM_DUMP_MLU_INFO_EN: + LLM.dump_info.dev_info.should_stop = True + + for batch_size, input_len, output_len in new_case_list: + + print("\n" + f"#" * 60 + "\n" + \ + f"# Benchmark: batch_size={batch_size}, input_len={input_len}, output_len={output_len} #\n" + \ + f"#" * 60 + "\n") + + # Re-Start dump info + LLM.dump_info = LLMDumpInfo() + LLM.dump_info.init_param( + tensor_parallel_size=args.tensor_parallel_size, dtype=args.dtype, + kv_cache_dtype=args.kv_cache_dtype, + quantization=args.quantization, + model=args.model, trust_remote_code=args.trust_remote_code + ) + LLM.dump_info.memory_usage() + + # Reset metrics + llm.metric.reset_metric() + + # Re-capture model for context and decoder mlugraph + llm.llm_engine.model_executor.recapture_model(batch_size, input_len) + + # Run current case + num_gpu_block = llm.llm_engine.cache_config.num_gpu_blocks + block_size = llm.llm_engine.cache_config.block_size + max_num_batched_tokens = llm.llm_engine.scheduler_config.max_num_batched_tokens + batched_input_tokens = input_len * batch_size + batched_tokens_align = math.ceil((input_len + output_len) / \ + block_size) * block_size * batch_size + if not args.enable_chunked_prefill : + if max_num_batched_tokens < batched_input_tokens : + logger.error(f"The batch({batch_size}) * input length({input_len}) =" + f" ({batched_input_tokens}) is larger than " + f" max_num_batched_tokens({max_num_batched_tokens})") + logger.info(f"Try --max-num-batched-tokens ({batched_input_tokens})") + return + elif num_gpu_block * block_size < batched_tokens_align : + logger.error(f"Ceil of batch({batch_size}) * (input length" + f" ({input_len}) + output length({output_len})) =" + f" ({batched_tokens_align}) is larger than" + f" mlu blocks({num_gpu_block}) * block_size({block_size}) =" + f" ({num_gpu_block * 
block_size}) can hold max tokens.") + if not USE_PAGED : + logger.info("Try reduce block_size to make mlu blocks greater than batch," + " or try increase -tp to get more mlu blocks.") + else : + logger.info("Try increase -tp to get more mlu blocks.") + return + # Generate a warning if the sum of the input length and output length + # is less than the maximum model length, as only the first + # `max_model_len` will be processed. + max_length = input_len + output_len + max_model_len = llm.llm_engine.model_config.max_model_len + if max_length > max_model_len: + logger.warning( + f"The sum of input length({input_len}) and output" + f" length({output_len}) is larger than max model" + f" length({max_model_len})") + + sampling_params = SamplingParams( + n=args.n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=output_len, + ) + print(sampling_params) + dummy_prompt_token_ids = np.random.randint(10000, + size=(batch_size, + input_len)) + dummy_prompts: List[PromptType] = [{ + "prompt_token_ids": batch + } for batch in dummy_prompt_token_ids.tolist()] + + def run_to_completion(profile_dir: Optional[str] = None): + if profile_dir: + with torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.MLU, + ], + on_trace_ready=torch.profiler.tensorboard_trace_handler( + str(profile_dir))) as p: + llm.generate(dummy_prompts, + sampling_params=sampling_params, + use_tqdm=False) + print(p.key_averages()) + else: + start_time = time.perf_counter() + llm.generate(dummy_prompts, + sampling_params=sampling_params, + use_tqdm=False) + end_time = time.perf_counter() + latency = end_time - start_time + return latency + + print("Warming up...") + for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): + run_to_completion(profile_dir=None) + + if args.profile: + profile_dir = args.profile_result_dir + if not profile_dir: + profile_dir = Path( + "." 
if __name__ == '__main__':
    # Command-line entry point of the multi-case latency benchmark.
    parser = FlexibleArgumentParser(
        description='Benchmark the latency of processing a single batch of '
        'requests till completion.')
    # Each case is one shell word of three comma-separated integers; main()
    # splits every entry on "," and converts the parts with int().  The help
    # text spells out that exact syntax (it used to describe a Python-style
    # list of tuples, which the parser does not accept).
    parser.add_argument('--input-case-list',
                        nargs='+',
                        default=['8,32,128'],
                        help='Benchmark cases, each written as '
                        '"batch_size,input_len,output_len" and separated by '
                        'spaces, e.g. --input-case-list 8,32,128 16,64,256.')
    parser.add_argument('--n',
                        type=int,
                        default=1,
                        help='Number of generated sequences per prompt.')
    parser.add_argument('--use-beam-search', action='store_true')
    parser.add_argument('--num-iters-warmup',
                        type=int,
                        default=10,
                        help='Number of iterations to run for warmup.')
    parser.add_argument('--num-iters',
                        type=int,
                        default=30,
                        help='Number of iterations to run.')
    parser.add_argument(
        '--profile',
        action='store_true',
        help='profile the generation process of a single batch')
    parser.add_argument(
        '--profile-result-dir',
        type=str,
        default=None,
        help=('path to save the pytorch profiler output. Can be visualized '
              'with ui.perfetto.dev or Tensorboard.'))
    parser.add_argument(
        '--output-json',
        type=str,
        default=None,
        help='Path to save the latency results in JSON format.')
    parser.add_argument('--only-average',
                        action='store_true',
                        default=False,
                        help=(
                            'Show all iteration metrics or average metrics.'
                        ))
    parser.add_argument("--show-per-iter",
                        action='store_true',
                        help='If true, show metrics data per iteration.')
    # Engine options come last so they follow the benchmark options in --help.
    parser = EngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    main(args)
+ python benchmark_prefix_caching.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json \ + --enable-prefix-caching \ + --num-prompts 20 \ + --repeat-count 5 \ + --input-length-range 128:256 +""" + +import dataclasses +import json +import random +import time +from typing import List, Optional, Tuple + +from transformers import PreTrainedTokenizerBase + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import FlexibleArgumentParser + +try: + from vllm.transformers_utils.tokenizer import get_tokenizer +except ImportError: + from backend_request_func import get_tokenizer + +PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. 
def sample_requests(
    dataset_path: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    input_length_range: Tuple[int, int],
    fixed_output_len: Optional[int],
) -> List[Tuple[str, int, int]]:
    """Sample prompts of a bounded token length from a ShareGPT-style dataset.

    Args:
        dataset_path: Path to a JSON file holding a list of records, each with
            a ``"conversations"`` list of ``{"value": ...}`` turns.
        num_requests: Maximum number of requests to return.
        tokenizer: Callable returning an object with ``input_ids`` for a text.
        input_length_range: Inclusive ``(min_len, max_len)`` bounds on the
            prompt length, in tokens.
        fixed_output_len: When not ``None``, used as the output length of
            every request instead of the tokenized completion length.

    Returns:
        Up to ``num_requests`` tuples ``(prompt, prompt_len, output_len)``,
        in shuffled dataset order.

    Raises:
        ValueError: If ``fixed_output_len`` is given and smaller than 4.
    """
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")

    # Read the dataset as UTF-8 explicitly (as benchmark_serving.py does) so
    # the result does not depend on the platform's default locale encoding.
    with open(dataset_path, encoding="utf-8") as f:
        dataset = json.load(f)
    # Keep the first two turns of every conversation that has at least two.
    dataset = [(data["conversations"][0]["value"],
                data["conversations"][1]["value"]) for data in dataset
               if len(data["conversations"]) >= 2]

    # Shuffle the dataset.
    random.shuffle(dataset)

    min_len, max_len = input_length_range

    filtered_dataset: List[Tuple[str, int, int]] = []
    for prompt, completion in dataset:
        if len(filtered_dataset) == num_requests:
            break

        prompt_len = len(tokenizer(prompt).input_ids)
        # Only tokenize the completion when its length is actually needed.
        if fixed_output_len is None:
            output_len = len(tokenizer(completion).input_ids)
        else:
            output_len = fixed_output_len
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            continue
        if min_len <= prompt_len <= max_len:
            filtered_dataset.append((prompt, prompt_len, output_len))

    return filtered_dataset
parser.add_argument('--repeat-count', + type=int, + default=100, + help='Number of times to repeat each prompt') + parser.add_argument('--sort', + action='store_true', + help='Sort prompts by input length') + parser.add_argument('--input-length-range', + type=str, + default='128:256', + help='Range of input lengths for sampling prompts,' + 'specified as "min:max" (e.g., "128:256").') + + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + main(args) diff --git a/vllm-v0.6.2/benchmarks/benchmark_prioritization.py b/vllm-v0.6.2/benchmarks/benchmark_prioritization.py new file mode 100644 index 0000000..e0c9e6a --- /dev/null +++ b/vllm-v0.6.2/benchmarks/benchmark_prioritization.py @@ -0,0 +1,177 @@ +"""Benchmark offline prioritization.""" +import argparse +import dataclasses +import json +import random +import time +from typing import List, Optional, Tuple + +from transformers import AutoTokenizer, PreTrainedTokenizerBase + +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def sample_requests( + dataset_path: str, + num_requests: int, + tokenizer: PreTrainedTokenizerBase, + fixed_output_len: Optional[int], +) -> List[Tuple[str, int, int]]: + if fixed_output_len is not None and fixed_output_len < 4: + raise ValueError("output_len too small") + + # Load the dataset. + with open(dataset_path) as f: + dataset = json.load(f) + # Filter out the conversations with less than 2 turns. + dataset = [data for data in dataset if len(data["conversations"]) >= 2] + # Only keep the first two turns of each conversation. + dataset = [(data["conversations"][0]["value"], + data["conversations"][1]["value"]) for data in dataset] + + # Shuffle the dataset. + random.shuffle(dataset) + + # Filter out sequences that are too long or too short + filtered_dataset: List[Tuple[str, int, int]] = [] + for i in range(len(dataset)): + if len(filtered_dataset) == num_requests: + break + + # Tokenize the prompts and completions. 
def main(args: argparse.Namespace):
    """Run the offline prioritization benchmark and report its throughput.

    Requests are either sampled from ``args.dataset`` or, when no dataset is
    given, synthesized from ``args.input_len`` / ``args.output_len``.  Every
    request is a ``(prompt, prompt_len, output_len, priority)`` tuple, which
    is the shape ``run_vllm`` and the token accounting below unpack.

    Args:
        args: Parsed CLI namespace (benchmark options plus engine options).

    Raises:
        ValueError: If ``args.backend`` is not ``"vllm"``.
    """
    print(args)
    random.seed(args.seed)

    # Sample the requests.
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code)
    if args.dataset is None:
        # Synthesize a prompt with the given input length.
        prompt = "hi" * (args.input_len - 1)
        # Attach an equi-probable random priority (0 or 1) to each request,
        # exactly as sample_requests() does for dataset prompts.  Without the
        # fourth element the 4-way unpacking in run_vllm() and in the token
        # sum below raised ValueError for every synthetic run.
        requests = [(prompt, args.input_len, args.output_len,
                     0 if random.random() < 0.5 else 1)
                    for _ in range(args.num_prompts)]
    else:
        requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
                                   args.output_len)

    if args.backend == "vllm":
        elapsed_time = run_vllm(requests, args.n,
                                EngineArgs.from_cli_args(args))
    else:
        raise ValueError(f"Unknown backend: {args.backend}")
    total_num_tokens = sum(
        prompt_len + output_len
        for _, prompt_len, output_len, _priority in requests)
    print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
          f"{total_num_tokens / elapsed_time:.2f} tokens/s")

    # Output JSON results if specified
    if args.output_json:
        results = {
            "elapsed_time": elapsed_time,
            "num_requests": len(requests),
            "total_num_tokens": total_num_tokens,
            "requests_per_second": len(requests) / elapsed_time,
            "tokens_per_second": total_num_tokens / elapsed_time,
        }
        with open(args.output_json, "w") as f:
            json.dump(results, f, indent=4)
Overrides the " + "output length from the dataset.") + parser.add_argument("--n", + type=int, + default=1, + help="Number of generated sequences per prompt.") + parser.add_argument("--num-prompts", + type=int, + default=200, + help="Number of prompts to process.") + parser.add_argument( + '--output-json', + type=str, + default=None, + help='Path to save the throughput results in JSON format.') + + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + if args.tokenizer is None: + args.tokenizer = args.model + if args.dataset is None: + assert args.input_len is not None + assert args.output_len is not None + else: + assert args.input_len is None + + main(args) diff --git a/vllm-v0.6.2/benchmarks/benchmark_serving.py b/vllm-v0.6.2/benchmarks/benchmark_serving.py new file mode 100644 index 0000000..bdb8ea8 --- /dev/null +++ b/vllm-v0.6.2/benchmarks/benchmark_serving.py @@ -0,0 +1,1136 @@ +r"""Benchmark online serving throughput. + +On the server side, run one of the following commands: + vLLM OpenAI API server + vllm serve \ + --swap-space 16 \ + --disable-log-requests + + (TGI backend) + ./launch_tgi_server.sh + +On the client side, run: + python benchmarks/benchmark_serving.py \ + --backend \ + --model \ + --dataset-name sharegpt \ + --dataset-path \ + --request-rate \ # By default is inf + --num-prompts # By default is 1000 + + when using tgi backend, add + --endpoint /generate_stream + to the end of the command above. 
@dataclass
class BenchmarkMetrics:
    """Aggregate results of one serving benchmark run.

    Instances are built by ``calculate_metrics``.  Fields ending in ``_ms``
    are in milliseconds (the seconds-based samples are multiplied by 1000
    there).  Each ``percentiles_*`` field is a list of
    ``(percentile, value_ms)`` pairs, one per selected percentile.
    """
    # Number of requests whose output was marked successful.
    completed: int
    # Prompt tokens summed over successful requests.
    total_input: int
    # Output tokens summed over all requests (failed requests count as 0).
    total_output: int
    # Successful requests per second of benchmark duration.
    request_throughput: float
    # Requests per second that also met every configured SLO.
    request_goodput: float
    # Output tokens per second.
    output_throughput: float
    # Input plus output tokens per second.
    total_token_throughput: float
    # TTFT statistics (presumably "time to first token"; taken from each
    # output's `ttft` value).
    mean_ttft_ms: float
    median_ttft_ms: float
    std_ttft_ms: float
    percentiles_ttft_ms: List[Tuple[float, float]]
    # TPOT statistics: (latency - ttft) / (output_len - 1), computed only
    # for requests that produced more than one output token.
    mean_tpot_ms: float
    median_tpot_ms: float
    std_tpot_ms: float
    percentiles_tpot_ms: List[Tuple[float, float]]
    # ITL statistics over the concatenated `itl` samples of all successful
    # requests (presumably inter-token latencies).
    mean_itl_ms: float
    median_itl_ms: float
    std_itl_ms: float
    percentiles_itl_ms: List[Tuple[float, float]]
    # E2EL stands for end-to-end latency per request.
    # It is the time taken on the client side from sending
    # a request to receiving a complete response.
    mean_e2el_ms: float
    median_e2el_ms: float
    std_e2el_ms: float
    percentiles_e2el_ms: List[Tuple[float, float]]
def sample_sonnet_requests(
    dataset_path: str,
    num_requests: int,
    input_len: int,
    output_len: int,
    prefix_len: int,
    tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, str, int, int, None]]:
    """Build chat prompts out of poem lines with a shared fixed prefix.

    Every prompt is the base instruction, followed by the first
    ``num_prefix_lines`` lines of the file (identical for all requests) and
    then randomly chosen lines, so that the chat-formatted prompt is roughly
    ``input_len`` tokens long.

    Args:
        dataset_path: Path to a UTF-8 text file with one poem line per line.
        num_requests: Number of requests to generate.
        input_len: Approximate target length of the formatted prompt, tokens.
        output_len: Output length recorded for every request.
        prefix_len: Approximate number of leading tokens shared by all prompts.
        tokenizer: Tokenizer providing ``__call__`` and ``apply_chat_template``.

    Returns:
        A list of ``(prompt, prompt_formatted, prompt_len, output_len, None)``
        tuples; the last element is the (absent) multi-modal content.

    Raises:
        AssertionError: If the dataset is empty, or the length arguments are
            inconsistent with each other or with the base prompt length.
    """
    assert (
        input_len > prefix_len
    ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."

    # Load the dataset.
    with open(dataset_path, encoding='utf-8') as f:
        poem_lines = f.readlines()
    # Fail with a clear message instead of a ZeroDivisionError below.
    assert poem_lines, f"Sonnet dataset '{dataset_path}' contains no lines."

    # Tokenize the poem lines.
    poem_token_ids = tokenizer(poem_lines).input_ids
    average_poem_len = sum(
        len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids)

    # Base prefix for all requests.
    base_prompt = "Pick as many lines as you can from these poem lines:\n"
    base_message = [{
        "role": "user",
        "content": base_prompt,
    }]
    base_prompt_formatted = tokenizer.apply_chat_template(
        base_message, add_generation_prompt=True, tokenize=False)
    base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids)

    assert (
        input_len > base_prompt_offset
    ), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}."
    num_input_lines = round(
        (input_len - base_prompt_offset) / average_poem_len)

    # First approximately `prefix_len` number of tokens in the
    # prompt are fixed poem lines.
    assert (
        prefix_len > base_prompt_offset
    ), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}."

    num_prefix_lines = round(
        (prefix_len - base_prompt_offset) / average_poem_len)
    prefix_lines = poem_lines[:num_prefix_lines]
    # Same for every request, so compute it once outside the loop.
    num_lines_needed = num_input_lines - num_prefix_lines

    # Sample the rest of lines per request.
    sampled_requests: List[Tuple[str, str, int, int, None]] = []
    for _ in range(num_requests):
        sampled_lines = "".join(prefix_lines +
                                random.choices(poem_lines, k=num_lines_needed))

        prompt = f"{base_prompt}{sampled_lines}"
        message = [
            {
                "role": "user",
                "content": prompt,
            },
        ]
        prompt_formatted = tokenizer.apply_chat_template(
            message, add_generation_prompt=True, tokenize=False)
        prompt_len = len(tokenizer(prompt_formatted).input_ids)
        sampled_requests.append(
            (prompt, prompt_formatted, prompt_len, output_len, None))

    return sampled_requests
def sample_random_requests(
    prefix_len: int,
    input_len: int,
    output_len: int,
    num_prompts: int,
    range_ratio: float,
    tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, int, int, None]]:
    """Generate synthetic prompts made of pseudo-random token ids.

    All prompts start with the same ``prefix_len`` random token ids, followed
    by a run of consecutive token ids (modulo the vocabulary size) starting
    at a per-request random offset.

    Args:
        prefix_len: Number of random token ids shared by every prompt.
        input_len: Upper bound (inclusive) of the per-request input length,
            not counting the prefix.
        output_len: Upper bound (inclusive) of the per-request output length.
        num_prompts: Number of requests to generate.
        range_ratio: Lower bounds are ``int(input_len * range_ratio)`` and
            ``int(output_len * range_ratio)``.
        tokenizer: Object providing ``vocab_size`` and ``decode``.

    Returns:
        A list of ``(prompt, prefix_len + input_len_i, output_len_i, None)``
        tuples; the lengths are plain ``int`` and the last element is the
        (absent) multi-modal content.
    """
    vocab_size = tokenizer.vocab_size
    # The four draws below stay in this order so that results remain
    # reproducible for a given numpy seed.
    prefix_token_ids = np.random.randint(0, vocab_size,
                                         size=prefix_len).tolist()

    input_lens = np.random.randint(
        int(input_len * range_ratio),
        input_len + 1,
        size=num_prompts,
    )
    output_lens = np.random.randint(
        int(output_len * range_ratio),
        output_len + 1,
        size=num_prompts,
    )
    offsets = np.random.randint(0, vocab_size, size=num_prompts)

    input_requests: List[Tuple[str, int, int, None]] = []
    for i in range(num_prompts):
        # Convert the numpy scalars once so decode() receives plain ints.
        start = int(offsets[i]) + i
        cur_input_len = int(input_lens[i])
        body_token_ids = [(start + j) % vocab_size
                          for j in range(cur_input_len)]
        prompt = tokenizer.decode(prefix_token_ids + body_token_ids)

        input_requests.append(
            (prompt, prefix_len + cur_input_len, int(output_lens[i]), None))

    return input_requests
def calculate_metrics(
    input_requests: List[Tuple[str, int, int]],
    outputs: List[RequestFuncOutput],
    dur_s: float,
    tokenizer: PreTrainedTokenizerBase,
    selected_percentile_metrics: List[str],
    selected_percentiles: List[float],
    gootput_config_dict: Dict[str, float],
) -> Tuple[BenchmarkMetrics, List[int]]:
    """Aggregate per-request outputs into a ``BenchmarkMetrics`` summary.

    Args:
        input_requests: Requests in the order they were sent; element 1 of
            each tuple is the prompt length in tokens.
        outputs: One result per request, index-aligned with
            ``input_requests``.
        dur_s: Benchmark duration in seconds.
        tokenizer: Used to count the tokens of each generated text.
        selected_percentile_metrics: Accepted for interface compatibility;
            not read by this function.
        selected_percentiles: Percentiles to report for every latency metric.
        gootput_config_dict: Optional SLOs in milliseconds keyed by ``"ttft"``,
            ``"tpot"`` and/or ``"e2el"``.

    Returns:
        ``(metrics, actual_output_lens)`` where ``actual_output_lens`` holds
        the output token count of every request (0 for failed ones).
    """
    actual_output_lens: List[int] = []
    total_input = 0
    completed = 0
    ttfts: List[float] = []
    tpots: List[float] = []
    all_tpots: List[float] = []
    itls: List[float] = []
    e2els: List[float] = []

    for idx, output in enumerate(outputs):
        if not output.success:
            actual_output_lens.append(0)
            continue

        # We use the tokenizer to count the number of output tokens for all
        # serving backends instead of looking at len(output.itl) since
        # multiple output tokens may be bundled together
        # Note : this may inflate the output token count slightly
        output_len = len(
            tokenizer(output.generated_text,
                      add_special_tokens=False).input_ids)
        actual_output_lens.append(output_len)
        total_input += input_requests[idx][1]
        if output_len > 1:
            tpot = (output.latency - output.ttft) / (output_len - 1)
            tpots.append(tpot)
        else:
            # Note: if output_len <= 1, we regard tpot as 0 for goodput
            tpot = 0
        all_tpots.append(tpot)
        itls.extend(output.itl)
        ttfts.append(output.ttft)
        e2els.append(output.latency)
        completed += 1

    good_completed = 0
    if gootput_config_dict:
        # Pair every configured SLO (converted from ms to s) with the
        # per-request samples it constrains, in ttft / tpot / e2el order.
        constrained = [
            (samples,
             gootput_config_dict[key] / MILLISECONDS_TO_SECONDS_CONVERSION)
            for key, samples in (("ttft", ttfts), ("tpot", all_tpots),
                                 ("e2el", e2els))
            if key in gootput_config_dict
        ]
        slo_values = [slo for _, slo in constrained]
        for req_metric in zip(*(samples for samples, _ in constrained)):
            # A request is "good" when it meets every configured SLO.
            if all(slo >= observed
                   for slo, observed in zip(slo_values, req_metric)):
                good_completed += 1

    if completed == 0:
        warnings.warn(
            "All requests failed. This is likely due to a misconfiguration "
            "on the benchmark arguments.",
            stacklevel=2)

    def _stats_ms(samples):
        # Empty sample lists (e.g. ttfts when the backend does not stream)
        # are summarized as a single 0.
        data = samples or 0
        return (
            np.mean(data) * 1000,
            np.std(data) * 1000,
            np.median(data) * 1000,
            [(p, np.percentile(data, p) * 1000)
             for p in selected_percentiles],
        )

    mean_ttft, std_ttft, median_ttft, pct_ttft = _stats_ms(ttfts)
    mean_tpot, std_tpot, median_tpot, pct_tpot = _stats_ms(tpots)
    mean_itl, std_itl, median_itl, pct_itl = _stats_ms(itls)
    mean_e2el, std_e2el, median_e2el, pct_e2el = _stats_ms(e2els)
    total_output = sum(actual_output_lens)

    metrics = BenchmarkMetrics(
        completed=completed,
        total_input=total_input,
        total_output=total_output,
        request_throughput=completed / dur_s,
        request_goodput=good_completed / dur_s,
        output_throughput=total_output / dur_s,
        total_token_throughput=(total_input + total_output) / dur_s,
        mean_ttft_ms=mean_ttft,
        std_ttft_ms=std_ttft,
        median_ttft_ms=median_ttft,
        percentiles_ttft_ms=pct_ttft,
        mean_tpot_ms=mean_tpot,
        std_tpot_ms=std_tpot,
        median_tpot_ms=median_tpot,
        percentiles_tpot_ms=pct_tpot,
        mean_itl_ms=mean_itl,
        std_itl_ms=std_itl,
        median_itl_ms=median_itl,
        percentiles_itl_ms=pct_itl,
        mean_e2el_ms=mean_e2el,
        std_e2el_ms=std_e2el,
        median_e2el_ms=median_e2el,
        percentiles_e2el_ms=pct_e2el,
    )

    return metrics, actual_output_lens
List[str], + ignore_eos: bool, + gootput_config_dict: Dict[str, float], + max_concurrency: Optional[int], +): + if backend in ASYNC_REQUEST_FUNCS: + request_func = ASYNC_REQUEST_FUNCS[backend] + else: + raise ValueError(f"Unknown backend: {backend}") + + print("Starting initial single prompt test run...") + test_prompt, test_prompt_len, test_output_len, test_mm_content = ( + input_requests[0]) + if backend != "openai-chat" and test_mm_content is not None: + # multi-modal benchmark is only available on OpenAI Chat backend. + raise ValueError( + "Multi-modal content is only supported on 'openai-chat' backend.") + test_input = RequestFuncInput( + model=model_id, + prompt=test_prompt, + api_url=api_url, + prompt_len=test_prompt_len, + output_len=test_output_len, + logprobs=logprobs, + best_of=best_of, + multi_modal_content=test_mm_content, + ignore_eos=ignore_eos, + ) + test_output = await request_func(request_func_input=test_input) + if not test_output.success: + raise ValueError( + "Initial test run failed - Please make sure benchmark arguments " + f"are correctly specified. Error: {test_output.error}") + else: + print("Initial test run completed. 
Starting main benchmark run...") + + if profile: + print("Starting profiler...") + profile_input = RequestFuncInput(model=model_id, + prompt=test_prompt, + api_url=base_url + "/start_profile", + prompt_len=test_prompt_len, + output_len=test_output_len, + logprobs=logprobs, + best_of=best_of, + multi_modal_content=test_mm_content, + ignore_eos=ignore_eos) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler started") + + if burstiness == 1.0: + distribution = "Poisson process" + else: + distribution = "Gamma distribution" + + print(f"Traffic request rate: {request_rate}") + print(f"Burstiness factor: {burstiness} ({distribution})") + print(f"Maximum request concurrency: {max_concurrency}") + + pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + + # This can be used once the minimum Python version is 3.10 or higher, + # and it will simplify the code in limited_request_func. + # semaphore = (asyncio.Semaphore(max_concurrency) + # if max_concurrency else contextlib.nullcontext()) + semaphore = (asyncio.Semaphore(max_concurrency) + if max_concurrency else None) + + async def limited_request_func(request_func_input, pbar): + if semaphore is None: + return await request_func(request_func_input=request_func_input, + pbar=pbar) + async with semaphore: + return await request_func(request_func_input=request_func_input, + pbar=pbar) + + benchmark_start_time = time.perf_counter() + tasks: List[asyncio.Task] = [] + async for request in get_request(input_requests, request_rate, burstiness): + prompt, prompt_len, output_len, mm_content = request + request_func_input = RequestFuncInput(model=model_id, + prompt=prompt, + api_url=api_url, + prompt_len=prompt_len, + output_len=output_len, + logprobs=logprobs, + best_of=best_of, + multi_modal_content=mm_content, + ignore_eos=ignore_eos) + tasks.append( + asyncio.create_task( + limited_request_func(request_func_input=request_func_input, + pbar=pbar))) + 
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + + if profile: + print("Stopping profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_prompt, + api_url=base_url + "/stop_profile", + prompt_len=test_prompt_len, + output_len=test_output_len, + logprobs=logprobs, + best_of=best_of, + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler stopped") + + if pbar is not None: + pbar.close() + + benchmark_duration = time.perf_counter() - benchmark_start_time + + metrics, actual_output_lens = calculate_metrics( + input_requests=input_requests, + outputs=outputs, + dur_s=benchmark_duration, + tokenizer=tokenizer, + selected_percentile_metrics=selected_percentile_metrics, + selected_percentiles=selected_percentiles, + gootput_config_dict=gootput_config_dict, + ) + + print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) + print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", + benchmark_duration)) + print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) + print("{:<40} {:<10}".format("Total generated tokens:", + metrics.total_output)) + print("{:<40} {:<10.2f}".format("Request throughput (req/s):", + metrics.request_throughput)) + if gootput_config_dict: + print("{:<40} {:<10.2f}".format("Request goodput (req/s):", + metrics.request_goodput)) + print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", + metrics.output_throughput)) + print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", + metrics.total_token_throughput)) + + result = { + "duration": benchmark_duration, + "completed": metrics.completed, + "total_input_tokens": metrics.total_input, + "total_output_tokens": metrics.total_output, + "request_throughput": metrics.request_throughput, + "request_goodput:": + metrics.request_goodput if gootput_config_dict else None, + 
"output_throughput": metrics.output_throughput, + "total_token_throughput": metrics.total_token_throughput, + "input_lens": [output.prompt_len for output in outputs], + "output_lens": actual_output_lens, + "ttfts": [output.ttft for output in outputs], + "itls": [output.itl for output in outputs], + "generated_texts": [output.generated_text for output in outputs], + "errors": [output.error for output in outputs], + } + + def process_one_metric( + # E.g., "ttft" + metric_attribute_name: str, + # E.g., "TTFT" + metric_name: str, + # E.g., "Time to First Token" + metric_header: str, + ): + # This function prints and adds statistics of the specified + # metric. + if metric_attribute_name not in selected_percentile_metrics: + return + print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) + print("{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"))) + print("{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"))) + result[f"mean_{metric_attribute_name}_ms"] = getattr( + metrics, f"mean_{metric_attribute_name}_ms") + result[f"median_{metric_attribute_name}_ms"] = getattr( + metrics, f"median_{metric_attribute_name}_ms") + result[f"std_{metric_attribute_name}_ms"] = getattr( + metrics, f"std_{metric_attribute_name}_ms") + for p, value in getattr(metrics, + f"percentiles_{metric_attribute_name}_ms"): + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", + value)) + result[f"p{p_word}_{metric_attribute_name}_ms"] = value + + process_one_metric("ttft", "TTFT", "Time to First Token") + process_one_metric("tpot", "TPOT", + "Time per Output Token (excl. 
1st token)") + process_one_metric("itl", "ITL", "Inter-token Latency") + process_one_metric("e2el", "E2EL", "End-to-end Latency") + + print("=" * 50) + + return result + + +def check_goodput_args(args): + # Check and parse goodput arguments + gootput_config_dict = {} + VALID_NAMES = ["ttft", "tpot", "e2el"] + if args.goodput: + gootput_config_dict = parse_goodput(args.goodput) + for slo_name, slo_val in gootput_config_dict.items(): + if slo_name not in VALID_NAMES: + raise ValueError( + f"Invalid metric name found, {slo_name}: {slo_val}. " + "The service level objective name should be one of " + f"{str(VALID_NAMES)}. ") + if slo_val < 0: + raise ValueError( + f"Invalid value found, {slo_name}: {slo_val}. " + "The service level objective value should be " + "non-negative.") + return gootput_config_dict + + +def parse_goodput(slo_pairs): + gootput_config_dict = {} + try: + for slo_pair in slo_pairs: + slo_name, slo_val = slo_pair.split(":") + gootput_config_dict[slo_name] = float(slo_val) + except ValueError as err: + raise argparse.ArgumentTypeError( + "Invalid format found for service level objectives. " + "Specify service level objectives for goodput as \"KEY:VALUE\" " + "pairs, where the key is a metric name, and the value is a " + "number in milliseconds.") from err + return gootput_config_dict + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + np.random.seed(args.seed) + + backend = args.backend + model_id = args.model + tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model + + if args.base_url is not None: + api_url = f"{args.base_url}{args.endpoint}" + base_url = f"{args.base_url}" + else: + api_url = f"http://{args.host}:{args.port}{args.endpoint}" + base_url = f"http://{args.host}:{args.port}" + + tokenizer = get_tokenizer(tokenizer_id, + trust_remote_code=args.trust_remote_code) + + if args.dataset is not None: + warnings.warn( + "The '--dataset' argument will be deprecated in the next " + "release. 
Please use '--dataset-name' and " + "'--dataset-path' in the future runs.", + stacklevel=2) + input_requests = sample_sharegpt_requests( + dataset_path=args.dataset, + num_requests=args.num_prompts, + tokenizer=tokenizer, + fixed_output_len=args.sharegpt_output_len, + ) + + elif args.dataset_name == "sharegpt": + input_requests = sample_sharegpt_requests( + dataset_path=args.dataset_path, + num_requests=args.num_prompts, + tokenizer=tokenizer, + fixed_output_len=args.sharegpt_output_len, + ) + + elif args.dataset_name == "sonnet": + # Do not format the prompt, pass to message directly + if args.backend == "openai-chat": + input_requests = sample_sonnet_requests( + dataset_path=args.dataset_path, + num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + ) + input_requests = [(prompt, prompt_len, output_len, None) + for prompt, prompt_formatted, prompt_len, + output_len, _ in input_requests] + else: + assert ( + tokenizer.chat_template or tokenizer.default_chat_template + ), "Tokenizer/model must have chat template for sonnet dataset." 
+ input_requests = sample_sonnet_requests( + dataset_path=args.dataset_path, + num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + ) + input_requests = [(prompt_formatted, prompt_len, output_len, None) + for prompt, prompt_formatted, prompt_len, + output_len, _ in input_requests] + + elif args.dataset_name == "hf": + input_requests = sample_hf_requests( + dataset_path=args.dataset_path, + dataset_subset=args.hf_subset, + dataset_split=args.hf_split, + num_requests=args.num_prompts, + tokenizer=tokenizer, + random_seed=args.seed, + fixed_output_len=args.hf_output_len, + ) + + elif args.dataset_name == "random": + input_requests = sample_random_requests( + prefix_len=args.random_prefix_len, + input_len=args.random_input_len, + output_len=args.random_output_len, + num_prompts=args.num_prompts, + range_ratio=args.random_range_ratio, + tokenizer=tokenizer, + ) + + else: + raise ValueError(f"Unknown dataset: {args.dataset_name}") + + gootput_config_dict = check_goodput_args(args) + + benchmark_result = asyncio.run( + benchmark( + backend=backend, + api_url=api_url, + base_url=base_url, + model_id=model_id, + tokenizer=tokenizer, + input_requests=input_requests, + logprobs=args.logprobs, + best_of=args.best_of, + request_rate=args.request_rate, + burstiness=args.burstiness, + disable_tqdm=args.disable_tqdm, + profile=args.profile, + selected_percentile_metrics=args.percentile_metrics.split(","), + selected_percentiles=[ + float(p) for p in args.metric_percentiles.split(",") + ], + ignore_eos=args.ignore_eos, + gootput_config_dict=gootput_config_dict, + max_concurrency=args.max_concurrency, + )) + + # Save config and results to json + if args.save_result: + result_json: Dict[str, Any] = {} + + # Setup + current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") + result_json["date"] = current_dt + result_json["backend"] = backend + result_json["model_id"] = model_id 
+ result_json["tokenizer_id"] = tokenizer_id + result_json["best_of"] = args.best_of + result_json["num_prompts"] = args.num_prompts + + # Metadata + if args.metadata: + for item in args.metadata: + if "=" in item: + kvstring = item.split("=") + result_json[kvstring[0].strip()] = kvstring[1].strip() + else: + raise ValueError( + "Invalid metadata format. Please use KEY=VALUE format." + ) + + # Traffic + result_json["request_rate"] = ( + args.request_rate if args.request_rate < float("inf") else "inf") + result_json["burstiness"] = args.burstiness + result_json["max_concurrency"] = args.max_concurrency + + # Merge with benchmark result + result_json = {**result_json, **benchmark_result} + + # Save to file + base_model_id = model_id.split("/")[-1] + max_concurrency_str = (f"-concurrency{args.max_concurrency}" + if args.max_concurrency is not None else "") + file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa + if args.result_filename: + file_name = args.result_filename + if args.result_dir: + file_name = os.path.join(args.result_dir, file_name) + with open(file_name, "w", encoding='utf-8') as outfile: + json.dump(result_json, outfile) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the online serving throughput.") + parser.add_argument( + "--backend", + type=str, + default="vllm", + choices=list(ASYNC_REQUEST_FUNCS.keys()), + ) + parser.add_argument( + "--base-url", + type=str, + default=None, + help="Server or API base url if not using http host and port.", + ) + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument( + "--endpoint", + type=str, + default="/v1/completions", + help="API endpoint.", + ) + parser.add_argument( + "--dataset", + type=str, + default=None, + help="Path to the ShareGPT dataset, will be deprecated in the " + "next release.", + ) + parser.add_argument( + 
"--dataset-name", + type=str, + default="sharegpt", + choices=["sharegpt", "sonnet", "random", "hf"], + help="Name of the dataset to benchmark on.", + ) + parser.add_argument("--dataset-path", + type=str, + default=None, + help="Path to the sharegpt/sonnet dataset. " + "Or the huggingface dataset ID if using HF dataset.") + parser.add_argument( + "--max-concurrency", + type=int, + default=None, + help="Maximum number of concurrent requests. This can be used " + "to help simulate an environment where a higher level component " + "is enforcing a maximum number of concurrent requests. While the " + "--request-rate argument controls the rate at which requests are " + "initiated, this argument will control how many are actually allowed " + "to execute at a time. This means that when used in combination, the " + "actual request rate may be lower than specified with --request-rate, " + "if the server is not processing requests fast enough to keep up.") + + parser.add_argument( + "--model", + type=str, + required=True, + help="Name of the model.", + ) + parser.add_argument( + "--tokenizer", + type=str, + help= + "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + ) + parser.add_argument( + "--best-of", + type=int, + default=1, + help="Generates `best_of` sequences per prompt and " + "returns the best one.", + ) + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument( + "--num-prompts", + type=int, + default=1000, + help="Number of prompts to process.", + ) + parser.add_argument( + "--logprobs", + type=int, + default=None, + help=("Number of logprobs-per-token to compute & return as part of " + "the request. 
If unspecified, then either (1) if beam search " + "is disabled, no logprobs are computed & a single dummy " + "logprob is returned for each token; or (2) if beam search " + "is enabled 1 logprob per token is computed"), + ) + parser.add_argument( + "--request-rate", + type=float, + default=float("inf"), + help="Number of requests per second. If this is inf, " + "then all the requests are sent at time 0. " + "Otherwise, we use Poisson process or gamma distribution " + "to synthesize the request arrival times.", + ) + parser.add_argument( + "--burstiness", + type=float, + default=1.0, + help="Burstiness factor of the request generation. " + "Only take effect when request_rate is not inf. " + "Default value is 1, which follows Poisson process. " + "Otherwise, the request intervals follow a gamma distribution. " + "A lower burstiness value (0 < burstiness < 1) results in more " + "bursty requests. A higher burstiness value (burstiness > 1) " + "results in a more uniform arrival of requests.", + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code from huggingface", + ) + parser.add_argument( + "--disable-tqdm", + action="store_true", + help="Specify to disable tqdm progress bar.", + ) + parser.add_argument( + "--profile", + action="store_true", + help="Use Torch Profiler. The endpoint must be launched with " + "VLLM_TORCH_PROFILER_DIR to enable profiler.", + ) + parser.add_argument( + "--save-result", + action="store_true", + help="Specify to save benchmark results to a json file", + ) + parser.add_argument( + "--metadata", + metavar="KEY=VALUE", + nargs="*", + help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) " + "for metadata of this run to be saved in the result JSON file " + "for record keeping purposes.", + ) + parser.add_argument( + "--result-dir", + type=str, + default=None, + help="Specify directory to save benchmark json results." 
+ "If not specified, results are saved in the current directory.", + ) + parser.add_argument( + "--result-filename", + type=str, + default=None, + help="Specify the filename to save benchmark json results." + "If not specified, results will be saved in " + "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" + " format.", + ) + parser.add_argument( + "--ignore-eos", + action="store_true", + help="Set ignore_eos flag when sending the benchmark request." + "Warning: ignore_eos is not supported in deepspeed_mii and tgi.") + parser.add_argument( + "--percentile-metrics", + type=str, + default="ttft,tpot,itl", + help="Comma-seperated list of selected metrics to report percentils. " + "This argument specifies the metrics to report percentiles. " + "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " + "Default value is \"ttft,tpot,itl\".") + parser.add_argument( + "--metric-percentiles", + type=str, + default="99", + help="Comma-seperated list of percentiles for selected metrics. " + "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " + "Default value is \"99\". " + "Use \"--percentile-metrics\" to select metrics.", + ) + parser.add_argument( + "--goodput", + nargs="+", + required=False, + help="Specify service level objectives for goodput as \"KEY:VALUE\" " + "pairs, where the key is a metric name, and the value is in " + "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, " + "separated by spaces. Allowed request level metric names are " + "\"ttft\", \"tpot\", \"e2el\". 
For more context on the definition of " + "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " + "and the blog: https://hao-ai-lab.github.io/blogs/distserve") + + # group for dataset specific arguments + sonnet_group = parser.add_argument_group("sonnet dataset options") + sonnet_group.add_argument( + "--sonnet-input-len", + type=int, + default=550, + help= + "Number of input tokens per request, used only for sonnet dataset.", + ) + sonnet_group.add_argument( + "--sonnet-output-len", + type=int, + default=150, + help= + "Number of output tokens per request, used only for sonnet dataset.", + ) + sonnet_group.add_argument( + "--sonnet-prefix-len", + type=int, + default=200, + help= + "Number of prefix tokens per request, used only for sonnet dataset.", + ) + + sharegpt_group = parser.add_argument_group("sharegpt dataset options") + sharegpt_group.add_argument( + "--sharegpt-output-len", + type=int, + default=None, + help="Output length for each request. Overrides the output length " + "from the ShareGPT dataset.") + + random_group = parser.add_argument_group("random dataset options") + random_group.add_argument( + "--random-input-len", + type=int, + default=1024, + help= + "Number of input tokens per request, used only for random sampling.", + ) + random_group.add_argument( + "--random-output-len", + type=int, + default=128, + help= + "Number of output tokens per request, used only for random sampling.", + ) + random_group.add_argument( + "--random-range-ratio", + type=float, + default=1.0, + help="Range of sampled ratio of input/output length, " + "used only for random sampling.", + ) + random_group.add_argument( + "--random-prefix-len", + type=int, + default=0, + help="Number of fixed prefix tokens before random " + " context. 
The length range of context in a random " + " request is [random-prefix-len, " + " random-prefix-len + random-prefix-len * random-range-ratio).") + + hf_group = parser.add_argument_group("hf dataset options") + hf_group.add_argument("--hf-subset", + type=str, + default=None, + help="Subset of the HF dataset.") + hf_group.add_argument("--hf-split", + type=str, + default=None, + help="Split of the HF dataset.") + hf_group.add_argument( + "--hf-output-len", + type=int, + default=None, + help="Output length for each request. Overrides the output lengths " + "from the sampled HF dataset.", + ) + + args = parser.parse_args() + main(args) diff --git a/vllm-v0.6.2/benchmarks/benchmark_serving_concurrency.py b/vllm-v0.6.2/benchmarks/benchmark_serving_concurrency.py new file mode 100644 index 0000000..534f144 --- /dev/null +++ b/vllm-v0.6.2/benchmarks/benchmark_serving_concurrency.py @@ -0,0 +1,708 @@ +"""Benchmark online serving throughput. + +On the server side, run one of the following commands: + vLLM OpenAI API server + vllm serve \ + --swap-space 16 \ + --disable-log-requests + + (TGI backend) + ./launch_tgi_server.sh + +On the client side, run: + python benchmarks/benchmark_serving.py \ + --backend \ + --model \ + --dataset-name sharegpt \ + --dataset-path \ + --request-rate \ # By default is inf + --num-prompts # By default is 1000 + + when using tgi backend, add + --endpoint /generate_stream + to the end of the command above. 
+""" +import argparse +import asyncio +import json +import os +import random +import time +import warnings +from dataclasses import dataclass +from datetime import datetime +from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple + +import numpy as np +from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, + RequestFuncOutput) +from tqdm.asyncio import tqdm +from transformers import PreTrainedTokenizerBase + +try: + from vllm.transformers_utils.tokenizer import get_tokenizer +except ImportError: + from backend_request_func import get_tokenizer + +try: + from vllm.utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser + +from concurrent_executor import (ConcurrentExecutor, MluRequestFuncOutput) +from benchmark_serving import (BenchmarkMetrics, + sample_sharegpt_requests, + sample_random_requests, + sample_sonnet_requests) + + +@dataclass +class MluBenchmarkMetrics(BenchmarkMetrics): + # time_in_queue: first_scheduled_time - arrival_time + mean_time_in_queue_ms: float + std_time_in_queue_ms: float + median_time_in_queue_ms: float + percentiles_time_in_queue_ms: List[Tuple[float, float]] + + # time_schedule: sum(all schedule step times) + mean_time_schedule_ms: float + std_time_schedule_ms: float + median_time_schedule_ms: float + percentiles_time_schedule_ms: List[Tuple[float, float]] + + # ttft: first_token_time - arrival_time + mean_time_ttft_ms: float + std_time_ttft_ms: float + median_time_ttft_ms: float + percentiles_time_ttft_ms: List[Tuple[float, float]] + + # e2e: finished_time - arrival_time + mean_time_e2e_ms: float + std_time_e2e_ms: float + median_time_e2e_ms: float + percentiles_time_e2e_ms: List[Tuple[float, float]] + + # tpot: (finished_time - first_token_time) / (output_len - 1) + mean_time_tpot_ms: float + std_time_tpot_ms: float + median_time_tpot_ms: float + percentiles_time_tpot_ms: List[Tuple[float, float]] + + prompt_tokens: int # server received 
total tokens + completion_tokens: int # all generated tokens in server + server_output_throughput: float # server output throughput + server_total_token_throughput: float # server total throughput + + +def calculate_metrics( + input_requests: List[Tuple[str, int, int]], + outputs: List[RequestFuncOutput], + dur_s: float, + tokenizer: PreTrainedTokenizerBase, + selected_percentile_metrics: List[str], + selected_percentiles: List[float], +) -> Tuple[BenchmarkMetrics, List[int]]: + actual_output_lens: List[int] = [] + total_input = 0 + completed = 0 + itls: List[float] = [] + tpots: List[float] = [] + ttfts: List[float] = [] + e2els: List[float] = [] + time_in_queues: List[float] = [] + time_schedules: List[float] = [] + time_ttfts: List[float] = [] + time_e2es: List[float] = [] + time_tpots: List[float] = [] + prompt_tokens: List[int] = [] + completion_tokens: List[int] = [] + for i in range(len(outputs)): + if outputs[i].success: + # We use the tokenizer to count the number of output tokens for all + # serving backends instead of looking at len(outputs[i].itl) since + # multiple output tokens may be bundled together + # Note : this may inflate the output token count slightly + output_len = len( + tokenizer(outputs[i].generated_text, + add_special_tokens=False).input_ids) + actual_output_lens.append(output_len) + total_input += input_requests[i][1] + if output_len > 1: + tpots.append( + (outputs[i].latency - outputs[i].ttft) / (output_len - 1)) + itls += outputs[i].itl + ttfts.append(outputs[i].ttft) + e2els.append(outputs[i].latency) + completed += 1 + # Collect metric from server + time_in_queues.append(outputs[i].metric["time_in_queue"]) + time_schedules.append(outputs[i].metric["scheduler_time"]) + time_ttfts.append(outputs[i].metric["first_token_time"] - outputs[i].metric["arrival_time"]) + time_e2es.append(outputs[i].metric["finished_time"] - outputs[i].metric["arrival_time"]) + if outputs[i].usage["completion_tokens"] > 1: + time_tpots.append( + 
(outputs[i].metric["finished_time"] - outputs[i].metric["first_token_time"]) / + (outputs[i].usage["completion_tokens"] - 1)) + prompt_tokens.append(outputs[i].usage["prompt_tokens"]) + completion_tokens.append(outputs[i].usage["completion_tokens"]) + else: + actual_output_lens.append(0) + + if completed == 0: + warnings.warn( + "All requests failed. This is likely due to a misconfiguration " + "on the benchmark arguments.", + stacklevel=2) + metrics = MluBenchmarkMetrics( + completed=completed, + total_input=total_input, + total_output=sum(actual_output_lens), + request_throughput=completed / dur_s, + output_throughput=sum(actual_output_lens) / dur_s, + total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, + mean_ttft_ms=np.mean(ttfts or 0) * + 1000, # ttfts is empty if streaming is not supported by backend + std_ttft_ms=np.std(ttfts or 0) * 1000, + median_ttft_ms=np.median(ttfts or 0) * 1000, + percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) + for p in selected_percentiles], + mean_tpot_ms=np.mean(tpots or 0) * 1000, + std_tpot_ms=np.std(tpots or 0) * 1000, + median_tpot_ms=np.median(tpots or 0) * 1000, + percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) + for p in selected_percentiles], + mean_itl_ms=np.mean(itls or 0) * 1000, + std_itl_ms=np.std(itls or 0) * 1000, + median_itl_ms=np.median(itls or 0) * 1000, + percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) + for p in selected_percentiles], + mean_e2el_ms=np.median(e2els or 0) * 1000, + std_e2el_ms=np.std(e2els or 0) * 1000, + median_e2el_ms=np.mean(e2els or 0) * 1000, + percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) + for p in selected_percentiles], + mean_time_in_queue_ms=np.mean(time_in_queues or 0) * 1000, + std_time_in_queue_ms=np.std(time_in_queues or 0) * 1000, + median_time_in_queue_ms=np.median(time_in_queues or 0) * 1000, + percentiles_time_in_queue_ms=[(p, np.percentile(time_in_queues or 0, p) * 1000) + for p in 
selected_percentiles], + mean_time_schedule_ms=np.mean(time_schedules or 0) * 1000, + std_time_schedule_ms=np.std(time_schedules or 0) * 1000, + median_time_schedule_ms=np.median(time_schedules or 0) * 1000, + percentiles_time_schedule_ms=[(p, np.percentile(time_schedules or 0, p) * 1000) + for p in selected_percentiles], + mean_time_ttft_ms=np.mean(time_ttfts or 0) * 1000, + std_time_ttft_ms=np.std(time_ttfts or 0) * 1000, + median_time_ttft_ms=np.median(time_ttfts or 0) * 1000, + percentiles_time_ttft_ms=[(p, np.percentile(time_ttfts or 0, p) * 1000) + for p in selected_percentiles], + mean_time_e2e_ms=np.mean(time_e2es or 0) * 1000, + std_time_e2e_ms=np.std(time_e2es or 0) * 1000, + median_time_e2e_ms=np.median(time_e2es or 0) * 1000, + percentiles_time_e2e_ms=[(p, np.percentile(time_e2es or 0, p) * 1000) + for p in selected_percentiles], + mean_time_tpot_ms=np.mean(time_tpots or 0) * 1000, + std_time_tpot_ms=np.std(time_tpots or 0) * 1000, + median_time_tpot_ms=np.median(time_tpots or 0) * 1000, + percentiles_time_tpot_ms=[(p, np.percentile(time_tpots or 0, p) * 1000) + for p in selected_percentiles], + prompt_tokens=sum(prompt_tokens), + completion_tokens=sum(completion_tokens), + server_output_throughput=sum(completion_tokens) / dur_s, + server_total_token_throughput=(sum(prompt_tokens) + sum(completion_tokens)) / dur_s, + ) + + return metrics, actual_output_lens + + +async def benchmark( + backend: str, + api_url: str, + model_id: str, + tokenizer: PreTrainedTokenizerBase, + input_requests: List[Tuple[str, int, int]], + logprobs: Optional[int], + best_of: int, + use_beam_search: bool, + disable_tqdm: bool, + selected_percentile_metrics: List[str], + selected_percentiles: List[str], + concurrency_num: int, + ignore_eos: bool, +): + + assert backend == "vllm", f"Only support vllm backend at concurrent mode." + assert concurrency_num >= 1, f"The concurrency_num must greater than 0, but got {concurrency_num}." 
+ + pbar = None if disable_tqdm else tqdm(total=len(input_requests), desc="Infer") + + # Run serving with concurrent mode, + # use 'concurrency' to control reqeust num + executor = ConcurrentExecutor(concurrency_num=concurrency_num, + input_requests=input_requests) + + # Config pyload + executor.config_pyload(model=model_id, + api_url=api_url, + logprobs=logprobs, + best_of=best_of, + use_beam_search=use_beam_search, + include_usage=True, + ignore_eos=ignore_eos) + + benchmark_start_time = time.perf_counter() + + # Execute with concurrent mode + outputs: List[MluRequestFuncOutput] = executor.run(pbar=pbar) + + if pbar is not None: + pbar.close() + + benchmark_duration = time.perf_counter() - benchmark_start_time + + metrics, actual_output_lens = calculate_metrics( + input_requests=input_requests, + outputs=outputs, + dur_s=benchmark_duration, + tokenizer=tokenizer, + selected_percentile_metrics=selected_percentile_metrics, + selected_percentiles=selected_percentiles, + ) + + print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) + print("{s:{c}^{n}}".format(s=' Client Metrics ', n=50, c='#')) + print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", + benchmark_duration)) + print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) + print("{:<40} {:<10}".format("Total generated tokens:", + metrics.total_output)) + print("{:<40} {:<10.2f}".format("Request throughput (req/s):", + metrics.request_throughput)) + print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", + metrics.output_throughput)) + print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", + metrics.total_token_throughput)) + + result = { + "duration": benchmark_duration, + "completed": metrics.completed, + "total_input_tokens": metrics.total_input, + "total_output_tokens": metrics.total_output, + "request_throughput": metrics.request_throughput, + "output_throughput": 
metrics.output_throughput, + "total_token_throughput": metrics.total_token_throughput, + "input_lens": [output.prompt_len for output in outputs], + "output_lens": actual_output_lens, + "ttfts": [output.ttft for output in outputs], + "itls": [output.itl for output in outputs], + "generated_texts": [output.generated_text for output in outputs], + "errors": [output.error for output in outputs], + } + + def process_one_metric( + # E.g., "ttft" + metric_attribute_name: str, + # E.g., "TTFT" + metric_name: str, + # E.g., "Time to First Token" + metric_header: str, + ): + # This function print and add statistics of the specified + # metric. + if metric_attribute_name not in selected_percentile_metrics: + return + print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) + print("{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"))) + print("{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"))) + result[f"mean_{metric_attribute_name}_ms"] = getattr( + metrics, f"mean_{metric_attribute_name}_ms") + result[f"median_{metric_attribute_name}_ms"] = getattr( + metrics, f"median_{metric_attribute_name}_ms") + result[f"std_{metric_attribute_name}_ms"] = getattr( + metrics, f"std_{metric_attribute_name}_ms") + for p, value in getattr(metrics, + f"percentiles_{metric_attribute_name}_ms"): + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", + value)) + result[f"p{p_word}_{metric_attribute_name}_ms"] = value + + process_one_metric("ttft", "TTFT", "Time to First Token") + process_one_metric("tpot", "TPOT", + "Time per Output Token (excl. 
def main(args: argparse.Namespace):
    """Run the concurrent serving benchmark described by ``args``.

    Seeds the ``random`` and ``numpy`` RNGs, builds the request URL, samples
    the input requests from the selected dataset, runs ``benchmark`` and, when
    ``args.save_result`` is set, writes the run configuration merged with the
    benchmark result to a JSON file.

    Args:
        args: Parsed command-line namespace of this script.

    Raises:
        ValueError: If ``args.dataset_name`` is unknown, or a ``--metadata``
            item is not in ``KEY=VALUE`` form.
    """
    print(args)
    random.seed(args.seed)
    np.random.seed(args.seed)

    backend = args.backend
    model_id = args.model
    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model

    if args.base_url is not None:
        api_url = f"{args.base_url}{args.endpoint}"
    else:
        api_url = f"http://{args.host}:{args.port}{args.endpoint}"

    tokenizer = get_tokenizer(tokenizer_id,
                              trust_remote_code=args.trust_remote_code)

    if args.dataset is not None:
        warnings.warn(
            "The '--dataset' argument will be deprecated in the next "
            "release. Please use '--dataset-name' and "
            "'--dataset-path' in the future runs.",
            stacklevel=2)
        input_requests = sample_sharegpt_requests(
            dataset_path=args.dataset,
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
            fixed_output_len=args.sharegpt_output_len,
        )

    elif args.dataset_name == "sharegpt":
        input_requests = sample_sharegpt_requests(
            dataset_path=args.dataset_path,
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
            fixed_output_len=args.sharegpt_output_len,
        )

    elif args.dataset_name == "sonnet":
        # The chat backend receives the raw prompt (it is passed to the
        # message directly); every other backend receives the prompt already
        # rendered through the tokenizer's chat template.
        use_raw_prompt = args.backend == "openai-chat"
        if not use_raw_prompt:
            assert (
                tokenizer.chat_template or tokenizer.default_chat_template
            ), "Tokenizer/model must have chat template for sonnet dataset."
        sonnet_requests = sample_sonnet_requests(
            dataset_path=args.dataset_path,
            num_requests=args.num_prompts,
            input_len=args.sonnet_input_len,
            output_len=args.sonnet_output_len,
            prefix_len=args.sonnet_prefix_len,
            tokenizer=tokenizer,
        )
        input_requests = [
            (prompt if use_raw_prompt else prompt_formatted, prompt_len,
             output_len)
            for prompt, prompt_formatted, prompt_len, output_len
            in sonnet_requests
        ]

    elif args.dataset_name == "random":
        input_requests = sample_random_requests(
            prefix_len=args.random_prefix_len,
            input_len=args.random_input_len,
            output_len=args.random_output_len,
            num_prompts=args.num_prompts,
            range_ratio=args.random_range_ratio,
            tokenizer=tokenizer,
        )

    else:
        raise ValueError(f"Unknown dataset: {args.dataset_name}")

    benchmark_result = asyncio.run(
        benchmark(
            backend=backend,
            api_url=api_url,
            model_id=model_id,
            tokenizer=tokenizer,
            input_requests=input_requests,
            logprobs=args.logprobs,
            best_of=args.best_of,
            use_beam_search=args.use_beam_search,
            disable_tqdm=args.disable_tqdm,
            selected_percentile_metrics=args.percentile_metrics.split(","),
            selected_percentiles=[
                float(p) for p in args.metric_percentiles.split(",")
            ],
            concurrency_num=args.concurrency_num,
            ignore_eos=args.ignore_eos,
        ))

    # Save config and results to json
    if args.save_result:
        result_json: Dict[str, Any] = {}

        # Setup
        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
        result_json["date"] = current_dt
        result_json["backend"] = backend
        result_json["model_id"] = model_id
        result_json["tokenizer_id"] = tokenizer_id
        result_json["best_of"] = args.best_of
        result_json["use_beam_search"] = args.use_beam_search
        result_json["num_prompts"] = args.num_prompts

        # Metadata
        if args.metadata:
            for item in args.metadata:
                if "=" not in item:
                    raise ValueError(
                        "Invalid metadata format. Please use KEY=VALUE format."
                    )
                # Split on the first '=' only so values may contain '='.
                key, value = item.split("=", 1)
                result_json[key.strip()] = value.strip()

        # Traffic
        # This script's argument parser defines no --request-rate option, so
        # fall back to "unbounded" instead of raising AttributeError.
        request_rate = getattr(args, "request_rate", float("inf"))
        result_json["request_rate"] = (
            request_rate if request_rate < float("inf") else "inf")

        # Merge with benchmark result
        result_json = {**result_json, **benchmark_result}

        # Save to file
        base_model_id = model_id.split("/")[-1]
        file_name = f"{backend}-{request_rate}qps-{base_model_id}-{current_dt}.json"  #noqa
        if args.result_filename:
            file_name = args.result_filename
        if args.result_dir:
            file_name = os.path.join(args.result_dir, file_name)
        with open(file_name, "w") as outfile:
            json.dump(result_json, outfile)
parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument( + "--num-prompts", + type=int, + default=1000, + help="Number of prompts to process.", + ) + parser.add_argument( + "--sharegpt-output-len", + type=int, + default=None, + help="Output length for each request. Overrides the output length " + "from the ShareGPT dataset.") + parser.add_argument( + "--sonnet-input-len", + type=int, + default=550, + help= + "Number of input tokens per request, used only for sonnet dataset.", + ) + parser.add_argument( + "--sonnet-output-len", + type=int, + default=150, + help= + "Number of output tokens per request, used only for sonnet dataset.", + ) + parser.add_argument( + "--logprobs", + type=int, + default=None, + help=("Number of logprobs-per-token to compute & return as part of " + "the request. If unspecified, then either (1) if beam search " + "is disabled, no logprobs are computed & a single dummy " + "logprob is returned for each token; or (2) if beam search " + "is enabled 1 logprob per token is computed"), + ) + parser.add_argument( + "--sonnet-prefix-len", + type=int, + default=200, + help= + "Number of prefix tokens per request, used only for sonnet dataset.", + ) + parser.add_argument( + "--random-input-len", + type=int, + default=1024, + help= + "Number of input tokens per request, used only for random sampling.", + ) + parser.add_argument( + "--random-output-len", + type=int, + default=128, + help= + "Number of output tokens per request, used only for random sampling.", + ) + parser.add_argument( + "--random-range-ratio", + type=float, + default=1.0, + help="Range of sampled ratio of input/output length, " + "used only for random sampling.", + ) + parser.add_argument( + "--random-prefix-len", + type=int, + default=0, + help="Number of fixed prefix tokens before random " + " context. 
The length range of context in a random " + " request is [random-prefix-len, " + " random-prefix-len + random-prefix-len * random-range-ratio).") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code from huggingface", + ) + parser.add_argument( + "--disable-tqdm", + action="store_true", + help="Specify to disable tqdm progress bar.", + ) + parser.add_argument( + "--save-result", + action="store_true", + help="Specify to save benchmark results to a json file", + ) + parser.add_argument( + "--metadata", + metavar="KEY=VALUE", + nargs="*", + help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) " + "for metadata of this run to be saved in the result JSON file " + "for record keeping purposes.", + ) + parser.add_argument( + "--result-dir", + type=str, + default=None, + help="Specify directory to save benchmark json results." + "If not specified, results are saved in the current directory.", + ) + parser.add_argument( + "--result-filename", + type=str, + default=None, + help="Specify the filename to save benchmark json results." + "If not specified, results will be saved in " + "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" + " format.", + ) + parser.add_argument( + "--percentile-metrics", + type=str, + default="ttft,tpot,itl,e2el,time_in_queue,time_schedule,time_ttft,time_e2e,time_tpot", + help="Comma-seperated list of selected metrics to report percentils. " + "This argument specifies the metrics to report percentiles. " + "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " + "Default value is \"ttft,tpot,itl,e2el\".") + parser.add_argument( + "--metric-percentiles", + type=str, + default="99", + help="Comma-seperated list of percentiles for selected metrics. " + "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " + "Default value is \"99\". 
" + "Use \"--percentile-metrics\" to select metrics.", + ) + parser.add_argument( + "--concurrency-num", + type=int, + default=1, + help="Number of concurrency in client. If this is 1, " + "then 'request_rate' with be enable. " + "Otherwise, we run serving test with concurrent mode.", + ) + parser.add_argument("--ignore-eos", + action='store_true', + help='If true, vllm server with decode until reach max_output_len.') + args = parser.parse_args() + main(args) diff --git a/vllm-v0.6.2/benchmarks/benchmark_throughput.py b/vllm-v0.6.2/benchmarks/benchmark_throughput.py new file mode 100644 index 0000000..2a42178 --- /dev/null +++ b/vllm-v0.6.2/benchmarks/benchmark_throughput.py @@ -0,0 +1,474 @@ +"""Benchmark offline inference throughput.""" +import argparse +import dataclasses +import json +import math +import random +import time +from typing import List, Optional, Tuple +import os +os.environ['CN_NOTIFIER_POOL_MAX'] = "1000" + +import torch +import uvloop +from PIL import Image +from tqdm import tqdm +from transformers import (AutoModelForCausalLM, AutoTokenizer, + PreTrainedTokenizerBase) + +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args) +from vllm.inputs import TextPrompt +from vllm.multimodal import MultiModalDataDict +from vllm.sampling_params import BeamSearchParams +from vllm.utils import FlexibleArgumentParser, merge_async_iterators +from common import init_logger + +logger = init_logger(__name__) + + +@dataclasses.dataclass +class SampleRequest: + """A class representing a single inference request for benchmarking. + + Attributes: + prompt: The input text prompt for the model. + multi_modal_data: Optional dictionary containing multi-modal data (e.g. + images). + prompt_len: The length of the prompt in tokens. + expected_output_len: The expected length of the output in tokens. 
def _get_prompt_for_image_model(question: str, *, model: str) -> str:
    """Wrap ``question`` in the special tokens expected by an image model.

    Args:
        question: The question text to embed in the prompt.
        model: Model name; matched case-insensitively to pick the token
            layout.

    Returns:
        The prompt string for the given model family.

    Raises:
        ValueError: If the model name matches no supported family.
    """
    normalized_name = model.lower()
    # Guard clause: only the pixtral family is supported.
    if "pixtral" not in normalized_name:
        raise ValueError(f"Unsupported model {normalized_name}")
    return f"[INST]{question}\n[IMG][/INST]"
def run_vllm(
    requests: List[SampleRequest],
    n: int,
    engine_args: EngineArgs,
) -> float:
    """Run the requests through the offline ``LLM`` engine and time them.

    When ``engine_args.max_num_batched_tokens`` is set and every request has
    the same prompt length, the prefill ("context") MLUGraph options are
    enabled and sized from that length.

    Args:
        requests: Requests to run; each carries its prompt, prompt length and
            expected output length.
        n: Number of sequences to generate per prompt.
        engine_args: Engine arguments forwarded to ``LLM``.

    Returns:
        Elapsed wall-clock seconds spent in generation only (engine
        construction is excluded).
    """
    enable_context_mlugraph = False
    context_batch_size_to_capture = None
    context_seq_len_to_capture = None
    if engine_args.max_num_batched_tokens is not None and requests:
        # SampleRequest is a dataclass, not a tuple: read fields by name.
        input_len = requests[0].prompt_len
        is_all_reqs_same_length = all(req.prompt_len == input_len
                                      for req in requests)
        # input_len > 0 also protects the division below.
        if is_all_reqs_same_length and input_len > 0:
            logger.info("Prefill MLUGraph enable !")
            enable_context_mlugraph = True
            context_batch_size_to_capture = min(
                math.floor(engine_args.max_num_batched_tokens / input_len),
                len(requests))
            context_seq_len_to_capture = input_len

    from vllm import LLM, SamplingParams
    llm = LLM(**dataclasses.asdict(engine_args),
              enable_context_mlugraph=enable_context_mlugraph,
              context_batch_size_to_capture=context_batch_size_to_capture,
              context_seq_len_to_capture=context_seq_len_to_capture)

    # Warn if the largest input + output length exceeds the maximum model
    # length, as only the first `max_model_len` tokens will be processed.
    max_length = max(
        (req.prompt_len + req.expected_output_len for req in requests),
        default=0)
    max_model_len = llm.llm_engine.model_config.max_model_len
    if max_length > max_model_len:
        logger.warning(
            f"The sum of input and output length({max_length}) is larger than"
            f" max model length({max_model_len})")

    # Add the requests to the engine.
    prompts: List[TextPrompt] = []
    sampling_params: List[SamplingParams] = []
    for request in requests:
        prompts.append(
            TextPrompt(prompt=request.prompt,
                       multi_modal_data=request.multi_modal_data))
        sampling_params.append(
            SamplingParams(
                n=n,
                temperature=1.0,
                top_p=1.0,
                ignore_eos=True,
                max_tokens=request.expected_output_len,
            ))

    # Beam search is hard-disabled here; the branch is kept for manual use.
    use_beam_search = False

    if not use_beam_search:
        start = time.perf_counter()
        llm.generate(prompts, sampling_params, use_tqdm=True)
        end = time.perf_counter()
    else:
        prompts = [request.prompt for request in requests]
        # output_len should be the same for all requests.
        output_len = requests[0].expected_output_len
        for request in requests:
            assert request.expected_output_len == output_len
        start = time.perf_counter()
        llm.beam_search(
            prompts,
            BeamSearchParams(
                beam_width=n,
                max_tokens=output_len,
                ignore_eos=True,
            ))
        end = time.perf_counter()
    return end - start
def run_hf(
    requests: List[SampleRequest],
    model: str,
    tokenizer: PreTrainedTokenizerBase,
    n: int,
    max_batch_size: int,
    trust_remote_code: bool,
) -> float:
    """Run the requests through a HuggingFace causal LM and time them.

    Requests are greedily grouped into batches of at most ``max_batch_size``
    prompts, as long as the longest prompt plus the longest output in the
    batch stays within 2048 tokens.

    Args:
        requests: Requests to run, in order.
        model: Model name or path passed to ``from_pretrained``.
        tokenizer: Tokenizer used to encode batches and decode outputs.
        n: Number of sequences returned per prompt.
        max_batch_size: Upper bound on prompts per ``generate`` call.
        trust_remote_code: Forwarded to ``from_pretrained``.

    Returns:
        Elapsed wall-clock seconds for generation plus decoding.
    """
    llm = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
    if llm.config.model_type == "llama":
        # To enable padding in the HF backend.
        tokenizer.pad_token = tokenizer.eos_token
    llm = llm.cuda()

    pbar = tqdm(total=len(requests))
    start = time.perf_counter()
    batch: List[str] = []
    max_prompt_len = 0
    max_output_len = 0
    last_index = len(requests) - 1
    for i, request in enumerate(requests):
        # SampleRequest is a dataclass and cannot be tuple-unpacked; read
        # its fields by name.
        batch.append(request.prompt)
        max_prompt_len = max(max_prompt_len, request.prompt_len)
        max_output_len = max(max_output_len, request.expected_output_len)
        if len(batch) < max_batch_size and i != last_index:
            # Check if we can add more requests to the batch.
            upcoming = requests[i + 1]
            if (max(max_prompt_len, upcoming.prompt_len) +
                    max(max_output_len,
                        upcoming.expected_output_len)) <= 2048:
                # We can add more requests to the batch.
                continue

        # Generate the sequences.
        input_ids = tokenizer(batch, return_tensors="pt",
                              padding=True).input_ids
        llm_outputs = llm.generate(
            input_ids=input_ids.cuda(),
            do_sample=True,
            num_return_sequences=n,
            temperature=1.0,
            top_p=1.0,
            use_cache=True,
            max_new_tokens=max_output_len,
        )
        # Include the decoding time.
        tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
        pbar.update(len(batch))

        # Clear the batch.
        batch = []
        max_prompt_len = 0
        max_output_len = 0
    end = time.perf_counter()
    # Release the progress bar outside the timed region.
    pbar.close()
    return end - start
+ for i in range(-10, 10): + prompt = "hi " * (args.input_len + i) + tokenized_prompt = tokenizer(prompt).input_ids + if len(tokenized_prompt) == args.input_len: + break + else: + raise ValueError( + f"Failed to synthesize a prompt with {args.input_len} tokens.") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=args.input_len, + expected_output_len=args.output_len) + for _ in range(args.num_prompts) + ] + else: + requests = sample_requests(tokenizer, args) + + is_multi_modal = any(request.multi_modal_data is not None + for request in requests) + if args.backend == "vllm": + if args.async_engine: + elapsed_time = uvloop.run( + run_vllm_async( + requests, + args.n, + AsyncEngineArgs.from_cli_args(args), + args.disable_frontend_multiprocessing, + )) + else: + elapsed_time = run_vllm(requests, args.n, + EngineArgs.from_cli_args(args)) + elif args.backend == "hf": + assert args.tensor_parallel_size == 1 + elapsed_time = run_hf(requests, args.model, tokenizer, args.n, + args.hf_max_batch_size, args.trust_remote_code) + elif args.backend == "mii": + elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size, + args.output_len) + else: + raise ValueError(f"Unknown backend: {args.backend}") + total_num_tokens = sum(request.prompt_len + request.expected_output_len + for request in requests) + total_output_tokens = sum(request.expected_output_len + for request in requests) + if is_multi_modal: + print("\033[91mWARNING\033[0m: Multi-modal request detected. The " + "following metrics are not accurate because image tokens are not" + " counted. See vllm-project/vllm/issues/9778 for details.") + # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length. 
+ print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " + f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " + f"{total_output_tokens / elapsed_time:.2f} output tokens/s") + + # Output JSON results if specified + if args.output_json: + results = { + "elapsed_time": elapsed_time, + "num_requests": len(requests), + "total_num_tokens": total_num_tokens, + "requests_per_second": len(requests) / elapsed_time, + "tokens_per_second": total_num_tokens / elapsed_time, + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description="Benchmark the throughput.") + parser.add_argument("--backend", + type=str, + choices=["vllm", "hf", "mii"], + default="vllm") + parser.add_argument("--dataset", + type=str, + default=None, + help="Path to the dataset. The dataset is expected to " + "be a json in form of List[Dict[..., conversations: " + "List[Dict[..., value: ]]]]") + parser.add_argument("--input-len", + type=int, + default=None, + help="Input prompt length for each request") + parser.add_argument("--output-len", + type=int, + default=None, + help="Output length for each request. 
Overrides the " + "output length from the dataset.") + parser.add_argument("--n", + type=int, + default=1, + help="Number of generated sequences per prompt.") + parser.add_argument("--num-prompts", + type=int, + default=1000, + help="Number of prompts to process.") + parser.add_argument("--hf-max-batch-size", + type=int, + default=None, + help="Maximum batch size for HF backend.") + parser.add_argument( + '--output-json', + type=str, + default=None, + help='Path to save the throughput results in JSON format.') + parser.add_argument("--async-engine", + action='store_true', + default=False, + help="Use vLLM async engine rather than LLM class.") + parser.add_argument("--disable-frontend-multiprocessing", + action='store_true', + default=False, + help="Disable decoupled async engine frontend.") + parser = AsyncEngineArgs.add_cli_args(parser) + args = parser.parse_args() + if args.tokenizer is None: + args.tokenizer = args.model + if args.dataset is None: + assert args.input_len is not None + assert args.output_len is not None + else: + assert args.input_len is None + + if args.backend == "vllm": + if args.hf_max_batch_size is not None: + raise ValueError("HF max batch size is only for HF backend.") + elif args.backend == "hf": + if args.hf_max_batch_size is None: + raise ValueError("HF max batch size is required for HF backend.") + if args.quantization is not None: + raise ValueError("Quantization is only for vLLM backend.") + elif args.backend == "mii": + if args.dtype != "auto": + raise ValueError("dtype must be auto for MII backend.") + if args.n != 1: + raise ValueError("n must be 1 for MII backend.") + if args.quantization is not None: + raise ValueError("Quantization is only for vLLM backend.") + if args.hf_max_batch_size is not None: + raise ValueError("HF max batch size is only for HF backend.") + if args.tokenizer != args.model: + raise ValueError("Tokenizer must be the same as the model for MII " + "backend.") + main(args) diff --git 
def init_logger(name: str) -> Logger:
    """Return the logger ``name``, mirroring the ``vllm`` logger's settings.

    If a real ``vllm`` logger has been registered, its level, ``propagate``
    flag and handler list are applied to the returned logger so benchmark
    output is configured like the vllm module's. Otherwise the logger is
    returned with its current configuration.

    Args:
        name: Name of the logger to fetch or create.

    Returns:
        The logger registered under ``name``.
    """
    logger = logging.getLogger(name)

    vllm_logger = logging.Logger.manager.loggerDict.get('vllm', None)
    # loggerDict may hold a logging.PlaceHolder for 'vllm' when only child
    # loggers such as 'vllm.foo' exist; a PlaceHolder is truthy but has no
    # level/propagate/handlers, so require a real Logger.
    if isinstance(vllm_logger, logging.Logger):
        logger.setLevel(vllm_logger.level)
        logger.propagate = vllm_logger.propagate
        # The handler list is shared (not copied) with the vllm logger.
        logger.handlers = vllm_logger.handlers

    return logger
+ + assert not request_func_input.use_beam_search + payload = { + "model": request_func_input.model, + "prompt": request_func_input.prompt, + "temperature": 0.0, + "best_of": request_func_input.best_of, + "max_tokens": request_func_input.output_len, + "ignore_eos": request_func_input.ignore_eos, + "logprobs": request_func_input.logprobs, + "stream": True, + "stream_options": {"include_usage": request_func_input.include_usage} + } + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + } + + output = MluRequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + with requests.post(url=api_url, json=payload, headers=headers, stream=True) as response: + response.raise_for_status() + for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\n"): + if chunk: + chunk = remove_prefix(chunk.decode("utf-8"), "data: ") + if chunk == "[DONE]": + latency = time.perf_counter() - st + else: + data = json.loads(chunk) + + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated + if "choices" in data and len(data["choices"]) > 0 and data["choices"][0]["text"]: + timestamp = time.perf_counter() + # First token + if ttft == 0.0: + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + most_recent_timestamp = timestamp + generated_text += data["choices"][0]["text"] + + if "usage" in data and data["usage"] is not None: + output.usage = data["usage"] + + if "metric" in data and data["metric"] is not None: + output.metric = data["metric"] + + output.generated_text = generated_text + output.success = True + output.latency = latency + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: 
class ConcurrentExecutor:
    """Issue completion requests from a thread pool with bounded concurrency.

    At most ``concurrency_num`` requests are in flight at any time; a new
    request is submitted as soon as a slot frees up.
    """

    def __init__(self, concurrency_num, input_requests) -> None:
        """Prepare the pool.

        Args:
            concurrency_num: maximum number of in-flight requests; also the
                number of worker threads.
            input_requests: sized iterable of
                ``(prompt, prompt_len, output_len)`` triples.
        """
        self.concurrency_num = concurrency_num
        self.concurrency_tasks = []
        self.input_requests_iter = iter(input_requests)
        self.total_requests = len(input_requests)
        self.send_requests = 0
        self.recv_requests = 0
        # Fields shared by every request (set through config_pyload).
        self.request_input_kwargs = {}

        self.executor = concurrent.futures.ThreadPoolExecutor(
            max_workers=self.concurrency_num)

    def config_pyload(self, **kwargs):
        """Merge ``kwargs`` into the fields shared by every request."""
        self.request_input_kwargs.update(kwargs)

    def run(self, pbar):
        """Send every request and return the outputs in completion order.

        Args:
            pbar: progress object forwarded unchanged to
                ``sync_request_openai_completions``.

        Returns:
            A list with one result per input request.
        """
        request_results = []

        try:
            while self.recv_requests < self.total_requests:
                has_free_slot = (len(self.concurrency_tasks)
                                 < self.concurrency_num)
                has_unsent = self.send_requests < self.total_requests
                if has_free_slot and has_unsent:
                    prompt, prompt_len, output_len = next(
                        self.input_requests_iter)
                    # Build a per-request dict so the shared configuration is
                    # not polluted with the fields of the last request.
                    request_kwargs = dict(self.request_input_kwargs,
                                          prompt=prompt,
                                          prompt_len=prompt_len,
                                          output_len=output_len)
                    request_func_input = MluRequestFuncInput(**request_kwargs)
                    self.concurrency_tasks.append(
                        self.executor.submit(sync_request_openai_completions,
                                             request_func_input, pbar))
                    self.send_requests += 1
                else:
                    # Pool is saturated (or everything was sent): block until
                    # at least one in-flight request completes.
                    done, pending = concurrent.futures.wait(
                        self.concurrency_tasks,
                        return_when=concurrent.futures.FIRST_COMPLETED)
                    self.recv_requests += len(done)
                    request_results.extend(task.result() for task in done)
                    self.concurrency_tasks = list(pending)
        finally:
            # Release the worker threads; without this they stay alive until
            # interpreter exit. wait=False avoids blocking on the error path.
            self.executor.shutdown(wait=False)

        return request_results
torch.utils.benchmark import Measurement as TMeasurement +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) +DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] +DEFAULT_TP_SIZES = [1] + +# helpers + + +def to_fp8(tensor: torch.Tensor) -> torch.Tensor: + finfo = torch.finfo(torch.float8_e4m3fn) + return torch.round(tensor.clamp( + min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) + + +def to_int8(tensor: torch.Tensor) -> torch.Tensor: + return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) + + +def make_rand_tensors(dtype: torch.dtype, m: int, n: int, + k: int) -> Tuple[torch.Tensor, torch.Tensor]: + a = torch.randn((m, k), device='cuda') * 5 + b = torch.randn((n, k), device='cuda').t() * 5 + + if dtype == torch.int8: + return to_int8(a), to_int8(b) + if dtype == torch.float8_e4m3fn: + return to_fp8(a), to_fp8(b) + + raise ValueError("unsupported dtype") + + +# bench +def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args, + **kwargs) -> TMeasurement: + min_run_time = 1 + + globals = { + "args": args, + "kwargs": kwargs, + "fn": fn, + } + return TBenchmark.Timer( + stmt="fn(*args, **kwargs)", + globals=globals, + label=label, + sub_label=sub_label, + description=description, + ).blocked_autorange(min_run_time=min_run_time) + + +def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + assert dtype == torch.int8 + a, b = make_rand_tensors(torch.int8, m, n, k) + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) + azp = torch.zeros((m, ), device="cuda", dtype=torch.int32) + azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32) + + timers = [] + # pytorch impl - bfloat16 + 
timers.append( + bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", + torch.mm, a.to(dtype=torch.bfloat16), + b.to(dtype=torch.bfloat16))) + + # pytorch impl - float16 + timers.append( + bench_fn(label, sub_label, + "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm, + a.to(dtype=torch.float16), b.to(dtype=torch.float16))) + + # cutlass impl + timers.append( + bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm", + ops.cutlass_scaled_mm, a, b, scale_a, scale_b, + torch.bfloat16)) + + # cutlass with bias + timers.append( + bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias", + ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16, + bias)) + + # cutlass with azp per-tensor + timers.append( + bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp", + ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b, + torch.bfloat16, azp_adj)) + + # cutlass with azp per-tensor + bias + timers.append( + bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_bias", + ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b, + torch.bfloat16, azp_adj, None, bias)) + + # cutlass with azp per-token + timers.append( + bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt", + ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b, + torch.bfloat16, azp_adj, azp)) + + # cutlass with azp per-token + bias + timers.append( + bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias", + ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b, + torch.bfloat16, azp_adj, azp, bias)) + + return timers + + +def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + assert dtype == torch.float8_e4m3fn + a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) + + timers = [] + + # pytorch impl 
w. bf16 + timers.append( + bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", + torch.mm, a.to(dtype=torch.bfloat16, device="cuda"), + b.to(dtype=torch.bfloat16, device="cuda"))) + + # pytorch impl: bf16 output, without fp8 fast accum + timers.append( + bench_fn(label, + sub_label, + "pytorch_fp8_fp8_bf16_scaled_mm", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.bfloat16)) + + # pytorch impl: bf16 output, with fp8 fast accum + timers.append( + bench_fn(label, + sub_label, + "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.bfloat16, + use_fast_accum=True)) + + # pytorch impl: fp16 output, without fp8 fast accum + timers.append( + bench_fn(label, + sub_label, + "pytorch_fp8_fp8_fp16_scaled_mm", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.float16)) + + # pytorch impl: fp16 output, with fp8 fast accum + timers.append( + bench_fn(label, + sub_label, + "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.float16, + use_fast_accum=True)) + + # cutlass impl: bf16 output + timers.append( + bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm", + ops.cutlass_scaled_mm, a, b, scale_a, scale_b, + torch.bfloat16)) + # cutlass impl: fp16 output + timers.append( + bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm", + ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16)) + + # cutlass impl: bf16 output, with bias + timers.append( + bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm_bias", + ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16, + bias)) + + # cutlass impl: fp16 output, with bias + timers.append( + bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm_bias", + ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16, + bias.to(dtype=torch.float16))) + + return 
def run_range_bench(args):
    """Sweep GEMM sizes over ``[dim_start, dim_end]`` and pickle the results.

    Each of M, K and N either follows the swept sizes or is pinned to the
    corresponding ``args.m_constant`` / ``args.k_constant`` /
    ``args.n_constant`` when that value is not None.
    """
    # dim_end is inclusive, matching run_square_bench; previously the
    # requested end size was silently left out of the sweep.
    dim_sizes = list(
        range(args.dim_start, args.dim_end + 1, args.dim_increment))
    n = len(dim_sizes)
    Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
    MKNs = list(zip(Ms, Ks, Ns))
    data = run(args.dtype, MKNs)

    make_output(data, MKNs, f"range_bench-{args.dtype}")
enumerate(args.models): + print(f"[{i}] {model}") + + def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: + KNs = [] + for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + KNs.append(KN) + return KNs + + model_bench_data = [] + models_tps = list(itertools.product(args.models, args.tp_sizes)) + for model, tp_size in models_tps: + Ms = args.batch_sizes + KNs = model_shapes(model, tp_size) + MKNs = [] + for m in Ms: + for k, n in KNs: + MKNs.append((m, k, n)) + + data = run(args.dtype, MKNs) + model_bench_data.append(data) + + # Print all results + for data, model_tp in zip(model_bench_data, models_tps): + model, tp_size = model_tp + print(f"== Results {args.dtype} {model}-TP{tp_size} ====") + print_timers(data) + + timestamp = int(time.time()) + + all_data = [] + for d in model_bench_data: + all_data.extend(d) + # pickle all data + with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f: + pkl.dump(all_data, f) + + +if __name__ == '__main__': + + def to_torch_dtype(dt): + if dt == "int8": + return torch.int8 + if dt == "fp8": + return torch.float8_e4m3fn + raise ValueError("unsupported dtype") + + parser = FlexibleArgumentParser( + description=""" +Benchmark Cutlass GEMM. + + To run square GEMMs: + python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64 + + To run constant N and K and sweep M: + python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384 + + To run dimensions from a model: + python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 + + Output: + - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. 
+ """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument("--dtype", + type=to_torch_dtype, + required=True, + help="Available options are ['int8', 'fp8']") + subparsers = parser.add_subparsers(dest="cmd") + + square_parser = subparsers.add_parser("square_bench") + square_parser.add_argument("--dim-start", type=int, required=True) + square_parser.add_argument("--dim-end", type=int, required=True) + square_parser.add_argument("--dim-increment", type=int, required=True) + square_parser.set_defaults(func=run_square_bench) + + range_parser = subparsers.add_parser("range_bench") + range_parser.add_argument("--dim-start", type=int, required=True) + range_parser.add_argument("--dim-end", type=int, required=True) + range_parser.add_argument("--dim-increment", type=int, required=True) + range_parser.add_argument("--m-constant", type=int, default=None) + range_parser.add_argument("--n-constant", type=int, default=None) + range_parser.add_argument("--k-constant", type=int, default=None) + range_parser.set_defaults(func=run_range_bench) + + model_parser = subparsers.add_parser("model_bench") + model_parser.add_argument("--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys()) + model_parser.add_argument("--tp-sizes", + nargs="+", + type=int, + default=DEFAULT_TP_SIZES) + model_parser.add_argument("--batch-sizes", + nargs="+", + type=int, + default=DEFAULT_BATCH_SIZES) + model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + args.func(args) diff --git a/vllm-v0.6.2/benchmarks/cutlass_benchmarks/weight_shapes.py b/vllm-v0.6.2/benchmarks/cutlass_benchmarks/weight_shapes.py new file mode 100644 index 0000000..25ec9d6 --- /dev/null +++ b/vllm-v0.6.2/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -0,0 +1,43 @@ +# Weight Shapes are in the format +# ([K, N], TP_SPLIT_DIM) +# Example: +# A shape of ([14336, 4096], 0) indicates the following GEMM shape, +# - TP1 : K = 14336, N = 4096 +# 
def torch_mult(
        input: torch.Tensor,  # [..., in_features]
        weights: torch.Tensor,
        scales: torch.Tensor,  # [num_out_groups, 1, 1, 1]
) -> torch.Tensor:
    """Dense baseline: apply ``weights`` to ``input`` with ``F.linear``.

    ``scales`` is accepted but not used by this function.
    """
    return F.linear(input, weights)
def dequant_weight_scale(
    input: torch.Tensor,  # [..., in_features]
    codes: torch.IntTensor,  # [num_out_groups, num_in_groups, num_codebooks]
    codebooks: torch.
    Tensor,  # [num_codebooks, codebook_size, out_group_size, in_group_size]
    scales: torch.Tensor,  # [num_out_groups, 1, 1, 1]
    output_partition_sizes: torch.IntTensor,
    bias: Optional[torch.Tensor],
) -> torch.Tensor:
    """Dequantize the weights, scale them in place, then apply ``F.linear``.

    The scales are folded into the dequantized weight matrix before the
    matmul, so ``bias`` (if given) is added after scaling.
    """
    dequantized = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)

    # Collapse the trailing three dims of ``scales`` into one, then expand
    # across every weight column.
    collapsed_shape = scales.shape[:-3] + (-1, )
    per_row_scales = scales.view(collapsed_shape)
    dequantized *= per_row_scales.expand(-1, dequantized.shape[1])

    return F.linear(input, dequantized, bias)
+ Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] + scales: torch.Tensor, # [num_out_groups, 1, 1, 1] + output_partition_sizes: torch.IntTensor, + bias: Optional[torch.Tensor], +) -> torch.Tensor: + + weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) + + return F.linear(input, weights, bias) + + +# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against +# the generic pytorch version. +# Just visual comparison. +def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: + + n = int(parts.sum().item()) + + device = torch.device('cuda:0') + + code_range = (1 << bits) // 2 + ingroups = 8 + + codes = torch.randint(-code_range, + code_range, + size=(n, k // ingroups, nbooks), + dtype=get_int_dtype(bits), + device=device) + + codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), + dtype=torch.float16, + device=device) + + count = 0 + for index in range(16): + for i in range(8): + for book in range(nbooks): + codebooks[book, index, 0, i] = count * (10**book) + count += 1 + + print("codes shape", codes.shape) + + for i in range(16): + for book in range(nbooks): + codes[0, i, book] = i + codes[0, -i, book] = i + + weights = dequantize_weight(codes, codebooks, None) + weights2 = ops.aqlm_dequant(codes, codebooks, parts) + + print("weights shape:", weights.shape) + print("weights2 shape:", weights2.shape) + + print("weights are:", weights) + print("weights2 are:", weights2) + + print("first 128 weights are", weights[0, 0:128].to(torch.int32)) + print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32)) + + print("last 128 weights are", weights[0, -128:]) + print("last 128 weights2 are:", weights2[0, -128:]) + + +def main(): + + parser = FlexibleArgumentParser(description="Benchmark aqlm performance.") + + # Add arguments + parser.add_argument("--nbooks", + type=int, + default=1, + help="Number of codebooks (default: 1)") + parser.add_argument("--bits", + type=int, 
def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
             methods):
    """Time every method for one problem size and print one table row.

    The row is ``m | k | n | parts`` followed by the best time in
    microseconds for each entry of ``methods``, where ``n`` is the sum of
    ``parts``.
    """
    # I didn't see visible improvements from increasing these, but feel free :)
    num_warmup_trials = 1
    num_trials = 1

    num_calls = 100

    # Arguments shared by every run_timing call for this grid point.
    timing_kwargs = dict(num_calls=num_calls,
                         m=m,
                         k=k,
                         parts=parts,
                         nbooks=nbooks,
                         bits=bits)

    # warmup.
    for method in methods:
        for _ in range(num_warmup_trials):
            run_timing(method=method, **timing_kwargs)

    n = parts.sum().item()
    print(f'{m} | {k} | {n} | {parts.tolist()}', end='')

    for method in methods:
        # run_timing returns milliseconds; keep the fastest trial.
        best_time_us = min(1000 * run_timing(method=method, **timing_kwargs)
                           for _ in range(num_trials))

        # Report the best trial (the original printed whichever ran last).
        print(f' | {best_time_us:.0f}', end='')

    print('')
@torch.inference_mode()
def main(num_tokens: int,
         hidden_size: int,
         add_residual: bool,
         dtype: torch.dtype,
         seed: int = 0,
         do_profile: bool = False,
         num_warmup_iters: int = 5,
         num_iters: int = 100) -> None:
    """Benchmark the RMSNorm layer on CUDA and print the per-call latency.

    Args:
        num_tokens: number of rows of the random input.
        hidden_size: number of columns of the input / size of the norm.
        add_residual: when True, a random residual tensor is passed to the
            layer as well.
        dtype: dtype of the layer and of the inputs.
        seed: seed handed to ``current_platform.seed_everything``.
        do_profile: when True, run a single iteration bracketed by the CUDA
            profiler start/stop calls instead of ``num_iters`` iterations.
        num_warmup_iters: untimed iterations executed first.
        num_iters: timed iterations (ignored when ``do_profile`` is set).
    """
    current_platform.seed_everything(seed)
    torch.set_default_device("cuda")

    layer = RMSNorm(hidden_size).to(dtype=dtype)
    layer.weight.data.normal_(mean=1.0, std=0.1)
    scale = 1 / (2 * hidden_size)
    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
    x *= scale
    residual = torch.randn_like(x) * scale if add_residual else None

    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
        """Return the mean wall-clock seconds per layer call."""
        torch.cuda.synchronize()
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        start_time = time.perf_counter()

        for _ in range(num_iters):
            layer(x, residual)
        # Kernels launch asynchronously; wait for them before reading the
        # clock.
        torch.cuda.synchronize()

        end_time = time.perf_counter()
        if profile:
            # Close the profiling range (the original called
            # cudaProfilerStart() a second time, so it was never stopped).
            torch.cuda.cudart().cudaProfilerStop()
        return (end_time - start_time) / num_iters

    # Warmup.
    print("Warming up...")
    run_benchmark = run_cuda_benchmark
    run_benchmark(num_iters=num_warmup_iters, profile=False)

    # Benchmark.
    if do_profile:
        latency = run_benchmark(num_iters=1, profile=True)
    else:
        latency = run_benchmark(num_iters=num_iters, profile=False)
    print(f"Kernel running time: {latency * 1000000:.3f} us")
" + "If --profile is set, this number is ignored") + + args = parser.parse_args() + print(args) + + main(num_tokens=args.num_tokens, + hidden_size=args.hidden_size, + add_residual=args.add_residual, + dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], + seed=args.seed, + do_profile=args.profile, + num_warmup_iters=args.num_warmup_iters, + num_iters=args.num_iters) diff --git a/vllm-v0.6.2/benchmarks/kernels/benchmark_machete.py b/vllm-v0.6.2/benchmarks/kernels/benchmark_machete.py new file mode 100644 index 0000000..665b50b --- /dev/null +++ b/vllm-v0.6.2/benchmarks/kernels/benchmark_machete.py @@ -0,0 +1,420 @@ +import argparse +import copy +import itertools +import math +import pickle as pkl +import time +from itertools import product +from typing import Callable, Iterable, List, Optional, Tuple + +import pandas as pd +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.marlin_utils import ( + GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales) +from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( + MarlinWorkspace) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + gptq_pack, pack_rows, quantize_weights) +from vllm.scalar_type import ScalarType, scalar_types +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"] +DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024] +DEFAULT_TP_SIZES = [1] + + +def machete_pack_weights(w_q: torch.tensor, wtype: ScalarType) -> torch.tensor: + w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape) + w_q = w_q.t().contiguous().t() # make col major + return ops.machete_prepack_B(w_q, wtype) + + +def make_bench_tensors( + atype: torch.dtype, wtype: ScalarType, group_size: int, m: int, n: int, + k: int +) -> 
def loop_over_weights(
        a: torch.tensor, weights: List[Tuple[torch.tensor, torch.tensor,
                                             torch.tensor, torch.tensor]],
        fn: Callable[[torch.tensor, torch.tensor, torch.tensor, torch.tensor],
                     None]):
    """Call ``fn(a, w_ref, w_q, w_s)`` once for every 4-tuple in ``weights``.

    The fourth element of each tuple is unpacked but not forwarded to ``fn``.
    """
    for reference, quantized, group_scales, _unused in weights:
        fn(a, reference, quantized, group_scales)
"torch.matmul", lambda: loop_over_weights( + a, + weights, + lambda a, w_ref, w_q, w_s: torch.matmul(a, w_ref), + ))) + + if benchmark_marlinv1: + w_ref = weights[0][0] + + w_zp_empty = torch.empty(0, dtype=torch.int, device=w_ref.device) + sort_indices = torch.empty(0, dtype=torch.int, device=w_ref.device) + g_idx = torch.empty(0, dtype=torch.int, device=w_ref.device) + + def marlinv1_pack_weights(w_q: torch.tensor) -> torch.tensor: + w_q_gptq = gptq_pack(w_q, wtype.size_bits, *w_ref.shape) + return ops.gptq_marlin_repack(w_q_gptq, sort_indices, *w_ref.shape, + wtype.size_bits) + + def marlinv1_permute_scales(w_s: torch.tensor) -> torch.tensor: + return marlin_permute_scales(w_s, *w_ref.shape, group_size) + + weights_marlinv1 = [(w_ref, marlinv1_pack_weights(w_q), + marlinv1_permute_scales(w_s), w_zp) + for w_ref, w_q, w_s, w_zp in weights] + + workspace = MarlinWorkspace(w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N, + GPTQ_MARLIN_MAX_PARALLEL) + + # marlinv1 + timers.append( + bench_fn( + label, sub_label, "marlin_orig", lambda: loop_over_weights( + a, weights_marlinv1, lambda a, w_ref, w_q, w_s: ops. 
+ gptq_marlin_gemm(a, + w_q, + w_s, + w_zp_empty, + g_idx, + sort_indices, + workspace.scratch, + wtype, + size_m=a.shape[0], + size_n=w_ref.shape[1], + size_k=w_ref.shape[0], + is_k_full=True)))) + + # machete + timers.append( + bench_fn( + label, sub_label, "machete_heuristic", lambda: loop_over_weights( + a, weights_machete, lambda a, _, w_q, w_s: ops.machete_gemm( + a, w_q, wtype, b_scales=w_s, b_group_size=group_size)))) + + if sweep_schedules: + print("Finding best schedule for machete") + best = None + best_schedule = None + schedules = ops.machete_supported_schedules(wtype) + for schedule in reversed(schedules): + schedule_M = int(schedule.split("_")[0].split("x")[1]) + + # Prune known bad schedules + if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4: + continue + + def run(a, _, w_q, w_s, schedule=schedule): + ops.machete_gemm(a, + w_q, + wtype, + w_s, + b_group_size=group_size, + schedule=schedule) + + res = bench_fn(label, sub_label, "machete_best", + lambda: loop_over_weights(a, weights_machete, run)) + + results_row = { + "M": m, + "K": k, + "N": n, + "group_size": group_size, + "schedule": schedule, + "median": res.median, + } + if _SWEEP_SCHEDULES_RESULTS is None: + _SWEEP_SCHEDULES_RESULTS = pd.DataFrame( + columns=results_row.keys()) + _SWEEP_SCHEDULES_RESULTS.\ + loc[len(_SWEEP_SCHEDULES_RESULTS)] = results_row + + print(f" {res.median:5.5} ", schedule) + if not best or res.median < best.median: + best = res + best_schedule = schedule + print("Best schedule:", best_schedule) + timers.append(best) + + return timers + + +# runner +def print_timers(timers: Iterable[TMeasurement]): + compare = TBenchmark.Compare(timers) + compare.print() + + +def run(dtype: torch.dtype, sweep_schedules: bool, + MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: + + results = [] + for m, k, n in MKNs: + timers = bench(dtype, + scalar_types.uint4b8, + 128, + m, + k, + n, + f"{dtype}-gemm", + f"MKN=({m}x{k}x{n})", + sweep_schedules=sweep_schedules) + 
print_timers(timers) + results.extend(timers) + + return results + + +# output makers +def make_output( + data: Iterable[TMeasurement], + MKNs: Iterable[Tuple[int, int, int]], + base_description: str, + timestamp=None, +): + + print(f"== All Results {base_description} ====") + print_timers(data) + + # pickle all the results + timestamp = int(time.time()) if timestamp is None else timestamp + with open(f"{base_description}-{timestamp}.pkl", "wb") as f: + pkl.dump(data, f) + + +# argparse runners + + +def run_square_bench(args): + dim_sizes = list( + range(args.dim_start, args.dim_end + 1, args.dim_increment)) + MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) + + data = run(args.dtype, args.sweep_schedules, MKNs) + + make_output(data, MKNs, f"square_bench-{args.dtype}") + + +def run_range_bench(args): + m_start, k_start, n_start = (int(x) for x in args.dim_start.split(",")) + m_end, k_end, n_end = (int(x) for x in args.dim_end.split(",")) + m_increment, k_increment, n_increment = \ + (int(x) for x in args.dim_increment.split(",")) + Ms = list(range(m_start, m_end + 1, m_increment)) + Ks = list(range(k_start, k_end + 1, k_increment)) + Ns = list(range(n_start, n_end + 1, n_increment)) + MKNs = list(product(Ms, Ks, Ns)) + + data = run(args.dtype, args.sweep_schedules, MKNs) + + make_output(data, MKNs, f"range_bench-{args.dtype}") + + +def run_model_bench(args): + + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: + KNs = [] + for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + KNs.append(KN) + return KNs + + model_bench_data = [] + models_tps = list(itertools.product(args.models, args.tp_sizes)) + for model, tp_size in models_tps: + Ms = args.batch_sizes + KNs = model_shapes(model, tp_size) + MKNs = [] + for m in Ms: + for k, n in KNs: + MKNs.append((m, k, n)) + + data = 
run(args.dtype, args.sweep_schedules, MKNs) + model_bench_data.append(data) + + # Print all results + for data, model_tp in zip(model_bench_data, models_tps): + model, tp_size = model_tp + print(f"== Results {args.dtype} {model}-TP{tp_size} ====") + print_timers(data) + + timestamp = int(time.time()) + + all_data = [] + for d in model_bench_data: + all_data.extend(d) + # pickle all data + with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f: + pkl.dump(all_data, f) + + +if __name__ == "__main__": + + def to_torch_dtype(dt): + if dt == "bfloat16": + return torch.bfloat16 + if dt == "float16": + return torch.float16 + raise ValueError("unsupported dtype") + + parser = FlexibleArgumentParser( + description=""" +Benchmark Machete GEMM. + + To run square GEMMs: + python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 square_bench --dim-start 128 --dim-end 512 --dim-increment 64 + + To run constant N and K and sweep M: + python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384 + + To run dimensions from a model: + python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 + + Output: + - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. 
+ """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter, + ) + + parser.add_argument( + "--dtype", + type=to_torch_dtype, + required=True, + help="Available options are ['bfloat16', 'float16']", + ) + parser.add_argument( + "--sweep-schedules", + action="store_true", + help="Run a sweep over all supported schedules", + ) + parser.add_argument("--sweep-csv-out", + help="CSV to store sweep results", + default="sch_sweep_results.csv") + subparsers = parser.add_subparsers(dest="cmd", required=True) + + square_parser = subparsers.add_parser("square_bench") + square_parser.add_argument("--dim-start", type=int, required=True) + square_parser.add_argument("--dim-end", type=int, required=True) + square_parser.add_argument("--dim-increment", type=int, required=True) + square_parser.set_defaults(func=run_square_bench) + + range_parser = subparsers.add_parser("range_bench") + range_parser.add_argument( + "--dim-start", + type=str, + required=True, + help="Start value for M,K,N as common separated list") + range_parser.add_argument( + "--dim-end", + type=str, + required=True, + help="End value (inclusive) for M,K,N as common separated list") + range_parser.add_argument( + "--dim-increment", + type=str, + required=True, + help="Increment value for M,K,N as common separated list") + range_parser.set_defaults(func=run_range_bench) + + model_parser = subparsers.add_parser("model_bench") + model_parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys(), + ) + model_parser.add_argument("--tp-sizes", + nargs="+", + type=int, + default=DEFAULT_TP_SIZES) + model_parser.add_argument("--batch-sizes", + nargs="+", + type=int, + default=DEFAULT_BATCH_SIZES) + model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + + _SWEEP_SCHEDULES_RESULTS_CSV = args.sweep_csv_out + args.func(args) + + if _SWEEP_SCHEDULES_RESULTS is not None: + _SWEEP_SCHEDULES_RESULTS.to_csv(_SWEEP_SCHEDULES_RESULTS_CSV) diff 
--git a/vllm-v0.6.2/benchmarks/kernels/benchmark_marlin.py b/vllm-v0.6.2/benchmarks/kernels/benchmark_marlin.py new file mode 100644 index 0000000..536c133 --- /dev/null +++ b/vllm-v0.6.2/benchmarks/kernels/benchmark_marlin.py @@ -0,0 +1,254 @@ +from typing import List + +import torch +import torch.utils.benchmark as benchmark +from benchmark_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( + GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N, + GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES) +from vllm.model_executor.layers.quantization.utils.marlin_utils import ( + GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, + MARLIN_SUPPORTED_GROUP_SIZES, query_marlin_supported_quant_types) +from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( + MarlinWorkspace, marlin_quantize) +from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( + marlin_24_quantize) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + gptq_pack, gptq_quantize_weights, sort_weights) +from vllm.scalar_type import ScalarType +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] +DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] + +ACT_ORDER_OPTS = [False, True] +K_FULL_OPTS = [False, True] + + +def bench_run(results: List[benchmark.Measurement], model: str, + act_order: bool, is_k_full: bool, quant_type: ScalarType, + group_size: int, size_m: int, size_k: int, size_n: int): + label = "Quant Matmul" + + sub_label = ("{}, act={} k_full={}, q={}, g={}, " + "MKN=({}x{}x{})".format(model, act_order, is_k_full, + str(quant_type), group_size, size_m, + size_k, size_n)) + + print(f"Testing: {sub_label}") + + a = torch.randn(size_m, size_k).to(torch.half).cuda() + b = torch.rand(size_k, size_n).to(torch.half).cuda() + + a_tmp = (torch.zeros(size_m, 
size_k).to(torch.half).cuda()) + + # Marlin quant + ( + marlin_w_ref, + marlin_q_w, + marlin_s, + marlin_g_idx, + marlin_sort_indices, + marlin_rand_perm, + ) = marlin_quantize(b, quant_type, group_size, act_order) + + # Marlin_24 quant + (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, + marlin_24_s) = marlin_24_quantize(b, quant_type, group_size) + + marlin_zp = torch.empty(0, dtype=torch.int, device=b.device) + + # GPTQ quant + (w_ref, q_w, s, g_idx, + rand_perm) = gptq_quantize_weights(b, quant_type, group_size, act_order) + q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n) + + # For act_order, sort the "weights" and "g_idx" + # so that group ids are increasing + repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device) + if act_order: + (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx) + + # Prepare + marlin_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N, + GPTQ_MARLIN_MAX_PARALLEL) + + marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N, + GPTQ_MARLIN_24_MAX_PARALLEL) + marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int) + + globals = { + # Gen params + "quant_type": quant_type, + "group_size": group_size, + "size_m": size_m, + "size_n": size_n, + "size_k": size_k, + "a": a, + "a_tmp": a_tmp, + # Marlin params + "marlin_w_ref": marlin_w_ref, + "marlin_q_w": marlin_q_w, + "marlin_s": marlin_s, + "marlin_zp": marlin_zp, + "marlin_g_idx": marlin_g_idx, + "marlin_sort_indices": marlin_sort_indices, + "marlin_rand_perm": marlin_rand_perm, + "marlin_workspace": marlin_workspace, + "is_k_full": is_k_full, + # Marlin_24 params + "marlin_24_w_ref": marlin_24_w_ref, + "marlin_24_q_w_comp": marlin_24_q_w_comp, + "marlin_24_meta": marlin_24_meta, + "marlin_24_s": marlin_24_s, + "marlin_24_workspace": marlin_24_workspace, + # GPTQ params + "q_w_gptq": q_w_gptq, + "repack_sort_indices": repack_sort_indices, + # Kernels + "gptq_marlin_gemm": ops.gptq_marlin_gemm, + "gptq_marlin_24_gemm": 
ops.gptq_marlin_24_gemm, + "gptq_marlin_repack": ops.gptq_marlin_repack, + } + + min_run_time = 1 + + # Warmup pytorch + for i in range(5): + torch.matmul(a, marlin_w_ref) + + results.append( + benchmark.Timer( + stmt="torch.matmul(a, marlin_w_ref)", + globals=globals, + label=label, + sub_label=sub_label, + description="pytorch_gemm", + ).blocked_autorange(min_run_time=min_run_time)) + + results.append( + benchmark.Timer( + stmt= + "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="gptq_marlin_gemm_fp16", + ).blocked_autorange(min_run_time=min_run_time)) + + results.append( + benchmark.Timer( + stmt= + "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="gptq_marlin_gemm_fp32", + ).blocked_autorange(min_run_time=min_run_time)) + + if (quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES + and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES): + results.append( + benchmark.Timer( + stmt= + "output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="gptq_marlin_24_gemm", + ).blocked_autorange(min_run_time=min_run_time)) + + results.append( + benchmark.Timer( + stmt= + "q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="gptq_marlin_repack", + ).blocked_autorange(min_run_time=min_run_time)) + + +def main(args): + print("Benchmarking 
models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + results: List[benchmark.Measurement] = [] + + for model in args.models: + for layer in WEIGHT_SHAPES[model]: + size_k = layer[0] + size_n = layer[1] + + if len(args.limit_k) > 0 and size_k not in args.limit_k: + continue + + if len(args.limit_n) > 0 and size_n not in args.limit_n: + continue + + for act_order in ACT_ORDER_OPTS: + if len(args.limit_act_order + ) > 0 and act_order not in args.limit_act_order: + continue + + for is_k_full in K_FULL_OPTS: + if len(args.limit_k_full + ) > 0 and is_k_full not in args.limit_k_full: + continue + + for quant_type in query_marlin_supported_quant_types( + False): + if len(args.limit_num_bits) > 0 and \ + quant_type.size_bits not in args.limit_num_bits: + continue + + for group_size in MARLIN_SUPPORTED_GROUP_SIZES: + if len( + args.limit_group_size + ) > 0 and group_size not in args.limit_group_size: + continue + + # For act_order, the group_size must be less than + # size_k + if act_order and (group_size == size_k + or group_size == -1): + continue + + for size_m in args.batch_sizes: + bench_run(results, model, act_order, is_k_full, + quant_type, group_size, size_m, + size_k, size_n) + + compare = benchmark.Compare(results) + compare.print() + + +# For quick benchmarking use: +# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501 +# +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark Marlin across specified models/shapes/batches") + parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys(), + ) + parser.add_argument("--batch-sizes", + nargs="+", + type=int, + default=DEFAULT_BATCH_SIZES) + parser.add_argument("--limit-k", nargs="+", type=int, default=[]) + parser.add_argument("--limit-n", nargs="+", type=int, default=[]) + 
parser.add_argument("--limit-group-size", nargs="+", type=int, default=[]) + parser.add_argument("--limit-num-bits", nargs="+", type=int, default=[]) + parser.add_argument("--limit-act-order", nargs="+", type=int, default=[]) + parser.add_argument("--limit-k-full", nargs="+", type=int, default=[]) + + args = parser.parse_args() + main(args) diff --git a/vllm-v0.6.2/benchmarks/kernels/benchmark_moe.py b/vllm-v0.6.2/benchmarks/kernels/benchmark_moe.py new file mode 100644 index 0000000..8f538c2 --- /dev/null +++ b/vllm-v0.6.2/benchmarks/kernels/benchmark_moe.py @@ -0,0 +1,367 @@ +import argparse +import time +from datetime import datetime +from typing import Any, Dict, List, Tuple, TypedDict + +import ray +import torch +import triton +from ray.experimental.tqdm_ray import tqdm +from transformers import AutoConfig + +from vllm.model_executor.layers.fused_moe.fused_moe import * +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser + + +class BenchmarkConfig(TypedDict): + BLOCK_SIZE_M: int + BLOCK_SIZE_N: int + BLOCK_SIZE_K: int + GROUP_SIZE_M: int + num_warps: int + num_stages: int + + +def benchmark_config( + config: BenchmarkConfig, + num_tokens: int, + num_experts: int, + shard_intermediate_size: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + num_iters: int = 100, +) -> float: + init_dtype = torch.float16 if use_fp8_w8a8 else dtype + x = torch.randn(num_tokens, hidden_size, dtype=dtype) + if use_int8_w8a16: + w1 = torch.randint(-127, + 127, ( + num_experts, + shard_intermediate_size, + hidden_size, + ), + dtype=torch.int8) + w2 = torch.randint(-127, + 127, ( + num_experts, + hidden_size, + shard_intermediate_size // 2, + ), + dtype=torch.int8) + else: + w1 = torch.randn(num_experts, + shard_intermediate_size, + hidden_size, + dtype=init_dtype) + w2 = torch.randn(num_experts, + hidden_size, + shard_intermediate_size // 2, + dtype=init_dtype) + gating_output = 
torch.randn(num_iters, + num_tokens, + num_experts, + dtype=torch.float32) + + w1_scale = None + w2_scale = None + a1_scale = None + a2_scale = None + if use_int8_w8a16: + w1_scale = torch.randn((num_experts, 2 * shard_intermediate_size), + dtype=torch.float32) + w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32) + if use_fp8_w8a8: + w1_scale = torch.randn(num_experts, dtype=torch.float32) + w2_scale = torch.randn(num_experts, dtype=torch.float32) + a1_scale = torch.randn(1, dtype=torch.float32) + a2_scale = torch.randn(1, dtype=torch.float32) + + w1 = w1.to(torch.float8_e4m3fn) + w2 = w2.to(torch.float8_e4m3fn) + + input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32) + + def prepare(i: int): + input_gating.copy_(gating_output[i]) + + def run(): + from vllm.model_executor.layers.fused_moe import override_config + with override_config(config): + fused_moe( + x, + w1, + w2, + input_gating, + topk, + renormalize=True, + inplace=True, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + ) + + # JIT compilation & warmup + run() + torch.cuda.synchronize() + + # Capture 10 invocations with CUDA graph + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph): + for _ in range(10): + run() + torch.cuda.synchronize() + + # Warmup + for _ in range(5): + graph.replay() + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + latencies: List[float] = [] + for i in range(num_iters): + prepare(i) + torch.cuda.synchronize() + + start_event.record() + graph.replay() + end_event.record() + end_event.synchronize() + latencies.append(start_event.elapsed_time(end_event)) + avg = sum(latencies) / (num_iters * 10) * 1000 # us + graph.reset() + return avg + + +def get_configs_compute_bound() -> List[Dict[str, int]]: + # Reduced search space for faster tuning. 
+ # TODO(woosuk): Increase the search space and use a performance model to + # prune the search space. + configs: List[BenchmarkConfig] = [] + for num_stages in [2, 3, 4, 5]: + for block_m in [16, 32, 64, 128, 256]: + for block_k in [64, 128, 256]: + for block_n in [32, 64, 128, 256]: + for num_warps in [4, 8]: + for group_size in [1, 16, 32, 64]: + configs.append({ + "BLOCK_SIZE_M": block_m, + "BLOCK_SIZE_N": block_n, + "BLOCK_SIZE_K": block_k, + "GROUP_SIZE_M": group_size, + "num_warps": num_warps, + "num_stages": num_stages, + }) + return configs + + +@ray.remote(num_gpus=1) +class BenchmarkWorker: + + def __init__(self, seed: int) -> None: + torch.set_default_device("cuda") + current_platform.seed_everything(seed) + self.seed = seed + + def benchmark( + self, + num_tokens: int, + num_experts: int, + shard_intermediate_size: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + ) -> Tuple[Dict[str, int], float]: + current_platform.seed_everything(self.seed) + dtype_str = get_config_dtype_str(dtype, + use_int8_w8a16=use_int8_w8a16, + use_fp8_w8a8=use_fp8_w8a8) + # NOTE(woosuk): The current naming convention uses w2.shape[2], which + # is the intermediate size after silu_and_mul. 
+ op_config = get_moe_configs(num_experts, shard_intermediate_size // 2, + dtype_str) + if op_config is None: + config = get_default_config(num_tokens, num_experts, + shard_intermediate_size, hidden_size, + topk, dtype_str) + else: + config = op_config[min(op_config.keys(), + key=lambda x: abs(x - num_tokens))] + kernel_time = benchmark_config(config, num_tokens, num_experts, + shard_intermediate_size, hidden_size, + topk, dtype, use_fp8_w8a8, + use_int8_w8a16) + return config, kernel_time + + def tune( + self, + num_tokens: int, + num_experts: int, + shard_intermediate_size: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + search_space: List[Dict[str, int]], + ) -> Dict[str, int]: + best_config = None + best_time = float("inf") + for config in tqdm(search_space): + try: + kernel_time = benchmark_config(config, + num_tokens, + num_experts, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=10) + except triton.runtime.autotuner.OutOfResources: + # Some configurations may be invalid and fail to compile. 
+ continue + + if kernel_time < best_time: + best_time = kernel_time + best_config = config + now = datetime.now() + print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}") + assert best_config is not None + return best_config + + +def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: + return { + "BLOCK_SIZE_M": config["BLOCK_SIZE_M"], + "BLOCK_SIZE_N": config["BLOCK_SIZE_N"], + "BLOCK_SIZE_K": config["BLOCK_SIZE_K"], + "GROUP_SIZE_M": config["GROUP_SIZE_M"], + "num_warps": config["num_warps"], + "num_stages": config["num_stages"], + } + + +def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int, + shard_intermediate_size: int, hidden_size: int, topk: int, + dtype: torch.dtype, use_fp8_w8a8: bool, + use_int8_w8a16: bool) -> None: + dtype_str = get_config_dtype_str(dtype, + use_int8_w8a16=use_int8_w8a16, + use_fp8_w8a8=use_fp8_w8a8) + + # NOTE(woosuk): The current naming convention uses w2.shape[2], which + # is the intermediate size after silu_and_mul. + filename = get_config_file_name(num_experts, shard_intermediate_size // 2, + dtype_str) + + print(f"Writing best config to {filename}...") + with open(filename, "w") as f: + json.dump(configs, f, indent=4) + f.write("\n") + + +def main(args: argparse.Namespace): + print(args) + + config = AutoConfig.from_pretrained(args.model) + if config.architectures[0] == "DbrxForCausalLM": + E = config.ffn_config.moe_num_experts + topk = config.ffn_config.moe_top_k + intermediate_size = config.ffn_config.ffn_hidden_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size + elif config.architectures[0] == "JambaForCausalLM": + E = config.num_experts + topk = config.num_experts_per_tok + intermediate_size = config.intermediate_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size + else: + # Default: Mixtral. 
+ E = config.num_local_experts + topk = config.num_experts_per_tok + intermediate_size = config.intermediate_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size + + hidden_size = config.hidden_size + dtype = config.torch_dtype + use_fp8_w8a8 = args.dtype == "fp8_w8a8" + use_int8_w8a16 = args.dtype == "int8_w8a16" + + if args.batch_size is None: + batch_sizes = [ + 1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536, + 2048, 3072, 4096 + ] + else: + batch_sizes = [args.batch_size] + + ray.init() + num_gpus = int(ray.available_resources()["GPU"]) + workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)] + + def _distribute(method: str, inputs: List[Any]) -> List[Any]: + outputs = [] + worker_idx = 0 + for input_args in inputs: + worker = workers[worker_idx] + worker_method = getattr(worker, method) + output = worker_method.remote(*input_args) + outputs.append(output) + worker_idx = (worker_idx + 1) % num_gpus + return ray.get(outputs) + + if args.tune: + search_space = get_configs_compute_bound() + print(f"Start tuning over {len(search_space)} configurations...") + + start = time.time() + configs = _distribute( + "tune", [(batch_size, E, shard_intermediate_size, hidden_size, + topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space) + for batch_size in batch_sizes]) + best_configs = { + M: sort_config(config) + for M, config in zip(batch_sizes, configs) + } + save_configs(best_configs, E, shard_intermediate_size, hidden_size, + topk, dtype, use_fp8_w8a8, use_int8_w8a16) + end = time.time() + print(f"Tuning took {end - start:.2f} seconds") + else: + outputs = _distribute( + "benchmark", [(batch_size, E, shard_intermediate_size, hidden_size, + topk, dtype, use_fp8_w8a8, use_int8_w8a16) + for batch_size in batch_sizes]) + + for batch_size, (config, kernel_time) in zip(batch_sizes, outputs): + print(f"Batch size: {batch_size}, config: {config}") + print(f"Kernel time: {kernel_time:.2f} us") + + +if __name__ == "__main__": + 
parser = FlexibleArgumentParser() + parser.add_argument("--model", + type=str, + default="mistralai/Mixtral-8x7B-Instruct-v0.1") + parser.add_argument("--tp-size", "-tp", type=int, default=2) + parser.add_argument("--dtype", + type=str, + choices=["auto", "fp8_w8a8", "int8_w8a16"], + default="auto") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--batch-size", type=int, required=False) + parser.add_argument("--tune", action="store_true") + args = parser.parse_args() + + main(args) diff --git a/vllm-v0.6.2/benchmarks/kernels/benchmark_paged_attention.py b/vllm-v0.6.2/benchmarks/kernels/benchmark_paged_attention.py new file mode 100644 index 0000000..e5fc197 --- /dev/null +++ b/vllm-v0.6.2/benchmarks/kernels/benchmark_paged_attention.py @@ -0,0 +1,234 @@ +import random +import time +from typing import List, Optional + +import torch +from vllm import _mlu_ops as mlu_ops +from vllm import _custom_ops as ops +from vllm.platforms import current_platform +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, + create_kv_caches_with_random) +import torch_mlu.utils.gpu_migration + +NUM_BLOCKS = 1024 +PARTITION_SIZE = 512 + + +@torch.inference_mode() +def main( + version: str, + num_seqs: int, + seq_len: int, + num_query_heads: int, + num_kv_heads: int, + head_size: int, + use_alibi: bool, + block_size: int, + dtype: torch.dtype, + seed: int, + do_profile: bool, + device: str = "cuda", + kv_cache_dtype: Optional[str] = None, +) -> None: + current_platform.seed_everything(seed) + + scale = float(1.0 / (head_size**0.5)) + query = torch.empty(num_seqs, + num_query_heads, + head_size, + dtype=dtype, + device=device) + query.uniform_(-scale, scale) + + assert num_query_heads % num_kv_heads == 0 + alibi_slopes = None + if use_alibi: + alibi_slopes = torch.randn(num_query_heads, + dtype=torch.float, + device=device) + + seq_lens = [seq_len for _ in range(num_seqs)] + max_seq_len = max(seq_lens) + seq_lens = torch.tensor(seq_lens, 
dtype=torch.int, device=device) + + # Create the block tables. + max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size + block_tables_lst: List[List[int]] = [] + for _ in range(num_seqs): + block_table = [ + random.randint(0, NUM_BLOCKS - 1) + for _ in range(max_num_blocks_per_seq) + ] + block_tables_lst.append(block_table) + + block_tables = torch.tensor(block_tables_lst, + dtype=torch.int, + device=device) + + # Create the KV cache. + key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS, + block_size, + 1, + num_kv_heads, + head_size, + kv_cache_dtype, + dtype, + device=device) + key_cache, value_cache = key_caches[0], value_caches[0] + + if version == "tmo": + key_cache = key_cache.reshape(NUM_BLOCKS, num_kv_heads, block_size, head_size) + value_cache = value_cache.reshape(NUM_BLOCKS, num_kv_heads, block_size, head_size) + # Prepare for the paged attention kernel. + output = torch.empty_like(query) + if version == "v2": + num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE) + tmp_output = torch.empty( + size=(num_seqs, num_query_heads, num_partitions, head_size), + dtype=output.dtype, + device=output.device, + ) + exp_sums = torch.empty( + size=(num_seqs, num_query_heads, num_partitions), + dtype=torch.float32, + device=output.device, + ) + max_logits = torch.empty_like(exp_sums) + + def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: + if version == "tmo": + decode_query = query.view(-1, 1, num_query_heads, head_size) + decode_out = torch.empty_like(decode_query) + + torch.cuda.synchronize() + if profile: + torch.cuda.cudart().cudaProfilerStart() + start_time = time.perf_counter() + + # Using default kv_scale + k_scale = v_scale = 1.0 + + for _ in range(num_iters): + if version == "v1": + ops.paged_attention_v1( + output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + block_size, + max_seq_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, 
+ ) + elif version == "v2": + ops.paged_attention_v2( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + block_size, + max_seq_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, + ) + elif version == "tmo": + mlu_ops.single_query_cached_kv_attn(decode_query, + key_cache, + value_cache, + decode_out, + block_tables, + seq_lens, + None, #k_cache_quant_scale + None, #v_cache_quant_scale + alibi_slopes, + max_seq_len, + -1, # windows_size_left + -1, # windows_size_right + scale) + else: + raise ValueError(f"Invalid version: {version}") + torch.cuda.synchronize() + + end_time = time.perf_counter() + if profile: + torch.cuda.cudart().cudaProfilerStart() + return (end_time - start_time) / num_iters + + # Warmup. + print("Warming up...") + run_benchmark = run_cuda_benchmark + run_benchmark(num_iters=3, profile=False) + + # Benchmark. + if do_profile: + latency = run_benchmark(num_iters=1, profile=True) + else: + latency = run_benchmark(num_iters=100, profile=False) + print(f"Kernel running time: {latency * 1000000:.3f} us") + + +if __name__ == '__main__': + parser = FlexibleArgumentParser( + description="Benchmark the paged attention kernel.") + parser.add_argument("--version", + type=str, + choices=["v1", "v2", "tmo"], + default="tmo") + parser.add_argument("--batch-size", type=int, default=8) + parser.add_argument("--seq-len", type=int, default=4096) + parser.add_argument("--num-query-heads", type=int, default=64) + parser.add_argument("--num-kv-heads", type=int, default=8) + parser.add_argument("--head-size", + type=int, + choices=[64, 80, 96, 112, 120, 128, 192, 256], + default=128) + parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) + parser.add_argument("--use-alibi", action="store_true") + parser.add_argument("--dtype", + type=str, + choices=["half", "bfloat16", "float"], + default="half") + parser.add_argument("--seed", type=int, default=0) 
+ parser.add_argument("--profile", action="store_true") + parser.add_argument( + "--kv-cache-dtype", + type=str, + choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"], + default="auto", + help="Data type for kv cache storage. If 'auto', will use model " + "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. " + "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)") + args = parser.parse_args() + print(args) + + if args.num_query_heads % args.num_kv_heads != 0: + raise ValueError("num_query_heads must be divisible by num_kv_heads") + main( + version=args.version, + num_seqs=args.batch_size, + seq_len=args.seq_len, + num_query_heads=args.num_query_heads, + num_kv_heads=args.num_kv_heads, + head_size=args.head_size, + block_size=args.block_size, + use_alibi=args.use_alibi, + dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], + seed=args.seed, + do_profile=args.profile, + kv_cache_dtype=args.kv_cache_dtype, + ) diff --git a/vllm-v0.6.2/benchmarks/kernels/benchmark_quant.py b/vllm-v0.6.2/benchmarks/kernels/benchmark_quant.py new file mode 100644 index 0000000..1d62483 --- /dev/null +++ b/vllm-v0.6.2/benchmarks/kernels/benchmark_quant.py @@ -0,0 +1,100 @@ +import time + +import torch + +from vllm import _custom_ops as ops +from vllm.platforms import current_platform +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser + + +@torch.inference_mode() +def main(num_tokens: int, + hidden_size: int, + static_scale: bool, + quant_dtype: torch.dtype, + dtype: torch.dtype, + seed: int = 0, + do_profile: bool = False, + num_warmup_iters: int = 5, + num_iters: int = 100) -> None: + current_platform.seed_everything(seed) + torch.set_default_device("cuda") + + x = torch.randn(num_tokens, hidden_size, dtype=dtype) + scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None + + def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: + torch.cuda.synchronize() + if profile: + torch.cuda.cudart().cudaProfilerStart() + start_time = time.perf_counter() + 
+ for _ in range(num_iters): + if quant_dtype == torch.int8: + ops.scaled_int8_quant(x, scale) + else: + ops.scaled_fp8_quant(x, scale) + torch.cuda.synchronize() + + end_time = time.perf_counter() + if profile: + torch.cuda.cudart().cudaProfilerStart() + return (end_time - start_time) / num_iters + + # Warmup. + print("Warming up...") + run_benchmark = run_cuda_benchmark + run_benchmark(num_iters=num_warmup_iters, profile=False) + + # Benchmark. + if do_profile: + latency = run_benchmark(num_iters=1, profile=True) + else: + latency = run_benchmark(num_iters=num_iters, profile=False) + print(f"Kernel running time: {latency * 1000000:.3f} us") + + +if __name__ == '__main__': + + def to_torch_dtype(dt): + if dt == "int8": + return torch.int8 + if dt == "fp8": + return torch.float8_e4m3fn + raise ValueError(f"Unsupported dtype: {dt}") + + parser = FlexibleArgumentParser( + description="Benchmark the quantization (fp8 or int8) kernel.") + parser.add_argument("--num-tokens", type=int, default=4096) + parser.add_argument("--hidden-size", type=int, default=8192) + parser.add_argument("--static-scale", action="store_true") + parser.add_argument("--quant-dtype", + type=str, + choices=["fp8", "int8"], + default="int8") + parser.add_argument("--dtype", + type=str, + choices=["half", "bfloat16", "float"], + default="half") + + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--profile", action="store_true") + parser.add_argument("--num-warmup-iters", type=int, default=5) + parser.add_argument("--num-iters", + type=int, + default=100, + help="Number of benchmark iterations. 
" + "If --profile is set, this number is ignored") + + args = parser.parse_args() + print(args) + + main(num_tokens=args.num_tokens, + hidden_size=args.hidden_size, + static_scale=args.static_scale, + quant_dtype=to_torch_dtype(args.quant_dtype), + dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], + seed=args.seed, + do_profile=args.profile, + num_warmup_iters=args.num_warmup_iters, + num_iters=args.num_iters) diff --git a/vllm-v0.6.2/benchmarks/kernels/benchmark_rope.py b/vllm-v0.6.2/benchmarks/kernels/benchmark_rope.py new file mode 100644 index 0000000..250d505 --- /dev/null +++ b/vllm-v0.6.2/benchmarks/kernels/benchmark_rope.py @@ -0,0 +1,121 @@ +from itertools import accumulate +from typing import List, Optional + +import nvtx +import torch + +from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding, + get_rope) +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser + + +def benchmark_rope_kernels_multi_lora( + is_neox_style: bool, + batch_size: int, + seq_len: int, + num_heads: int, + head_size: int, + rotary_dim: Optional[int], + dtype: torch.dtype, + seed: int, + device: str, + max_position: int = 8192, + base: int = 10000, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device(device) + if rotary_dim is None: + rotary_dim = head_size + # silulating serving 4 LoRAs + scaling_factors = [1, 2, 4, 8] + # batched RoPE can take multiple scaling factors + batched_rope = get_rope(head_size, rotary_dim, max_position, base, + is_neox_style, { + "rope_type": "linear", + "factor": tuple(scaling_factors) + }) + # non-batched RoPE takes only one scaling factor, we create multiple + # instances to simulate the same behavior + non_batched_ropes: List[RotaryEmbedding] = [] + for scaling_factor in scaling_factors: + non_batched_ropes.append( + get_rope(head_size, rotary_dim, max_position, base, is_neox_style, + { + "rope_type": "linear", + "factor": (scaling_factor, ) + })) + + positions = 
torch.randint(0, max_position, (batch_size, seq_len)) + query = torch.randn(batch_size, + seq_len, + num_heads * head_size, + dtype=dtype) + key = torch.randn_like(query) + + # create query offsets for batched RoPE, we concat multiple kv cache + # together and each query needs to find the right kv cache of its type + offset_map = torch.tensor( + list( + accumulate([0] + [ + max_position * scaling_factor * 2 + for scaling_factor in scaling_factors[:-1] + ]))) + query_types = torch.randint(0, + len(scaling_factors), (batch_size, seq_len), + device=device) + # map query types to offsets + query_offsets = offset_map[query_types] + # the kernel takes flattened offsets + flatten_offsets = query_offsets.flatten() + + # batched queries of the same type together for non-batched RoPE + queries = [query[query_types == i] for i in range(len(scaling_factors))] + keys = [key[query_types == i] for i in range(len(scaling_factors))] + packed_qkr = zip(queries, keys, non_batched_ropes) + # synchronize before start timing + torch.cuda.synchronize() + with nvtx.annotate("non-batched", color="yellow"): + for q, k, r in packed_qkr: + r.forward(positions, q, k) + torch.cuda.synchronize() + with nvtx.annotate("batched", color="green"): + batched_rope.forward(positions, query, key, flatten_offsets) + torch.cuda.synchronize() + + +if __name__ == '__main__': + parser = FlexibleArgumentParser( + description="Benchmark the rotary embedding kernels.") + parser.add_argument("--is-neox-style", type=bool, default=True) + parser.add_argument("--batch-size", type=int, default=16) + parser.add_argument("--seq-len", type=int, default=512) + parser.add_argument("--num-heads", type=int, default=8) + parser.add_argument("--head-size", + type=int, + choices=[64, 80, 96, 112, 120, 128, 192, 256], + default=128) + parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32) + parser.add_argument("--dtype", + type=str, + choices=["bfloat16", "float"], + default="float") + 
parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--device", + type=str, + choices=["cuda:0", "cuda:1"], + default="cuda:0") + args = parser.parse_args() + print(args) + + benchmark_rope_kernels_multi_lora( + is_neox_style=args.is_neox_style, + batch_size=args.batch_size, + seq_len=args.seq_len, + num_heads=args.num_heads, + head_size=args.head_size, + rotary_dim=args.rotary_dim, + dtype=getattr(torch, args.dtype), + seed=args.seed, + device=args.device, + ) diff --git a/vllm-v0.6.2/benchmarks/kernels/benchmark_shapes.py b/vllm-v0.6.2/benchmarks/kernels/benchmark_shapes.py new file mode 100644 index 0000000..4eeeca3 --- /dev/null +++ b/vllm-v0.6.2/benchmarks/kernels/benchmark_shapes.py @@ -0,0 +1,75 @@ +WEIGHT_SHAPES = { + "ideal": [[4 * 256 * 32, 256 * 32]], + "mistralai/Mistral-7B-v0.1/TP1": [ + [4096, 6144], + [4096, 4096], + [4096, 28672], + [14336, 4096], + ], + "mistralai/Mistral-7B-v0.1/TP2": [ + [4096, 3072], + [2048, 4096], + [4096, 14336], + [7168, 4096], + ], + "mistralai/Mistral-7B-v0.1/TP4": [ + [4096, 1536], + [1024, 4096], + [4096, 7168], + [3584, 4096], + ], + "meta-llama/Llama-2-7b-hf/TP1": [ + [4096, 12288], + [4096, 4096], + [4096, 22016], + [11008, 4096], + ], + "meta-llama/Llama-2-7b-hf/TP2": [ + [4096, 6144], + [2048, 4096], + [4096, 11008], + [5504, 4096], + ], + "meta-llama/Llama-2-7b-hf/TP4": [ + [4096, 3072], + [1024, 4096], + [4096, 5504], + [2752, 4096], + ], + "meta-llama/Llama-2-13b-hf/TP1": [ + [5120, 15360], + [5120, 5120], + [5120, 27648], + [13824, 5120], + ], + "meta-llama/Llama-2-13b-hf/TP2": [ + [5120, 7680], + [2560, 5120], + [5120, 13824], + [6912, 5120], + ], + "meta-llama/Llama-2-13b-hf/TP4": [ + [5120, 3840], + [1280, 5120], + [5120, 6912], + [3456, 5120], + ], + "meta-llama/Llama-2-70b-hf/TP1": [ + [8192, 10240], + [8192, 8192], + [8192, 57344], + [28672, 8192], + ], + "meta-llama/Llama-2-70b-hf/TP2": [ + [8192, 5120], + [4096, 8192], + [8192, 28672], + [14336, 8192], + ], + 
"meta-llama/Llama-2-70b-hf/TP4": [ + [8192, 2560], + [2048, 8192], + [8192, 14336], + [7168, 8192], + ], +} diff --git a/vllm-v0.6.2/benchmarks/kernels/graph_machete_bench.py b/vllm-v0.6.2/benchmarks/kernels/graph_machete_bench.py new file mode 100644 index 0000000..de608fd --- /dev/null +++ b/vllm-v0.6.2/benchmarks/kernels/graph_machete_bench.py @@ -0,0 +1,62 @@ +import math +import pickle +import re +from collections import defaultdict +from typing import List + +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns +from torch.utils.benchmark import Measurement as TMeasurement + +from vllm.utils import FlexibleArgumentParser + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Benchmark the latency of processing a single batch of ' + 'requests till completion.') + parser.add_argument('filename', type=str) + + args = parser.parse_args() + + with open(args.filename, 'rb') as f: + data: List[TMeasurement] = pickle.load(f) + + results = defaultdict(lambda: list()) + for v in data: + result = re.search(r"MKN=\(\d+x(\d+x\d+)\)", v.task_spec.sub_label) + if result is not None: + KN = result.group(1) + else: + raise Exception("MKN not found") + result = re.search(r"MKN=\((\d+)x\d+x\d+\)", v.task_spec.sub_label) + if result is not None: + M = result.group(1) + else: + raise Exception("MKN not found") + + kernel = v.task_spec.description + results[KN].append({ + "kernel": kernel, + "batch_size": M, + "median": v.median + }) + + rows = int(math.ceil(len(results) / 2)) + fig, axs = plt.subplots(rows, 2, figsize=(12, 5 * rows)) + axs = axs.flatten() + for axs_idx, (shape, data) in enumerate(results.items()): + plt.sca(axs[axs_idx]) + df = pd.DataFrame(data) + sns.lineplot(data=df, + x="batch_size", + y="median", + hue="kernel", + style="kernel", + markers=True, + dashes=False, + palette="Dark2") + plt.title(f"Shape: {shape}") + plt.ylabel("time (median, s)") + plt.tight_layout() + plt.savefig("graph_machete_bench.pdf") diff 
--git a/vllm-v0.6.2/benchmarks/kernels/requirements.txt b/vllm-v0.6.2/benchmarks/kernels/requirements.txt new file mode 100644 index 0000000..1411a4a --- /dev/null +++ b/vllm-v0.6.2/benchmarks/kernels/requirements.txt @@ -0,0 +1 @@ +pandas \ No newline at end of file diff --git a/vllm-v0.6.2/benchmarks/kernels/weight_shapes.py b/vllm-v0.6.2/benchmarks/kernels/weight_shapes.py new file mode 100644 index 0000000..25ec9d6 --- /dev/null +++ b/vllm-v0.6.2/benchmarks/kernels/weight_shapes.py @@ -0,0 +1,43 @@ +# Weight Shapes are in the format +# ([K, N], TP_SPLIT_DIM) +# Example: +# A shape of ([14336, 4096], 0) indicates the following GEMM shape, +# - TP1 : K = 14336, N = 4096 +# - TP2 : K = 7168, N = 4096 +# A shape of ([4096, 6144], 1) indicates the following GEMM shape, +# - TP1 : K = 4096, N = 6144 +# - TP4 : K = 4096, N = 1536 + +# TP1 shapes +WEIGHT_SHAPES = { + "mistralai/Mistral-7B-v0.1": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-2-7b-hf": [ + ([4096, 12288], 1), + ([4096, 4096], 0), + ([4096, 22016], 1), + ([11008, 4096], 0), + ], + "meta-llama/Llama-3-8b": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-2-13b-hf": [ + ([5120, 15360], 1), + ([5120, 5120], 0), + ([5120, 27648], 1), + ([13824, 5120], 0), + ], + "meta-llama/Llama-2-70b-hf": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 57344], 1), + ([28672, 8192], 0), + ], +} diff --git a/vllm-v0.6.2/benchmarks/launch_tgi_server.sh b/vllm-v0.6.2/benchmarks/launch_tgi_server.sh new file mode 100755 index 0000000..ba7383d --- /dev/null +++ b/vllm-v0.6.2/benchmarks/launch_tgi_server.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +PORT=8000 +MODEL=$1 +TOKENS=$2 + +docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \ + -v "$PWD/data:/data" \ + ghcr.io/huggingface/text-generation-inference:2.2.0 \ + --model-id "$MODEL" \ + --sharded false \ + --max-input-length 1024 \ 
+ --max-total-tokens 2048 \ + --max-best-of 5 \ + --max-concurrent-requests 5000 \ + --max-batch-total-tokens "$TOKENS" diff --git a/vllm-v0.6.2/benchmarks/overheads/benchmark_hashing.py b/vllm-v0.6.2/benchmarks/overheads/benchmark_hashing.py new file mode 100644 index 0000000..d16d6f9 --- /dev/null +++ b/vllm-v0.6.2/benchmarks/overheads/benchmark_hashing.py @@ -0,0 +1,59 @@ +import cProfile +import pstats + +from vllm import LLM, SamplingParams +from vllm.utils import FlexibleArgumentParser + +# A very long prompt, total number of tokens is about 15k. +LONG_PROMPT = ["You are an expert in large language models, aren't you?" + ] * 1000 +LONG_PROMPT = ' '.join(LONG_PROMPT) + + +def main(args): + llm = LLM( + model=args.model, + enforce_eager=True, + enable_prefix_caching=True, + tensor_parallel_size=args.tensor_parallel_size, + ) + + sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) + profiler = cProfile.Profile() + + print("------warm up------") + for i in range(3): + output = llm.generate(LONG_PROMPT, sampling_params) + print(output[0].outputs[0].text) + + print("------start generating------") + for i in range(3): + profiler.runctx('llm.generate(LONG_PROMPT, sampling_params)', + globals(), locals()) + + # analyze the runtime of hashing function + stats = pstats.Stats(profiler) + stats.sort_stats('cumulative') + total_time = 0 + total_calls = 0 + for func in stats.stats: + if 'hash_of_block' in func[2]: + total_time = stats.stats[func][3] + total_calls = stats.stats[func][0] + percentage = (total_time / stats.total_tt) * 100 + print(f"Hashing took {total_time:.2f} seconds," + f"{percentage:.2f}% of the total runtime.") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Benchmark the performance of hashing function in' + 'automatic prefix caching.') + parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k') + parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) + 
parser.add_argument('--output-len', type=int, default=10) + parser.add_argument('--enable-prefix-caching', + action='store_true', + help='enable prefix caching') + args = parser.parse_args() + main(args) diff --git a/vllm-v0.6.2/benchmarks/sonnet.txt b/vllm-v0.6.2/benchmarks/sonnet.txt new file mode 100644 index 0000000..34c444e --- /dev/null +++ b/vllm-v0.6.2/benchmarks/sonnet.txt @@ -0,0 +1,518 @@ +FROM fairest creatures we desire increase, +That thereby beauty's rose might never die, +But as the riper should by time decease, +His tender heir might bear his memory: +But thou, contracted to thine own bright eyes, +Feed'st thy light'st flame with self-substantial fuel, +Making a famine where abundance lies, +Thyself thy foe, to thy sweet self too cruel. +Thou that art now the world's fresh ornament +And only herald to the gaudy spring, +Within thine own bud buriest thy content +And, tender churl, makest waste in niggarding. +Pity the world, or else this glutton be, +To eat the world's due, by the grave and thee. +When forty winters shall beseige thy brow, +And dig deep trenches in thy beauty's field, +Thy youth's proud livery, so gazed on now, +Will be a tatter'd weed, of small worth held: +Then being ask'd where all thy beauty lies, +Where all the treasure of thy lusty days, +To say, within thine own deep-sunken eyes, +Were an all-eating shame and thriftless praise. +How much more praise deserved thy beauty's use, +If thou couldst answer 'This fair child of mine +Shall sum my count and make my old excuse,' +Proving his beauty by succession thine! +This were to be new made when thou art old, +And see thy blood warm when thou feel'st it cold. +Look in thy glass, and tell the face thou viewest +Now is the time that face should form another; +Whose fresh repair if now thou not renewest, +Thou dost beguile the world, unbless some mother. +For where is she so fair whose unear'd womb +Disdains the tillage of thy husbandry? 
+Or who is he so fond will be the tomb +Of his self-love, to stop posterity? +Thou art thy mother's glass, and she in thee +Calls back the lovely April of her prime: +So thou through windows of thine age shall see +Despite of wrinkles this thy golden time. +But if thou live, remember'd not to be, +Die single, and thine image dies with thee. +Unthrifty loveliness, why dost thou spend +Upon thyself thy beauty's legacy? +Nature's bequest gives nothing but doth lend, +And being frank she lends to those are free. +Then, beauteous niggard, why dost thou abuse +The bounteous largess given thee to give? +Profitless usurer, why dost thou use +So great a sum of sums, yet canst not live? +For having traffic with thyself alone, +Thou of thyself thy sweet self dost deceive. +Then how, when nature calls thee to be gone, +What acceptable audit canst thou leave? +Thy unused beauty must be tomb'd with thee, +Which, used, lives th' executor to be. +Those hours, that with gentle work did frame +The lovely gaze where every eye doth dwell, +Will play the tyrants to the very same +And that unfair which fairly doth excel: +For never-resting time leads summer on +To hideous winter and confounds him there; +Sap cheque'd with frost and lusty leaves quite gone, +Beauty o'ersnow'd and bareness every where: +Then, were not summer's distillation left, +A liquid prisoner pent in walls of glass, +Beauty's effect with beauty were bereft, +Nor it nor no remembrance what it was: +But flowers distill'd though they with winter meet, +Leese but their show; their substance still lives sweet. +Then let not winter's ragged hand deface +In thee thy summer, ere thou be distill'd: +Make sweet some vial; treasure thou some place +With beauty's treasure, ere it be self-kill'd. 
+That use is not forbidden usury, +Which happies those that pay the willing loan; +That's for thyself to breed another thee, +Or ten times happier, be it ten for one; +Ten times thyself were happier than thou art, +If ten of thine ten times refigured thee: +Then what could death do, if thou shouldst depart, +Leaving thee living in posterity? +Be not self-will'd, for thou art much too fair +To be death's conquest and make worms thine heir. +Lo! in the orient when the gracious light +Lifts up his burning head, each under eye +Doth homage to his new-appearing sight, +Serving with looks his sacred majesty; +And having climb'd the steep-up heavenly hill, +Resembling strong youth in his middle age, +yet mortal looks adore his beauty still, +Attending on his golden pilgrimage; +But when from highmost pitch, with weary car, +Like feeble age, he reeleth from the day, +The eyes, 'fore duteous, now converted are +From his low tract and look another way: +So thou, thyself out-going in thy noon, +Unlook'd on diest, unless thou get a son. +Music to hear, why hear'st thou music sadly? +Sweets with sweets war not, joy delights in joy. +Why lovest thou that which thou receivest not gladly, +Or else receivest with pleasure thine annoy? +If the true concord of well-tuned sounds, +By unions married, do offend thine ear, +They do but sweetly chide thee, who confounds +In singleness the parts that thou shouldst bear. +Mark how one string, sweet husband to another, +Strikes each in each by mutual ordering, +Resembling sire and child and happy mother +Who all in one, one pleasing note do sing: +Whose speechless song, being many, seeming one, +Sings this to thee: 'thou single wilt prove none.' +Is it for fear to wet a widow's eye +That thou consumest thyself in single life? +Ah! if thou issueless shalt hap to die. 
+The world will wail thee, like a makeless wife; +The world will be thy widow and still weep +That thou no form of thee hast left behind, +When every private widow well may keep +By children's eyes her husband's shape in mind. +Look, what an unthrift in the world doth spend +Shifts but his place, for still the world enjoys it; +But beauty's waste hath in the world an end, +And kept unused, the user so destroys it. +No love toward others in that bosom sits +That on himself such murderous shame commits. +For shame! deny that thou bear'st love to any, +Who for thyself art so unprovident. +Grant, if thou wilt, thou art beloved of many, +But that thou none lovest is most evident; +For thou art so possess'd with murderous hate +That 'gainst thyself thou stick'st not to conspire. +Seeking that beauteous roof to ruinate +Which to repair should be thy chief desire. +O, change thy thought, that I may change my mind! +Shall hate be fairer lodged than gentle love? +Be, as thy presence is, gracious and kind, +Or to thyself at least kind-hearted prove: +Make thee another self, for love of me, +That beauty still may live in thine or thee. +As fast as thou shalt wane, so fast thou growest +In one of thine, from that which thou departest; +And that fresh blood which youngly thou bestowest +Thou mayst call thine when thou from youth convertest. +Herein lives wisdom, beauty and increase: +Without this, folly, age and cold decay: +If all were minded so, the times should cease +And threescore year would make the world away. +Let those whom Nature hath not made for store, +Harsh featureless and rude, barrenly perish: +Look, whom she best endow'd she gave the more; +Which bounteous gift thou shouldst in bounty cherish: +She carved thee for her seal, and meant thereby +Thou shouldst print more, not let that copy die. 
+When I do count the clock that tells the time, +And see the brave day sunk in hideous night; +When I behold the violet past prime, +And sable curls all silver'd o'er with white; +When lofty trees I see barren of leaves +Which erst from heat did canopy the herd, +And summer's green all girded up in sheaves +Borne on the bier with white and bristly beard, +Then of thy beauty do I question make, +That thou among the wastes of time must go, +Since sweets and beauties do themselves forsake +And die as fast as they see others grow; +And nothing 'gainst Time's scythe can make defence +Save breed, to brave him when he takes thee hence. +O, that you were yourself! but, love, you are +No longer yours than you yourself here live: +Against this coming end you should prepare, +And your sweet semblance to some other give. +So should that beauty which you hold in lease +Find no determination: then you were +Yourself again after yourself's decease, +When your sweet issue your sweet form should bear. +Who lets so fair a house fall to decay, +Which husbandry in honour might uphold +Against the stormy gusts of winter's day +And barren rage of death's eternal cold? +O, none but unthrifts! Dear my love, you know +You had a father: let your son say so. +Not from the stars do I my judgment pluck; +And yet methinks I have astronomy, +But not to tell of good or evil luck, +Of plagues, of dearths, or seasons' quality; +Nor can I fortune to brief minutes tell, +Pointing to each his thunder, rain and wind, +Or say with princes if it shall go well, +By oft predict that I in heaven find: +But from thine eyes my knowledge I derive, +And, constant stars, in them I read such art +As truth and beauty shall together thrive, +If from thyself to store thou wouldst convert; +Or else of thee this I prognosticate: +Thy end is truth's and beauty's doom and date. 
+When I consider every thing that grows +Holds in perfection but a little moment, +That this huge stage presenteth nought but shows +Whereon the stars in secret influence comment; +When I perceive that men as plants increase, +Cheered and cheque'd even by the self-same sky, +Vaunt in their youthful sap, at height decrease, +And wear their brave state out of memory; +Then the conceit of this inconstant stay +Sets you most rich in youth before my sight, +Where wasteful Time debateth with Decay, +To change your day of youth to sullied night; +And all in war with Time for love of you, +As he takes from you, I engraft you new. +But wherefore do not you a mightier way +Make war upon this bloody tyrant, Time? +And fortify yourself in your decay +With means more blessed than my barren rhyme? +Now stand you on the top of happy hours, +And many maiden gardens yet unset +With virtuous wish would bear your living flowers, +Much liker than your painted counterfeit: +So should the lines of life that life repair, +Which this, Time's pencil, or my pupil pen, +Neither in inward worth nor outward fair, +Can make you live yourself in eyes of men. +To give away yourself keeps yourself still, +And you must live, drawn by your own sweet skill. +Who will believe my verse in time to come, +If it were fill'd with your most high deserts? +Though yet, heaven knows, it is but as a tomb +Which hides your life and shows not half your parts. +If I could write the beauty of your eyes +And in fresh numbers number all your graces, +The age to come would say 'This poet lies: +Such heavenly touches ne'er touch'd earthly faces.' +So should my papers yellow'd with their age +Be scorn'd like old men of less truth than tongue, +And your true rights be term'd a poet's rage +And stretched metre of an antique song: +But were some child of yours alive that time, +You should live twice; in it and in my rhyme. +Shall I compare thee to a summer's day? 
+Thou art more lovely and more temperate: +Rough winds do shake the darling buds of May, +And summer's lease hath all too short a date: +Sometime too hot the eye of heaven shines, +And often is his gold complexion dimm'd; +And every fair from fair sometime declines, +By chance or nature's changing course untrimm'd; +But thy eternal summer shall not fade +Nor lose possession of that fair thou owest; +Nor shall Death brag thou wander'st in his shade, +When in eternal lines to time thou growest: +So long as men can breathe or eyes can see, +So long lives this and this gives life to thee. +Devouring Time, blunt thou the lion's paws, +And make the earth devour her own sweet brood; +Pluck the keen teeth from the fierce tiger's jaws, +And burn the long-lived phoenix in her blood; +Make glad and sorry seasons as thou fleets, +And do whate'er thou wilt, swift-footed Time, +To the wide world and all her fading sweets; +But I forbid thee one most heinous crime: +O, carve not with thy hours my love's fair brow, +Nor draw no lines there with thine antique pen; +Him in thy course untainted do allow +For beauty's pattern to succeeding men. +Yet, do thy worst, old Time: despite thy wrong, +My love shall in my verse ever live young. +A woman's face with Nature's own hand painted +Hast thou, the master-mistress of my passion; +A woman's gentle heart, but not acquainted +With shifting change, as is false women's fashion; +An eye more bright than theirs, less false in rolling, +Gilding the object whereupon it gazeth; +A man in hue, all 'hues' in his controlling, +Much steals men's eyes and women's souls amazeth. +And for a woman wert thou first created; +Till Nature, as she wrought thee, fell a-doting, +And by addition me of thee defeated, +By adding one thing to my purpose nothing. +But since she prick'd thee out for women's pleasure, +Mine be thy love and thy love's use their treasure. 
+So is it not with me as with that Muse +Stirr'd by a painted beauty to his verse, +Who heaven itself for ornament doth use +And every fair with his fair doth rehearse +Making a couplement of proud compare, +With sun and moon, with earth and sea's rich gems, +With April's first-born flowers, and all things rare +That heaven's air in this huge rondure hems. +O' let me, true in love, but truly write, +And then believe me, my love is as fair +As any mother's child, though not so bright +As those gold candles fix'd in heaven's air: +Let them say more than like of hearsay well; +I will not praise that purpose not to sell. +My glass shall not persuade me I am old, +So long as youth and thou are of one date; +But when in thee time's furrows I behold, +Then look I death my days should expiate. +For all that beauty that doth cover thee +Is but the seemly raiment of my heart, +Which in thy breast doth live, as thine in me: +How can I then be elder than thou art? +O, therefore, love, be of thyself so wary +As I, not for myself, but for thee will; +Bearing thy heart, which I will keep so chary +As tender nurse her babe from faring ill. +Presume not on thy heart when mine is slain; +Thou gavest me thine, not to give back again. +As an unperfect actor on the stage +Who with his fear is put besides his part, +Or some fierce thing replete with too much rage, +Whose strength's abundance weakens his own heart. +So I, for fear of trust, forget to say +The perfect ceremony of love's rite, +And in mine own love's strength seem to decay, +O'ercharged with burden of mine own love's might. +O, let my books be then the eloquence +And dumb presagers of my speaking breast, +Who plead for love and look for recompense +More than that tongue that more hath more express'd. +O, learn to read what silent love hath writ: +To hear with eyes belongs to love's fine wit. 
+Mine eye hath play'd the painter and hath stell'd +Thy beauty's form in table of my heart; +My body is the frame wherein 'tis held, +And perspective it is the painter's art. +For through the painter must you see his skill, +To find where your true image pictured lies; +Which in my bosom's shop is hanging still, +That hath his windows glazed with thine eyes. +Now see what good turns eyes for eyes have done: +Mine eyes have drawn thy shape, and thine for me +Are windows to my breast, where-through the sun +Delights to peep, to gaze therein on thee; +Yet eyes this cunning want to grace their art; +They draw but what they see, know not the heart. +Let those who are in favour with their stars +Of public honour and proud titles boast, +Whilst I, whom fortune of such triumph bars, +Unlook'd for joy in that I honour most. +Great princes' favourites their fair leaves spread +But as the marigold at the sun's eye, +And in themselves their pride lies buried, +For at a frown they in their glory die. +The painful warrior famoused for fight, +After a thousand victories once foil'd, +Is from the book of honour razed quite, +And all the rest forgot for which he toil'd: +Then happy I, that love and am beloved +Where I may not remove nor be removed. +Lord of my love, to whom in vassalage +Thy merit hath my duty strongly knit, +To thee I send this written embassage, +To witness duty, not to show my wit: +Duty so great, which wit so poor as mine +May make seem bare, in wanting words to show it, +But that I hope some good conceit of thine +In thy soul's thought, all naked, will bestow it; +Till whatsoever star that guides my moving +Points on me graciously with fair aspect +And puts apparel on my tatter'd loving, +To show me worthy of thy sweet respect: +Then may I dare to boast how I do love thee; +Till then not show my head where thou mayst prove me. 
+Weary with toil, I haste me to my bed, +The dear repose for limbs with travel tired; +But then begins a journey in my head, +To work my mind, when body's work's expired: +For then my thoughts, from far where I abide, +Intend a zealous pilgrimage to thee, +And keep my drooping eyelids open wide, +Looking on darkness which the blind do see +Save that my soul's imaginary sight +Presents thy shadow to my sightless view, +Which, like a jewel hung in ghastly night, +Makes black night beauteous and her old face new. +Lo! thus, by day my limbs, by night my mind, +For thee and for myself no quiet find. +How can I then return in happy plight, +That am debarr'd the benefit of rest? +When day's oppression is not eased by night, +But day by night, and night by day, oppress'd? +And each, though enemies to either's reign, +Do in consent shake hands to torture me; +The one by toil, the other to complain +How far I toil, still farther off from thee. +I tell the day, to please them thou art bright +And dost him grace when clouds do blot the heaven: +So flatter I the swart-complexion'd night, +When sparkling stars twire not thou gild'st the even. +But day doth daily draw my sorrows longer +And night doth nightly make grief's strength seem stronger. +When, in disgrace with fortune and men's eyes, +I all alone beweep my outcast state +And trouble deal heaven with my bootless cries +And look upon myself and curse my fate, +Wishing me like to one more rich in hope, +Featured like him, like him with friends possess'd, +Desiring this man's art and that man's scope, +With what I most enjoy contented least; +Yet in these thoughts myself almost despising, +Haply I think on thee, and then my state, +Like to the lark at break of day arising +From sullen earth, sings hymns at heaven's gate; +For thy sweet love remember'd such wealth brings +That then I scorn to change my state with kings. 
+When to the sessions of sweet silent thought +I summon up remembrance of things past, +I sigh the lack of many a thing I sought, +And with old woes new wail my dear time's waste: +Then can I drown an eye, unused to flow, +For precious friends hid in death's dateless night, +And weep afresh love's long since cancell'd woe, +And moan the expense of many a vanish'd sight: +Then can I grieve at grievances foregone, +And heavily from woe to woe tell o'er +The sad account of fore-bemoaned moan, +Which I new pay as if not paid before. +But if the while I think on thee, dear friend, +All losses are restored and sorrows end. +Thy bosom is endeared with all hearts, +Which I by lacking have supposed dead, +And there reigns love and all love's loving parts, +And all those friends which I thought buried. +How many a holy and obsequious tear +Hath dear religious love stol'n from mine eye +As interest of the dead, which now appear +But things removed that hidden in thee lie! +Thou art the grave where buried love doth live, +Hung with the trophies of my lovers gone, +Who all their parts of me to thee did give; +That due of many now is thine alone: +Their images I loved I view in thee, +And thou, all they, hast all the all of me. +If thou survive my well-contented day, +When that churl Death my bones with dust shall cover, +And shalt by fortune once more re-survey +These poor rude lines of thy deceased lover, +Compare them with the bettering of the time, +And though they be outstripp'd by every pen, +Reserve them for my love, not for their rhyme, +Exceeded by the height of happier men. +O, then vouchsafe me but this loving thought: +'Had my friend's Muse grown with this growing age, +A dearer birth than this his love had brought, +To march in ranks of better equipage: +But since he died and poets better prove, +Theirs for their style I'll read, his for his love.' 
+Full many a glorious morning have I seen +Flatter the mountain-tops with sovereign eye, +Kissing with golden face the meadows green, +Gilding pale streams with heavenly alchemy; +Anon permit the basest clouds to ride +With ugly rack on his celestial face, +And from the forlorn world his visage hide, +Stealing unseen to west with this disgrace: +Even so my sun one early morn did shine +With all triumphant splendor on my brow; +But out, alack! he was but one hour mine; +The region cloud hath mask'd him from me now. +Yet him for this my love no whit disdaineth; +Suns of the world may stain when heaven's sun staineth. +Why didst thou promise such a beauteous day, +And make me travel forth without my cloak, +To let base clouds o'ertake me in my way, +Hiding thy bravery in their rotten smoke? +'Tis not enough that through the cloud thou break, +To dry the rain on my storm-beaten face, +For no man well of such a salve can speak +That heals the wound and cures not the disgrace: +Nor can thy shame give physic to my grief; +Though thou repent, yet I have still the loss: +The offender's sorrow lends but weak relief +To him that bears the strong offence's cross. +Ah! but those tears are pearl which thy love sheds, +And they are rich and ransom all ill deeds. +No more be grieved at that which thou hast done: +Roses have thorns, and silver fountains mud; +Clouds and eclipses stain both moon and sun, +And loathsome canker lives in sweetest bud. +All men make faults, and even I in this, +Authorizing thy trespass with compare, +Myself corrupting, salving thy amiss, +Excusing thy sins more than thy sins are; +For to thy sensual fault I bring in sense-- +Thy adverse party is thy advocate-- +And 'gainst myself a lawful plea commence: +Such civil war is in my love and hate +That I an accessary needs must be +To that sweet thief which sourly robs from me. 
+Let me confess that we two must be twain, +Although our undivided loves are one: +So shall those blots that do with me remain +Without thy help by me be borne alone. +In our two loves there is but one respect, +Though in our lives a separable spite, +Which though it alter not love's sole effect, +Yet doth it steal sweet hours from love's delight. +I may not evermore acknowledge thee, +Lest my bewailed guilt should do thee shame, +Nor thou with public kindness honour me, +Unless thou take that honour from thy name: +But do not so; I love thee in such sort +As, thou being mine, mine is thy good report. +As a decrepit father takes delight +To see his active child do deeds of youth, +So I, made lame by fortune's dearest spite, +Take all my comfort of thy worth and truth. +For whether beauty, birth, or wealth, or wit, +Or any of these all, or all, or more, +Entitled in thy parts do crowned sit, +I make my love engrafted to this store: +So then I am not lame, poor, nor despised, +Whilst that this shadow doth such substance give +That I in thy abundance am sufficed +And by a part of all thy glory live. +Look, what is best, that best I wish in thee: +This wish I have; then ten times happy me! 
\ No newline at end of file diff --git a/vllm-v0.6.2/cmake/cpu_extension.cmake b/vllm-v0.6.2/cmake/cpu_extension.cmake new file mode 100644 index 0000000..5912c5c --- /dev/null +++ b/vllm-v0.6.2/cmake/cpu_extension.cmake @@ -0,0 +1,156 @@ +include(FetchContent) + +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# +# Define environment variables for special configurations +# +if(DEFINED ENV{VLLM_CPU_AVX512BF16}) + set(ENABLE_AVX512BF16 ON) +endif() + +include_directories("${CMAKE_SOURCE_DIR}/csrc") + +# +# Check the compile flags +# +list(APPEND CXX_COMPILE_FLAGS + "-fopenmp" + "-mf16c" + "-DVLLM_CPU_EXTENSION") + +execute_process(COMMAND cat /proc/cpuinfo + RESULT_VARIABLE CPUINFO_RET + OUTPUT_VARIABLE CPUINFO) + +if (NOT CPUINFO_RET EQUAL 0) + message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo") +endif() + +function (find_isa CPUINFO TARGET OUT) + string(FIND ${CPUINFO} ${TARGET} ISA_FOUND) + if(NOT ISA_FOUND EQUAL -1) + set(${OUT} ON PARENT_SCOPE) + else() + set(${OUT} OFF PARENT_SCOPE) + endif() +endfunction() + +function (is_avx512_disabled OUT) + set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512}) + if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true") + set(${OUT} ON PARENT_SCOPE) + else() + set(${OUT} OFF PARENT_SCOPE) + endif() +endfunction() + +is_avx512_disabled(AVX512_DISABLED) + +find_isa(${CPUINFO} "avx2" AVX2_FOUND) +find_isa(${CPUINFO} "avx512f" AVX512_FOUND) +find_isa(${CPUINFO} "POWER10" POWER10_FOUND) +find_isa(${CPUINFO} "POWER9" POWER9_FOUND) + +if (AVX512_FOUND AND NOT AVX512_DISABLED) + list(APPEND CXX_COMPILE_FLAGS + "-mavx512f" + "-mavx512vl" + "-mavx512bw" + "-mavx512dq") + + find_isa(${CPUINFO} "avx512_bf16" AVX512BF16_FOUND) + if (AVX512BF16_FOUND OR ENABLE_AVX512BF16) + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND + CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3) + list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16") + else() + message(WARNING "Disable AVX512-BF16 
ISA support, requires gcc/g++ >= 12.3") + endif() + else() + message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.") + endif() +elseif (AVX2_FOUND) + list(APPEND CXX_COMPILE_FLAGS "-mavx2") + message(WARNING "vLLM CPU backend using AVX2 ISA") +elseif (POWER9_FOUND OR POWER10_FOUND) + message(STATUS "PowerPC detected") + # Check for PowerPC VSX support + list(APPEND CXX_COMPILE_FLAGS + "-mvsx" + "-mcpu=native" + "-mtune=native") +else() + message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.") +endif() + +# +# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 platforms) +# +if (AVX512_FOUND AND NOT AVX512_DISABLED) + FetchContent_Declare( + oneDNN + GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git + GIT_TAG v3.6 + GIT_PROGRESS TRUE + GIT_SHALLOW TRUE + ) + + set(ONEDNN_LIBRARY_TYPE "STATIC") + set(ONEDNN_BUILD_DOC "OFF") + set(ONEDNN_BUILD_EXAMPLES "OFF") + set(ONEDNN_BUILD_TESTS "OFF") + set(ONEDNN_ENABLE_WORKLOAD "INFERENCE") + set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER") + set(ONEDNN_BUILD_GRAPH "OFF") + set(ONEDNN_ENABLE_JIT_PROFILING "OFF") + set(ONEDNN_ENABLE_ITT_TASKS "OFF") + set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF") + set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF") + set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) + + FetchContent_MakeAvailable(oneDNN) + + list(APPEND LIBS dnnl) +endif() + +message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") + +list(APPEND LIBS numa) + +# +# _C extension +# +set(VLLM_EXT_SRC + "csrc/cpu/activation.cpp" + "csrc/cpu/attention.cpp" + "csrc/cpu/cache.cpp" + "csrc/cpu/utils.cpp" + "csrc/cpu/layernorm.cpp" + "csrc/cpu/pos_encoding.cpp" + "csrc/cpu/torch_bindings.cpp") + +if (AVX512_FOUND AND NOT AVX512_DISABLED) + set(VLLM_EXT_SRC + "csrc/cpu/quant.cpp" + ${VLLM_EXT_SRC}) +endif() + +# +# Define extension targets +# + +define_gpu_extension_target( + _C + DESTINATION vllm + LANGUAGE 
CXX + SOURCES ${VLLM_EXT_SRC} + LIBRARIES ${LIBS} + COMPILE_FLAGS ${CXX_COMPILE_FLAGS} + USE_SABI 3 + WITH_SOABI +) + +message(STATUS "Enabling C extension.") diff --git a/vllm-v0.6.2/cmake/hipify.py b/vllm-v0.6.2/cmake/hipify.py new file mode 100755 index 0000000..340e41c --- /dev/null +++ b/vllm-v0.6.2/cmake/hipify.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 + +# +# A command line tool for running pytorch's hipify preprocessor on CUDA +# source files. +# +# See https://github.com/ROCm/hipify_torch +# and /utils/hipify/hipify_python.py +# + +import argparse +import os +import shutil + +from torch.utils.hipify.hipify_python import hipify + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + # Project directory where all the source + include files live. + parser.add_argument( + "-p", + "--project_dir", + help="The project directory.", + ) + + # Directory where hipified files are written. + parser.add_argument( + "-o", + "--output_dir", + help="The output directory.", + ) + + # Source files to convert. + parser.add_argument("sources", + help="Source files to hipify.", + nargs="*", + default=[]) + + args = parser.parse_args() + + # Limit include scope to project_dir only + includes = [os.path.join(args.project_dir, '*')] + + # Get absolute path for all source files. + extra_files = [os.path.abspath(s) for s in args.sources] + + # Copy sources from project directory to output directory. + # The directory might already exist to hold object files so we ignore that. 
+ shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True) + + hipify_result = hipify(project_directory=args.project_dir, + output_directory=args.output_dir, + header_include_dirs=[], + includes=includes, + extra_files=extra_files, + show_detailed=True, + is_pytorch_extension=True, + hipify_extra_files_only=True) + + hipified_sources = [] + for source in args.sources: + s_abs = os.path.abspath(source) + hipified_s_abs = (hipify_result[s_abs].hipified_path if + (s_abs in hipify_result + and hipify_result[s_abs].hipified_path is not None) + else s_abs) + hipified_sources.append(hipified_s_abs) + + assert (len(hipified_sources) == len(args.sources)) + + # Print hipified source files. + print("\n".join(hipified_sources)) diff --git a/vllm-v0.6.2/cmake/utils.cmake b/vllm-v0.6.2/cmake/utils.cmake new file mode 100644 index 0000000..40430da --- /dev/null +++ b/vllm-v0.6.2/cmake/utils.cmake @@ -0,0 +1,433 @@ +# +# Attempt to find the python package that uses the same python executable as +# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`. +# +macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS) + file(REAL_PATH ${EXECUTABLE} EXECUTABLE) + set(Python_EXECUTABLE ${EXECUTABLE}) + find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule) + if (NOT Python_FOUND) + message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.") + endif() + set(_VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") + set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN}) + if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST) + message(FATAL_ERROR + "Python version (${_VER}) is not one of the supported versions: " + "${_SUPPORTED_VERSIONS_LIST}.") + endif() + message(STATUS "Found python matching: ${EXECUTABLE}.") +endmacro() + +# +# Run `EXPR` in python. The standard output of python is stored in `OUT` and +# has trailing whitespace stripped. If an error is encountered when running +# python, a fatal message `ERR_MSG` is issued. 
+# +function (run_python OUT EXPR ERR_MSG) + execute_process( + COMMAND + "${Python_EXECUTABLE}" "-c" "${EXPR}" + OUTPUT_VARIABLE PYTHON_OUT + RESULT_VARIABLE PYTHON_ERROR_CODE + ERROR_VARIABLE PYTHON_STDERR + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(NOT PYTHON_ERROR_CODE EQUAL 0) + message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}") + endif() + set(${OUT} ${PYTHON_OUT} PARENT_SCOPE) +endfunction() + +# Run `EXPR` in python after importing `PKG`. Use the result of this to extend +# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. +macro (append_cmake_prefix_path PKG EXPR) + run_python(_PREFIX_PATH + "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") + list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH}) +endmacro() + +# +# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set +# of CUDA source files. The names of the corresponding "hipified" sources are +# stored in `OUT_SRCS`. +# +function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) + # + # Split into C++ and non-C++ (i.e. CUDA) sources. + # + set(SRCS ${ORIG_SRCS}) + set(CXX_SRCS ${ORIG_SRCS}) + list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$") + list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$") + + # + # Generate ROCm/HIP source file names from CUDA file names. + # Since HIP files are generated code, they will appear in the build area + # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir. 
+ # + set(HIP_SRCS) + foreach (SRC ${SRCS}) + string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC}) + string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) + list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") + endforeach() + + set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) + add_custom_target( + hipify${NAME} + COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} + DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS} + BYPRODUCTS ${HIP_SRCS} + COMMENT "Running hipify on ${NAME} extension source files.") + + # Swap out original extension sources with hipified sources. + list(APPEND HIP_SRCS ${CXX_SRCS}) + set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE) +endfunction() + +# +# Get additional GPU compiler flags from torch. +# +function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG) + if (${GPU_LANG} STREQUAL "CUDA") + # + # Get common NVCC flags from torch. + # + run_python(GPU_FLAGS + "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))" + "Failed to determine torch nvcc compiler flags") + + if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) + list(APPEND GPU_FLAGS "-DENABLE_FP8") + endif() + if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0) + list(REMOVE_ITEM GPU_FLAGS + "-D__CUDA_NO_HALF_OPERATORS__" + "-D__CUDA_NO_HALF_CONVERSIONS__" + "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" + "-D__CUDA_NO_HALF2_OPERATORS__") + endif() + + elseif(${GPU_LANG} STREQUAL "HIP") + # + # Get common HIP/HIPCC flags from torch. + # + run_python(GPU_FLAGS + "import torch.utils.cpp_extension as t; print(';'.join(t.COMMON_HIP_FLAGS + t.COMMON_HIPCC_FLAGS))" + "Failed to determine torch nvcc compiler flags") + + list(APPEND GPU_FLAGS + "-DUSE_ROCM" + "-DENABLE_FP8" + "-U__HIP_NO_HALF_CONVERSIONS__" + "-U__HIP_NO_HALF_OPERATORS__" + "-fno-gpu-rdc") + + endif() + set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE) +endfunction() + +# Macro for converting a `gencode` version number to a cmake version number. 
+macro(string_to_ver OUT_VER IN_STR) + string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR}) +endmacro() + +# +# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS` and store them in +# `CUDA_ARCH_FLAGS`. +# +# Example: +# CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75" +# clear_cuda_arches(CUDA_ARCH_FLAGS) +# CUDA_ARCH_FLAGS="-gencode arch=compute_70,code=sm_70;-gencode arch=compute_75,code=sm_75" +# CMAKE_CUDA_FLAGS="-Wall" +# +macro(clear_cuda_arches CUDA_ARCH_FLAGS) + # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS` + string(REGEX MATCHALL "-gencode arch=[^ ]+" CUDA_ARCH_FLAGS + ${CMAKE_CUDA_FLAGS}) + + # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified + # and passed back via the `CUDA_ARCHITECTURES` property. + string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS + ${CMAKE_CUDA_FLAGS}) +endmacro() + +# +# Extract unique CUDA architectures from a list of compute capabilities codes in +# the form `[]`, convert them to the form sort +# `.`, dedupes them and then sorts them in ascending order and +# stores them in `OUT_ARCHES`. 
+# +# Example: +# CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a" +# extract_unique_cuda_archs_ascending(OUT_ARCHES CUDA_ARCH_FLAGS) +# OUT_ARCHES="7.5;...;9.0" +function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS) + set(_CUDA_ARCHES) + foreach(_ARCH ${CUDA_ARCH_FLAGS}) + string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH}) + if (_COMPUTE) + set(_COMPUTE ${CMAKE_MATCH_1}) + endif() + + string_to_ver(_COMPUTE_VER ${_COMPUTE}) + list(APPEND _CUDA_ARCHES ${_COMPUTE_VER}) + endforeach() + + list(REMOVE_DUPLICATES _CUDA_ARCHES) + list(SORT _CUDA_ARCHES COMPARE NATURAL ORDER ASCENDING) + set(${OUT_ARCHES} ${_CUDA_ARCHES} PARENT_SCOPE) +endfunction() + +# +# For a specific file set the `-gencode` flag in compile options conditionally +# for the CUDA language. +# +# Example: +# set_gencode_flag_for_srcs( +# SRCS "foo.cu" +# ARCH "compute_75" +# CODE "sm_75") +# adds: "-gencode arch=compute_75,code=sm_75" to the compile options for +# `foo.cu` (only for the CUDA language). +# +macro(set_gencode_flag_for_srcs) + set(options) + set(oneValueArgs ARCH CODE) + set(multiValueArgs SRCS) + cmake_parse_arguments(arg "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + set(_FLAG -gencode arch=${arg_ARCH},code=${arg_CODE}) + set_property( + SOURCE ${arg_SRCS} + APPEND PROPERTY + COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:${_FLAG}>" + ) + + message(DEBUG "Setting gencode flag for ${arg_SRCS}: ${_FLAG}") +endmacro(set_gencode_flag_for_srcs) + +# +# For a list of source files set the `-gencode` flags in the files specific +# compile options (specifically for the CUDA language). +# +# arguments are: +# SRCS: list of source files +# CUDA_ARCHS: list of CUDA architectures in the form `<major>.<minor>[letter]` +# BUILD_PTX_FOR_ARCH: if set to true, then the PTX code will be built +# for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS +# that is larger than BUILD_PTX_FOR_ARCH. 
+# +macro(set_gencode_flags_for_srcs) + set(options) + set(oneValueArgs BUILD_PTX_FOR_ARCH) + set(multiValueArgs SRCS CUDA_ARCHS) + cmake_parse_arguments(arg "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + foreach(_ARCH ${arg_CUDA_ARCHS}) + string(REPLACE "." "" _ARCH "${_ARCH}") + set_gencode_flag_for_srcs( + SRCS ${arg_SRCS} + ARCH "compute_${_ARCH}" + CODE "sm_${_ARCH}") + endforeach() + + if (${arg_BUILD_PTX_FOR_ARCH}) + list(SORT arg_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) + list(GET arg_CUDA_ARCHS -1 _HIGHEST_ARCH) + if (_HIGHEST_ARCH VERSION_GREATER_EQUAL ${arg_BUILD_PTX_FOR_ARCH}) + string(REPLACE "." "" _PTX_ARCH "${arg_BUILD_PTX_FOR_ARCH}") + set_gencode_flag_for_srcs( + SRCS ${arg_SRCS} + ARCH "compute_${_PTX_ARCH}" + CODE "compute_${_PTX_ARCH}") + endif() + endif() +endmacro() + +# +# For the given `SRC_CUDA_ARCHS` list of gencode versions in the form +# `.[letter]` compute the "loose intersection" with the +# `TGT_CUDA_ARCHS` list of gencodes. +# The loose intersection is defined as: +# { max{ x \in tgt | x <= y } | y \in src, { x \in tgt | x <= y } != {} } +# where `<=` is the version comparison operator. +# In other words, for each version in `TGT_CUDA_ARCHS` find the highest version +# in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`. +# We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is +# in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add +# 9.0a to the result. +# The result is stored in `OUT_CUDA_ARCHS`. 
+# +# Example: +# SRC_CUDA_ARCHS="7.5;8.0;8.6;9.0;9.0a" +# TGT_CUDA_ARCHS="8.0;8.9;9.0" +# cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS) +# OUT_CUDA_ARCHS="8.0;8.6;9.0;9.0a" +# +function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS) + list(REMOVE_DUPLICATES SRC_CUDA_ARCHS) + + # if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in TGT_CUDA_ARCHS then we should + # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS + set(_CUDA_ARCHS) + if ("9.0a" IN_LIST SRC_CUDA_ARCHS) + list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a") + if ("9.0" IN_LIST TGT_CUDA_ARCHS) + set(_CUDA_ARCHS "9.0a") + endif() + endif() + + list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) + + # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that is + # less or equal to ARCH + foreach(_ARCH ${TGT_CUDA_ARCHS}) + set(_TMP_ARCH) + foreach(_SRC_ARCH ${SRC_CUDA_ARCHS}) + if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH) + set(_TMP_ARCH ${_SRC_ARCH}) + else() + break() + endif() + endforeach() + if (_TMP_ARCH) + list(APPEND _CUDA_ARCHS ${_TMP_ARCH}) + endif() + endforeach() + + list(REMOVE_DUPLICATES _CUDA_ARCHS) + set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE) +endfunction() + +# +# Override the GPU architectures detected by cmake/torch and filter them by +# `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in +# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set +# the architectures on a per file basis. +# +# Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`. +# +macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) + set(_GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN}) + message(STATUS "${GPU_LANG} supported arches: ${_GPU_SUPPORTED_ARCHES_LIST}") + + if (${GPU_LANG} STREQUAL "HIP") + # + # `GPU_ARCHES` controls the `--offload-arch` flags. 
+ # + # If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list, + # if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling + # "rocm_agent_enumerator" in "enable_language(HIP)" + # (in file Modules/CMakeDetermineHIPCompiler.cmake) + # + if(DEFINED ENV{PYTORCH_ROCM_ARCH}) + set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH}) + else() + set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES}) + endif() + # + # Find the intersection of the supported + detected architectures to + # set the module architecture flags. + # + set(${GPU_ARCHES}) + foreach (_ARCH ${HIP_ARCHITECTURES}) + if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST) + list(APPEND ${GPU_ARCHES} ${_ARCH}) + endif() + endforeach() + + if(NOT ${GPU_ARCHES}) + message(FATAL_ERROR + "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is" + " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.") + endif() + endif() +endmacro() + +# +# Define a target named `GPU_MOD_NAME` for a single extension. The +# arguments are: +# +# DESTINATION - Module destination directory. +# LANGUAGE - The GPU language for this module, e.g CUDA, HIP, +# etc. +# SOURCES - List of source files relative to CMakeLists.txt +# directory. +# +# Optional arguments: +# +# ARCHITECTURES - A list of target GPU architectures in cmake +# format. +# Refer `CMAKE_CUDA_ARCHITECTURES` documentation +# and `CMAKE_HIP_ARCHITECTURES` for more info. +# ARCHITECTURES will use cmake's defaults if +# not provided. +# COMPILE_FLAGS - Extra compiler flags passed to NVCC/hip. +# INCLUDE_DIRECTORIES - Extra include directories. +# LIBRARIES - Extra link libraries. +# WITH_SOABI - Generate library with python SOABI suffix name. +# USE_SABI - Use python stable api +# +# Note: optimization level/debug info is set via cmake build type. 
+# +function (define_gpu_extension_target GPU_MOD_NAME) + cmake_parse_arguments(PARSE_ARGV 1 + GPU + "WITH_SOABI" + "DESTINATION;LANGUAGE;USE_SABI" + "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES") + + # Add hipify preprocessing step when building with HIP/ROCm. + if (GPU_LANGUAGE STREQUAL "HIP") + hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}") + endif() + + if (GPU_WITH_SOABI) + set(GPU_WITH_SOABI WITH_SOABI) + else() + set(GPU_WITH_SOABI) + endif() + + if (GPU_USE_SABI) + Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}") + else() + Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}") + endif() + + if (GPU_LANGUAGE STREQUAL "HIP") + # Make this target dependent on the hipify preprocessor step. + add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME}) + endif() + + if (GPU_ARCHITECTURES) + set_target_properties(${GPU_MOD_NAME} PROPERTIES + ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}") + endif() + + set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17) + + target_compile_options(${GPU_MOD_NAME} PRIVATE + $<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>) + + target_compile_definitions(${GPU_MOD_NAME} PRIVATE + "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}") + + target_include_directories(${GPU_MOD_NAME} PRIVATE csrc + ${GPU_INCLUDE_DIRECTORIES}) + + target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES}) + + # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of + # dependencies that are not necessary and may not be installed. 
+ if (GPU_LANGUAGE STREQUAL "CUDA") + target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver) + else() + target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + endif() + + install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME}) +endfunction() diff --git a/vllm-v0.6.2/collect_env.py b/vllm-v0.6.2/collect_env.py new file mode 100644 index 0000000..254c19b --- /dev/null +++ b/vllm-v0.6.2/collect_env.py @@ -0,0 +1,765 @@ +# ruff: noqa +# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py + +import datetime +import locale +import os +import re +import subprocess +import sys +# Unlike the rest of the PyTorch this file must be python2 compliant. +# This script outputs relevant system environment info +# Run it with `python collect_env.py` or `python -m torch.utils.collect_env` +from collections import namedtuple + +from vllm.envs import environment_variables + +try: + import torch + TORCH_AVAILABLE = True +except (ImportError, NameError, AttributeError, OSError): + TORCH_AVAILABLE = False + +# System Environment Information +SystemEnv = namedtuple( + 'SystemEnv', + [ + 'torch_version', + 'is_debug_build', + 'cuda_compiled_version', + 'gcc_version', + 'clang_version', + 'cmake_version', + 'os', + 'libc_version', + 'python_version', + 'python_platform', + 'is_cuda_available', + 'cuda_runtime_version', + 'cuda_module_loading', + 'nvidia_driver_version', + 'nvidia_gpu_models', + 'cudnn_version', + 'pip_version', # 'pip' or 'pip3' + 'pip_packages', + 'conda_packages', + 'hip_compiled_version', + 'hip_runtime_version', + 'miopen_runtime_version', + 'caching_allocator_config', + 'is_xnnpack_available', + 'cpu_info', + 'rocm_version', # vllm specific field + 'neuron_sdk_version', # vllm specific field + 'vllm_version', # vllm specific field + 'vllm_build_flags', # vllm specific field + 'gpu_topo', # vllm specific field + 'env_vars', + ]) + +DEFAULT_CONDA_PATTERNS = 
{ + "torch", + "numpy", + "cudatoolkit", + "soumith", + "mkl", + "magma", + "triton", + "optree", + "nccl", + "transformers", + "zmq", + "nvidia", + "pynvml", +} + +DEFAULT_PIP_PATTERNS = { + "torch", + "numpy", + "mypy", + "flake8", + "triton", + "optree", + "onnx", + "nccl", + "transformers", + "zmq", + "nvidia", + "pynvml", +} + + +def run(command): + """Return (return-code, stdout, stderr).""" + shell = True if type(command) is str else False + p = subprocess.Popen(command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=shell) + raw_output, raw_err = p.communicate() + rc = p.returncode + if get_platform() == 'win32': + enc = 'oem' + else: + enc = locale.getpreferredencoding() + output = raw_output.decode(enc) + err = raw_err.decode(enc) + return rc, output.strip(), err.strip() + + +def run_and_read_all(run_lambda, command): + """Run command using run_lambda; reads and returns entire output if rc is 0.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + return out + + +def run_and_parse_first_match(run_lambda, command, regex): + """Run command using run_lambda, returns the first regex match if it exists.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + match = re.search(regex, out) + if match is None: + return None + return match.group(1) + + +def run_and_return_first_line(run_lambda, command): + """Run command using run_lambda and returns first line if output is not empty.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + return out.split('\n')[0] + + +def get_conda_packages(run_lambda, patterns=None): + if patterns is None: + patterns = DEFAULT_CONDA_PATTERNS + conda = os.environ.get('CONDA_EXE', 'conda') + out = run_and_read_all(run_lambda, "{} list".format(conda)) + if out is None: + return out + + return "\n".join(line for line in out.splitlines() + if not line.startswith("#") and any(name in line + for name in patterns)) + + +def get_gcc_version(run_lambda): + return 
run_and_parse_first_match(run_lambda, 'gcc --version', r'gcc (.*)') + + +def get_clang_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'clang --version', + r'clang version (.*)') + + +def get_cmake_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'cmake --version', + r'cmake (.*)') + + +def get_nvidia_driver_version(run_lambda): + if get_platform() == 'darwin': + cmd = 'kextstat | grep -i cuda' + return run_and_parse_first_match(run_lambda, cmd, + r'com[.]nvidia[.]CUDA [(](.*?)[)]') + smi = get_nvidia_smi() + return run_and_parse_first_match(run_lambda, smi, + r'Driver Version: (.*?) ') + + +def get_gpu_info(run_lambda): + if get_platform() == 'darwin' or (TORCH_AVAILABLE and hasattr( + torch.version, 'hip') and torch.version.hip is not None): + if TORCH_AVAILABLE and torch.cuda.is_available(): + if torch.version.hip is not None: + prop = torch.cuda.get_device_properties(0) + if hasattr(prop, "gcnArchName"): + gcnArch = " ({})".format(prop.gcnArchName) + else: + gcnArch = "NoGCNArchNameOnOldPyTorch" + else: + gcnArch = "" + return torch.cuda.get_device_name(None) + gcnArch + return None + smi = get_nvidia_smi() + uuid_regex = re.compile(r' \(UUID: .+?\)') + rc, out, _ = run_lambda(smi + ' -L') + if rc != 0: + return None + # Anonymize GPUs by removing their UUID + return re.sub(uuid_regex, '', out) + + +def get_running_cuda_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'nvcc --version', + r'release .+ V(.*)') + + +def get_cudnn_version(run_lambda): + """Return a list of libcudnn.so; it's hard to tell which one is being used.""" + if get_platform() == 'win32': + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') + cuda_path = os.environ.get('CUDA_PATH', "%CUDA_PATH%") + where_cmd = os.path.join(system_root, 'System32', 'where') + cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path) + elif get_platform() == 'darwin': + # CUDA libraries and drivers can be found in /usr/local/cuda/. 
See + # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install + # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac + # Use CUDNN_LIBRARY when cudnn library is installed elsewhere. + cudnn_cmd = 'ls /usr/local/cuda/lib/libcudnn*' + else: + cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev' + rc, out, _ = run_lambda(cudnn_cmd) + # find will return 1 if there are permission errors or if not found + if len(out) == 0 or (rc != 1 and rc != 0): + l = os.environ.get('CUDNN_LIBRARY') + if l is not None and os.path.isfile(l): + return os.path.realpath(l) + return None + files_set = set() + for fn in out.split('\n'): + fn = os.path.realpath(fn) # eliminate symbolic links + if os.path.isfile(fn): + files_set.add(fn) + if not files_set: + return None + # Alphabetize the result because the order is non-deterministic otherwise + files = sorted(files_set) + if len(files) == 1: + return files[0] + result = '\n'.join(files) + return 'Probably one of the following:\n{}'.format(result) + + +def get_nvidia_smi(): + # Note: nvidia-smi is currently available only on Windows and Linux + smi = 'nvidia-smi' + if get_platform() == 'win32': + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') + program_files_root = os.environ.get('PROGRAMFILES', + 'C:\\Program Files') + legacy_path = os.path.join(program_files_root, 'NVIDIA Corporation', + 'NVSMI', smi) + new_path = os.path.join(system_root, 'System32', smi) + smis = [new_path, legacy_path] + for candidate_smi in smis: + if os.path.exists(candidate_smi): + smi = '"{}"'.format(candidate_smi) + break + return smi + + +def get_rocm_version(run_lambda): + """Returns the ROCm version if available, otherwise 'N/A'.""" + return run_and_parse_first_match(run_lambda, 'hipcc --version', + r'HIP version: (\S+)') + + +def get_neuron_sdk_version(run_lambda): + # Adapted from your install script + try: + result = run_lambda(["neuron-ls"]) + return result if result[0] == 0 else 
'N/A' + except Exception: + return 'N/A' + + +def get_vllm_version(): + from vllm import __version__, __version_tuple__ + + if __version__ == "dev": + return "N/A (dev)" + + if len(__version_tuple__) == 4: # dev build + git_sha = __version_tuple__[-1][1:] # type: ignore + return f"{__version__} (git sha: {git_sha})" + + return __version__ + +def summarize_vllm_build_flags(): + # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc. + return 'CUDA Archs: {}; ROCm: {}; Neuron: {}'.format( + os.environ.get('TORCH_CUDA_ARCH_LIST', 'Not Set'), + 'Enabled' if os.environ.get('ROCM_HOME') else 'Disabled', + 'Enabled' if os.environ.get('NEURON_CORES') else 'Disabled', + ) + + +def get_gpu_topo(run_lambda): + output = None + + if get_platform() == 'linux': + output = run_and_read_all(run_lambda, 'nvidia-smi topo -m') + if output is None: + output = run_and_read_all(run_lambda, 'rocm-smi --showtopo') + + return output + + +# example outputs of CPU infos +# * linux +# Architecture: x86_64 +# CPU op-mode(s): 32-bit, 64-bit +# Address sizes: 46 bits physical, 48 bits virtual +# Byte Order: Little Endian +# CPU(s): 128 +# On-line CPU(s) list: 0-127 +# Vendor ID: GenuineIntel +# Model name: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz +# CPU family: 6 +# Model: 106 +# Thread(s) per core: 2 +# Core(s) per socket: 32 +# Socket(s): 2 +# Stepping: 6 +# BogoMIPS: 5799.78 +# Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr +# sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl +# xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq monitor ssse3 fma cx16 +# pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand +# hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced +# fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap +# avx512ifma 
clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 +# xsaves wbnoinvd ida arat avx512vbmi pku ospke avx512_vbmi2 gfni vaes vpclmulqdq +# avx512_vnni avx512_bitalg tme avx512_vpopcntdq rdpid md_clear flush_l1d arch_capabilities +# Virtualization features: +# Hypervisor vendor: KVM +# Virtualization type: full +# Caches (sum of all): +# L1d: 3 MiB (64 instances) +# L1i: 2 MiB (64 instances) +# L2: 80 MiB (64 instances) +# L3: 108 MiB (2 instances) +# NUMA: +# NUMA node(s): 2 +# NUMA node0 CPU(s): 0-31,64-95 +# NUMA node1 CPU(s): 32-63,96-127 +# Vulnerabilities: +# Itlb multihit: Not affected +# L1tf: Not affected +# Mds: Not affected +# Meltdown: Not affected +# Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown +# Retbleed: Not affected +# Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp +# Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization +# Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence +# Srbds: Not affected +# Tsx async abort: Not affected +# * win32 +# Architecture=9 +# CurrentClockSpeed=2900 +# DeviceID=CPU0 +# Family=179 +# L2CacheSize=40960 +# L2CacheSpeed= +# Manufacturer=GenuineIntel +# MaxClockSpeed=2900 +# Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz +# ProcessorType=3 +# Revision=27142 +# +# Architecture=9 +# CurrentClockSpeed=2900 +# DeviceID=CPU1 +# Family=179 +# L2CacheSize=40960 +# L2CacheSpeed= +# Manufacturer=GenuineIntel +# MaxClockSpeed=2900 +# Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz +# ProcessorType=3 +# Revision=27142 + + +def get_cpu_info(run_lambda): + rc, out, err = 0, '', '' + if get_platform() == 'linux': + rc, out, err = run_lambda('lscpu') + elif get_platform() == 'win32': + rc, out, err = run_lambda( + 'wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \ + 
CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE' + ) + elif get_platform() == 'darwin': + rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string") + cpu_info = 'None' + if rc == 0: + cpu_info = out + else: + cpu_info = err + return cpu_info + + +def get_platform(): + if sys.platform.startswith('linux'): + return 'linux' + elif sys.platform.startswith('win32'): + return 'win32' + elif sys.platform.startswith('cygwin'): + return 'cygwin' + elif sys.platform.startswith('darwin'): + return 'darwin' + else: + return sys.platform + + +def get_mac_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'sw_vers -productVersion', + r'(.*)') + + +def get_windows_version(run_lambda): + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') + wmic_cmd = os.path.join(system_root, 'System32', 'Wbem', 'wmic') + findstr_cmd = os.path.join(system_root, 'System32', 'findstr') + return run_and_read_all( + run_lambda, + '{} os get Caption | {} /v Caption'.format(wmic_cmd, findstr_cmd)) + + +def get_lsb_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'lsb_release -a', + r'Description:\t(.*)') + + +def check_release_file(run_lambda): + return run_and_parse_first_match(run_lambda, 'cat /etc/*-release', + r'PRETTY_NAME="(.*)"') + + +def get_os(run_lambda): + from platform import machine + platform = get_platform() + + if platform == 'win32' or platform == 'cygwin': + return get_windows_version(run_lambda) + + if platform == 'darwin': + version = get_mac_version(run_lambda) + if version is None: + return None + return 'macOS {} ({})'.format(version, machine()) + + if platform == 'linux': + # Ubuntu/Debian based + desc = get_lsb_version(run_lambda) + if desc is not None: + return '{} ({})'.format(desc, machine()) + + # Try reading /etc/*-release + desc = check_release_file(run_lambda) + if desc is not None: + return '{} ({})'.format(desc, machine()) + + return '{} ({})'.format(platform, machine()) + + # Unknown platform + 
return platform + + +def get_python_platform(): + import platform + return platform.platform() + + +def get_libc_version(): + import platform + if get_platform() != 'linux': + return 'N/A' + return '-'.join(platform.libc_ver()) + + +def get_pip_packages(run_lambda, patterns=None): + """Return `pip list` output. Note: will also find conda-installed pytorch and numpy packages.""" + if patterns is None: + patterns = DEFAULT_PIP_PATTERNS + + # People generally have `pip` as `pip` or `pip3` + # But here it is invoked as `python -mpip` + def run_with_pip(pip): + out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"]) + return "\n".join(line for line in out.splitlines() + if any(name in line for name in patterns)) + + pip_version = 'pip3' if sys.version[0] == '3' else 'pip' + out = run_with_pip([sys.executable, '-mpip']) + + return pip_version, out + + +def get_cachingallocator_config(): + ca_config = os.environ.get('PYTORCH_CUDA_ALLOC_CONF', '') + return ca_config + + +def get_cuda_module_loading_config(): + if TORCH_AVAILABLE and torch.cuda.is_available(): + torch.cuda.init() + config = os.environ.get('CUDA_MODULE_LOADING', '') + return config + else: + return "N/A" + + +def is_xnnpack_available(): + if TORCH_AVAILABLE: + import torch.backends.xnnpack + return str( + torch.backends.xnnpack.enabled) # type: ignore[attr-defined] + else: + return "N/A" + +def get_env_vars(): + env_vars = '' + secret_terms=('secret', 'token', 'api', 'access', 'password') + report_prefix = ("TORCH", "NCCL", "PYTORCH", + "CUDA", "CUBLAS", "CUDNN", + "OMP_", "MKL_", + "NVIDIA") + for k, v in os.environ.items(): + if any(term in k.lower() for term in secret_terms): + continue + if k in environment_variables: + env_vars = env_vars + "{}={}".format(k, v) + "\n" + if k.startswith(report_prefix): + env_vars = env_vars + "{}={}".format(k, v) + "\n" + + return env_vars + +def get_env_info(): + run_lambda = run + pip_version, pip_list_output = get_pip_packages(run_lambda) + + if 
TORCH_AVAILABLE: + version_str = torch.__version__ + debug_mode_str = str(torch.version.debug) + cuda_available_str = str(torch.cuda.is_available()) + cuda_version_str = torch.version.cuda + if not hasattr(torch.version, + 'hip') or torch.version.hip is None: # cuda version + hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A' + else: # HIP version + + def get_version_or_na(cfg, prefix): + _lst = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s] + return _lst[0] if _lst else 'N/A' + + cfg = torch._C._show_config().split('\n') + hip_runtime_version = get_version_or_na(cfg, 'HIP Runtime') + miopen_runtime_version = get_version_or_na(cfg, 'MIOpen') + cuda_version_str = 'N/A' + hip_compiled_version = torch.version.hip + else: + version_str = debug_mode_str = cuda_available_str = cuda_version_str = 'N/A' + hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A' + + sys_version = sys.version.replace("\n", " ") + + conda_packages = get_conda_packages(run_lambda) + + rocm_version = get_rocm_version(run_lambda) + neuron_sdk_version = get_neuron_sdk_version(run_lambda) + vllm_version = get_vllm_version() + vllm_build_flags = summarize_vllm_build_flags() + gpu_topo = get_gpu_topo(run_lambda) + + return SystemEnv( + torch_version=version_str, + is_debug_build=debug_mode_str, + python_version='{} ({}-bit runtime)'.format( + sys_version, + sys.maxsize.bit_length() + 1), + python_platform=get_python_platform(), + is_cuda_available=cuda_available_str, + cuda_compiled_version=cuda_version_str, + cuda_runtime_version=get_running_cuda_version(run_lambda), + cuda_module_loading=get_cuda_module_loading_config(), + nvidia_gpu_models=get_gpu_info(run_lambda), + nvidia_driver_version=get_nvidia_driver_version(run_lambda), + cudnn_version=get_cudnn_version(run_lambda), + hip_compiled_version=hip_compiled_version, + hip_runtime_version=hip_runtime_version, + miopen_runtime_version=miopen_runtime_version, + pip_version=pip_version, + 
pip_packages=pip_list_output, + conda_packages=conda_packages, + os=get_os(run_lambda), + libc_version=get_libc_version(), + gcc_version=get_gcc_version(run_lambda), + clang_version=get_clang_version(run_lambda), + cmake_version=get_cmake_version(run_lambda), + caching_allocator_config=get_cachingallocator_config(), + is_xnnpack_available=is_xnnpack_available(), + cpu_info=get_cpu_info(run_lambda), + rocm_version=rocm_version, + neuron_sdk_version=neuron_sdk_version, + vllm_version=vllm_version, + vllm_build_flags=vllm_build_flags, + gpu_topo=gpu_topo, + env_vars=get_env_vars(), + ) + + +env_info_fmt = """ +PyTorch version: {torch_version} +Is debug build: {is_debug_build} +CUDA used to build PyTorch: {cuda_compiled_version} +ROCM used to build PyTorch: {hip_compiled_version} + +OS: {os} +GCC version: {gcc_version} +Clang version: {clang_version} +CMake version: {cmake_version} +Libc version: {libc_version} + +Python version: {python_version} +Python platform: {python_platform} +Is CUDA available: {is_cuda_available} +CUDA runtime version: {cuda_runtime_version} +CUDA_MODULE_LOADING set to: {cuda_module_loading} +GPU models and configuration: {nvidia_gpu_models} +Nvidia driver version: {nvidia_driver_version} +cuDNN version: {cudnn_version} +HIP runtime version: {hip_runtime_version} +MIOpen runtime version: {miopen_runtime_version} +Is XNNPACK available: {is_xnnpack_available} + +CPU: +{cpu_info} + +Versions of relevant libraries: +{pip_packages} +{conda_packages} +""".strip() + +# both the above code and the following code use `strip()` to +# remove leading/trailing whitespaces, so we need to add a newline +# in between to separate the two sections +env_info_fmt += "\n" + +env_info_fmt += """ +ROCM Version: {rocm_version} +Neuron SDK Version: {neuron_sdk_version} +vLLM Version: {vllm_version} +vLLM Build Flags: +{vllm_build_flags} +GPU Topology: +{gpu_topo} + +{env_vars} +""".strip() + + +def pretty_str(envinfo): + + def replace_nones(dct, replacement='Could not 
collect'): + for key in dct.keys(): + if dct[key] is not None: + continue + dct[key] = replacement + return dct + + def replace_bools(dct, true='Yes', false='No'): + for key in dct.keys(): + if dct[key] is True: + dct[key] = true + elif dct[key] is False: + dct[key] = false + return dct + + def prepend(text, tag='[prepend]'): + lines = text.split('\n') + updated_lines = [tag + line for line in lines] + return '\n'.join(updated_lines) + + def replace_if_empty(text, replacement='No relevant packages'): + if text is not None and len(text) == 0: + return replacement + return text + + def maybe_start_on_next_line(string): + # If `string` is multiline, prepend a \n to it. + if string is not None and len(string.split('\n')) > 1: + return '\n{}\n'.format(string) + return string + + mutable_dict = envinfo._asdict() + + # If nvidia_gpu_models is multiline, start on the next line + mutable_dict['nvidia_gpu_models'] = \ + maybe_start_on_next_line(envinfo.nvidia_gpu_models) + + # If the machine doesn't have CUDA, report some fields as 'No CUDA' + dynamic_cuda_fields = [ + 'cuda_runtime_version', + 'nvidia_gpu_models', + 'nvidia_driver_version', + ] + all_cuda_fields = dynamic_cuda_fields + ['cudnn_version'] + all_dynamic_cuda_fields_missing = all(mutable_dict[field] is None + for field in dynamic_cuda_fields) + if TORCH_AVAILABLE and not torch.cuda.is_available( + ) and all_dynamic_cuda_fields_missing: + for field in all_cuda_fields: + mutable_dict[field] = 'No CUDA' + if envinfo.cuda_compiled_version is None: + mutable_dict['cuda_compiled_version'] = 'None' + + # Replace True with Yes, False with No + mutable_dict = replace_bools(mutable_dict) + + # Replace all None objects with 'Could not collect' + mutable_dict = replace_nones(mutable_dict) + + # If either of these are '', replace with 'No relevant packages' + mutable_dict['pip_packages'] = replace_if_empty( + mutable_dict['pip_packages']) + mutable_dict['conda_packages'] = replace_if_empty( + mutable_dict['conda_packages']) 
+ + # Tag conda and pip packages with a prefix + # If they were previously None, they'll show up as ie '[conda] Could not collect' + if mutable_dict['pip_packages']: + mutable_dict['pip_packages'] = prepend( + mutable_dict['pip_packages'], '[{}] '.format(envinfo.pip_version)) + if mutable_dict['conda_packages']: + mutable_dict['conda_packages'] = prepend( + mutable_dict['conda_packages'], '[conda] ') + mutable_dict['cpu_info'] = envinfo.cpu_info + return env_info_fmt.format(**mutable_dict) + + +def get_pretty_env_info(): + return pretty_str(get_env_info()) + + +def main(): + print("Collecting environment information...") + output = get_pretty_env_info() + print(output) + + if TORCH_AVAILABLE and hasattr(torch, 'utils') and hasattr( + torch.utils, '_crash_handler'): + minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR + if sys.platform == "linux" and os.path.exists(minidump_dir): + dumps = [ + os.path.join(minidump_dir, dump) + for dump in os.listdir(minidump_dir) + ] + latest = max(dumps, key=os.path.getctime) + ctime = os.path.getctime(latest) + creation_time = datetime.datetime.fromtimestamp(ctime).strftime( + '%Y-%m-%d %H:%M:%S') + msg = "\n*** Detected a minidump at {} created on {}, ".format(latest, creation_time) + \ + "if this is related to your bug please include it when you file a report ***" + print(msg, file=sys.stderr) + + +if __name__ == '__main__': + main() diff --git a/vllm-v0.6.2/docs/Makefile b/vllm-v0.6.2/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/vllm-v0.6.2/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". 
+help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/vllm-v0.6.2/docs/README.md b/vllm-v0.6.2/docs/README.md new file mode 100644 index 0000000..46488c9 --- /dev/null +++ b/vllm-v0.6.2/docs/README.md @@ -0,0 +1,19 @@ +# vLLM documents + +## Build the docs + +```bash +# Install dependencies. +pip install -r requirements-docs.txt + +# Build the docs. +make clean +make html +``` + +## Open the docs with your browser + +```bash +python -m http.server -d build/html/ +``` +Launch your browser and open localhost:8000. diff --git a/vllm-v0.6.2/docs/make.bat b/vllm-v0.6.2/docs/make.bat new file mode 100644 index 0000000..747ffb7 --- /dev/null +++ b/vllm-v0.6.2/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/vllm-v0.6.2/docs/requirements-docs.txt b/vllm-v0.6.2/docs/requirements-docs.txt new file mode 100644 index 0000000..e3e3584 --- /dev/null +++ b/vllm-v0.6.2/docs/requirements-docs.txt @@ -0,0 +1,19 @@ +sphinx==6.2.1 +sphinx-book-theme==1.0.1 +sphinx-copybutton==0.5.2 +myst-parser==2.0.0 +sphinx-argparse==0.4.0 +msgspec +cloudpickle + +# packages to install to build the documentation +pydantic >= 2.8 +-f https://download.pytorch.org/whl/cpu +torch +py-cpuinfo +transformers +mistral_common >= 1.3.4 +aiohttp +starlette +openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args +partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args \ No newline at end of file diff --git a/vllm-v0.6.2/docs/source/_static/custom.js b/vllm-v0.6.2/docs/source/_static/custom.js new file mode 100644 index 0000000..18b502c --- /dev/null +++ b/vllm-v0.6.2/docs/source/_static/custom.js @@ -0,0 +1,18 @@ +document.addEventListener("DOMContentLoaded", function () { + var script = document.createElement("script"); + script.type = "module"; + script.id = "runllm-widget-script" + + script.src = "https://widget.runllm.com"; + + script.setAttribute("version", "stable"); + script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. 
+ script.setAttribute("runllm-name", "vLLM"); + script.setAttribute("runllm-position", "BOTTOM_RIGHT"); + script.setAttribute("runllm-position-y", "20%"); + script.setAttribute("runllm-position-x", "3%"); + script.setAttribute("runllm-assistant-id", "207"); + + script.async = true; + document.head.appendChild(script); + }); \ No newline at end of file diff --git a/vllm-v0.6.2/docs/source/_templates/sections/header.html b/vllm-v0.6.2/docs/source/_templates/sections/header.html new file mode 100644 index 0000000..7174431 --- /dev/null +++ b/vllm-v0.6.2/docs/source/_templates/sections/header.html @@ -0,0 +1,39 @@ + + +
+

You are viewing the latest developer preview docs. Click here to view docs for the latest stable release.

+
diff --git a/vllm-v0.6.2/docs/source/assets/design/hierarchy.png b/vllm-v0.6.2/docs/source/assets/design/hierarchy.png new file mode 100644 index 0000000..6a1b4ba Binary files /dev/null and b/vllm-v0.6.2/docs/source/assets/design/hierarchy.png differ diff --git a/vllm-v0.6.2/docs/source/assets/dev/dockerfile-stages-dependency.png b/vllm-v0.6.2/docs/source/assets/dev/dockerfile-stages-dependency.png new file mode 100644 index 0000000..b016531 Binary files /dev/null and b/vllm-v0.6.2/docs/source/assets/dev/dockerfile-stages-dependency.png differ diff --git a/vllm-v0.6.2/docs/source/assets/kernel/k_vecs.png b/vllm-v0.6.2/docs/source/assets/kernel/k_vecs.png new file mode 100644 index 0000000..4b7be13 Binary files /dev/null and b/vllm-v0.6.2/docs/source/assets/kernel/k_vecs.png differ diff --git a/vllm-v0.6.2/docs/source/assets/kernel/key.png b/vllm-v0.6.2/docs/source/assets/kernel/key.png new file mode 100644 index 0000000..2059b60 Binary files /dev/null and b/vllm-v0.6.2/docs/source/assets/kernel/key.png differ diff --git a/vllm-v0.6.2/docs/source/assets/kernel/logits_vec.png b/vllm-v0.6.2/docs/source/assets/kernel/logits_vec.png new file mode 100644 index 0000000..373eea4 Binary files /dev/null and b/vllm-v0.6.2/docs/source/assets/kernel/logits_vec.png differ diff --git a/vllm-v0.6.2/docs/source/assets/kernel/q_vecs.png b/vllm-v0.6.2/docs/source/assets/kernel/q_vecs.png new file mode 100644 index 0000000..f55b374 Binary files /dev/null and b/vllm-v0.6.2/docs/source/assets/kernel/q_vecs.png differ diff --git a/vllm-v0.6.2/docs/source/assets/kernel/query.png b/vllm-v0.6.2/docs/source/assets/kernel/query.png new file mode 100644 index 0000000..e2d15eb Binary files /dev/null and b/vllm-v0.6.2/docs/source/assets/kernel/query.png differ diff --git a/vllm-v0.6.2/docs/source/assets/kernel/v_vec.png b/vllm-v0.6.2/docs/source/assets/kernel/v_vec.png new file mode 100644 index 0000000..75d344a Binary files /dev/null and b/vllm-v0.6.2/docs/source/assets/kernel/v_vec.png differ 
diff --git a/vllm-v0.6.2/docs/source/assets/kernel/value.png b/vllm-v0.6.2/docs/source/assets/kernel/value.png new file mode 100644 index 0000000..56b0b9e Binary files /dev/null and b/vllm-v0.6.2/docs/source/assets/kernel/value.png differ diff --git a/vllm-v0.6.2/docs/source/assets/logos/vllm-logo-only-light.png b/vllm-v0.6.2/docs/source/assets/logos/vllm-logo-only-light.png new file mode 100644 index 0000000..7aaf174 Binary files /dev/null and b/vllm-v0.6.2/docs/source/assets/logos/vllm-logo-only-light.png differ diff --git a/vllm-v0.6.2/docs/source/assets/logos/vllm-logo-text-dark.png b/vllm-v0.6.2/docs/source/assets/logos/vllm-logo-text-dark.png new file mode 100644 index 0000000..959a42f Binary files /dev/null and b/vllm-v0.6.2/docs/source/assets/logos/vllm-logo-text-dark.png differ diff --git a/vllm-v0.6.2/docs/source/assets/logos/vllm-logo-text-light.png b/vllm-v0.6.2/docs/source/assets/logos/vllm-logo-text-light.png new file mode 100644 index 0000000..1ead997 Binary files /dev/null and b/vllm-v0.6.2/docs/source/assets/logos/vllm-logo-text-light.png differ diff --git a/vllm-v0.6.2/docs/source/automatic_prefix_caching/apc.rst b/vllm-v0.6.2/docs/source/automatic_prefix_caching/apc.rst new file mode 100644 index 0000000..0d70c74 --- /dev/null +++ b/vllm-v0.6.2/docs/source/automatic_prefix_caching/apc.rst @@ -0,0 +1,110 @@ +.. _apc: + +Introduction +============ + +What is Automatic Prefix Caching +-------------------------------- + +Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part. + + +.. note:: + + Technical details on how vLLM implements APC are in the next page. + + + +Enabling APC in vLLM +-------------------- + +Set ``enable_prefix_caching=True`` in vLLM engine to enable APC. Here is an example: + +.. 
code-block:: python + + import time + from vllm import LLM, SamplingParams + + + # A prompt containing a large markdown table. The table is randomly generated by GPT-4. + LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """ + | ID | Name | Age | Occupation | Country | Email | Phone Number | Address | + |-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------| + | 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL | + | 2 | Jane Smith | 34 | Doctor | Canada | jane.smith@example.com | 555-5678 | 456 Oak St, Toronto, ON | + | 3 | Alice Johnson | 27 | Teacher | UK | alice.j@example.com | 555-8765 | 789 Pine St, London, UK | + | 4 | Bob Brown | 45 | Artist | Australia | bob.b@example.com | 555-4321 | 321 Maple St, Sydney, NSW | + | 5 | Carol White | 31 | Scientist | New Zealand | carol.w@example.com | 555-6789 | 654 Birch St, Wellington, NZ | + | 6 | Dave Green | 28 | Lawyer | Ireland | dave.g@example.com | 555-3456 | 987 Cedar St, Dublin, IE | + | 7 | Emma Black | 40 | Musician | USA | emma.b@example.com | 555-1111 | 246 Ash St, New York, NY | + | 8 | Frank Blue | 37 | Chef | Canada | frank.b@example.com | 555-2222 | 135 Spruce St, Vancouver, BC | + | 9 | Grace Yellow | 50 | Engineer | UK | grace.y@example.com | 555-3333 | 864 Fir St, Manchester, UK | + | 10 | Henry Violet | 32 | Artist | Australia | henry.v@example.com | 555-4444 | 753 Willow St, Melbourne, VIC| + | 11 | Irene Orange | 26 | Scientist | New Zealand | irene.o@example.com | 555-5555 | 912 Poplar St, Auckland, NZ | + | 12 | Jack Indigo | 38 | Teacher | Ireland | jack.i@example.com | 555-6666 | 159 Elm St, Cork, IE | + | 13 | Karen Red | 41 | Lawyer | USA | karen.r@example.com | 555-7777 | 357 Cedar St, Boston, MA | + | 14 | Leo Brown | 30 | Chef | Canada | leo.b@example.com | 555-8888 | 246 
Oak St, Calgary, AB | + | 15 | Mia Green | 33 | Musician | UK | mia.g@example.com | 555-9999 | 975 Pine St, Edinburgh, UK | + | 16 | Noah Yellow | 29 | Doctor | Australia | noah.y@example.com | 555-0000 | 864 Birch St, Brisbane, QLD | + | 17 | Olivia Blue | 35 | Engineer | New Zealand | olivia.b@example.com | 555-1212 | 753 Maple St, Hamilton, NZ | + | 18 | Peter Black | 42 | Artist | Ireland | peter.b@example.com | 555-3434 | 912 Fir St, Limerick, IE | + | 19 | Quinn White | 28 | Scientist | USA | quinn.w@example.com | 555-5656 | 159 Willow St, Seattle, WA | + | 20 | Rachel Red | 31 | Teacher | Canada | rachel.r@example.com | 555-7878 | 357 Poplar St, Ottawa, ON | + | 21 | Steve Green | 44 | Lawyer | UK | steve.g@example.com | 555-9090 | 753 Elm St, Birmingham, UK | + | 22 | Tina Blue | 36 | Musician | Australia | tina.b@example.com | 555-1213 | 864 Cedar St, Perth, WA | + | 23 | Umar Black | 39 | Chef | New Zealand | umar.b@example.com | 555-3435 | 975 Spruce St, Christchurch, NZ| + | 24 | Victor Yellow | 43 | Engineer | Ireland | victor.y@example.com | 555-5657 | 246 Willow St, Galway, IE | + | 25 | Wendy Orange | 27 | Artist | USA | wendy.o@example.com | 555-7879 | 135 Elm St, Denver, CO | + | 26 | Xavier Green | 34 | Scientist | Canada | xavier.g@example.com | 555-9091 | 357 Oak St, Montreal, QC | + | 27 | Yara Red | 41 | Teacher | UK | yara.r@example.com | 555-1214 | 975 Pine St, Leeds, UK | + | 28 | Zack Blue | 30 | Lawyer | Australia | zack.b@example.com | 555-3436 | 135 Birch St, Adelaide, SA | + | 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ | + | 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE | + """ + + + def get_generation_time(llm, sampling_params, prompts): + # time the generation + start_time = time.time() + output = llm.generate(prompts, sampling_params=sampling_params) + end_time = time.time() + # print the output and generation time + 
print(f"Output: {output[0].outputs[0].text}") + print(f"Generation time: {end_time - start_time} seconds.") + + + # set enable_prefix_caching=True to enable APC + llm = LLM( + model='lmsys/longchat-13b-16k', + enable_prefix_caching=True + ) + + sampling_params = SamplingParams(temperature=0, max_tokens=100) + + # Querying the age of John Doe + get_generation_time( + llm, + sampling_params, + LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ", + ) + + # Querying the age of Zack Blue + # This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again. + get_generation_time( + llm, + sampling_params, + LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ", + ) + +Example workloads +----------------- + +We describe two example workloads, where APC can provide huge performance benefit: + +- Long document query, where the user repeatedly queries the same long document (e.g. software manual or annual report) with different queries. In this case, instead of processing the long document again and again, APC allows vLLM to process this long document *only once*, and all future requests can avoid recomputing this long document by reusing its KV cache. This allows vLLM to serve future requests with much higher throughput and much lower latency. +- Multi-round conversation, where the user may chat with the application multiple times in the same chatting session. In this case, instead of processing the whole chatting history again and again, APC allows vLLM to reuse the processing results of the chat history across all future rounds of conversation, allowing vLLM to serve future requests with much higher throughput and much lower latency. + + +Limits +------ +APC in general does not reduce the performance of vLLM. 
With that being said, APC only reduces the time of processing the queries (the prefilling phase) and does not reduce the time of generating new tokens (the decoding phase). So APC does not bring performance gain when vLLM spends most of the time generating answers to the queries (e.g. when the length of the answer is long), or new queries do not share the same prefix with any of existing queries (so that the computation cannot be reused). diff --git a/vllm-v0.6.2/docs/source/automatic_prefix_caching/details.md b/vllm-v0.6.2/docs/source/automatic_prefix_caching/details.md new file mode 100644 index 0000000..2d3214e --- /dev/null +++ b/vllm-v0.6.2/docs/source/automatic_prefix_caching/details.md @@ -0,0 +1,43 @@ +# Implementation + +The core idea of PagedAttention is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand. + +To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block. + +``` + Block 1 Block 2 Block 3 + [A gentle breeze stirred] [the leaves as children] [laughed in the distance] +Block 1: |<--- block tokens ---->| +Block 2: |<------- prefix ------>| |<--- block tokens --->| +Block 3: |<------------------ prefix -------------------->| |<--- block tokens ---->| +``` + + +In the example above, the KV cache in the first block can be uniquely identified with the tokens “A gentle breeze stirred”. The third block can be uniquely identified with the tokens in the block “laughed in the distance”, along with the prefix tokens “A gentle breeze stirred the leaves as children”. 
Therefore, we can build the following one-to-one mapping: + +``` +hash(prefix tokens + block tokens) <--> KV Block +``` + +With this mapping, we can add another indirection in vLLM’s KV cache management. Previously, each sequence in vLLM maintained a mapping from its logical KV blocks to physical blocks. To achieve automatic caching of KV blocks, we map the logical KV blocks to their hash value and maintain a global hash table of all the physical blocks. In this way, all the KV blocks sharing the same hash value (e.g., shared prefix blocks across two requests) can be mapped to the same physical block and share the memory space. + + +This design achieves automatic prefix caching without the need to maintain a tree structure among the KV blocks. More specifically, all of the blocks are independent of each other and can be allocated and freed independently, which enables us to manage the KV cache like an ordinary cache in an operating system. + + +# Generalized Caching Policy + +Keeping all the KV blocks in a hash table enables vLLM to cache KV blocks from earlier requests to save memory and accelerate the computation of future requests. For example, if a new request shares the system prompt with the previous request, the KV cache of the shared prompt can directly be used for the new request without recomputation. However, the total KV cache space is limited and we have to decide which KV blocks to keep or evict when the cache is full. + +Managing KV cache with a hash table allows us to implement flexible caching policies. As an example, in current vLLM, we implement the following eviction policy: + +* When there are no free blocks left, we will evict a KV block whose reference count (i.e., number of current requests using the block) equals 0. +* If there are multiple blocks whose reference count equals 0, we prioritize evicting the least recently used block (LRU). 
+* If there are multiple blocks whose last access times are the same, we prioritize the eviction of the block that is at the end of the longest prefix (i.e., has the maximum number of blocks before it). + +Note that this eviction policy effectively implements the same policy as [RadixAttention](https://lmsys.org/blog/2024-01-17-sglang/) when applied to models with full attention, which prioritizes evicting leaf nodes in the prefix tree that have a reference count of zero and are least recently used. + +However, the hash-based KV cache management gives us the flexibility to handle more complicated serving scenarios and implement more complicated eviction policies beyond the policy above: + +- Multi-LoRA serving. When serving requests for multiple LoRA adapters, we can simply let the hash of each KV block also include the LoRA ID the request is querying for to enable caching for all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency. +- Multi-modal models. When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, we can use perceptual hashing for images to cache similar input images. diff --git a/vllm-v0.6.2/docs/source/community/meetups.rst b/vllm-v0.6.2/docs/source/community/meetups.rst new file mode 100644 index 0000000..c87f01a --- /dev/null +++ b/vllm-v0.6.2/docs/source/community/meetups.rst @@ -0,0 +1,16 @@ +.. _meetups: + +vLLM Meetups +============ + +We host regular meetups in the San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: + +- `The seventh vLLM meetup `__, with Snowflake, November 14th 2024. `[Slides] `__ +- `The sixth vLLM meetup `__, with NVIDIA, September 9th 2024. 
`[Slides] `__ +- `The fifth vLLM meetup `__, with AWS, July 24th 2024. `[Slides] `__ +- `The fourth vLLM meetup `__, with Cloudflare and BentoML, June 11th 2024. `[Slides] `__ +- `The third vLLM meetup `__, with Roblox, April 2nd 2024. `[Slides] `__ +- `The second vLLM meetup `__, with IBM Research, January 31st 2024. `[Slides] `__ `[Video (vLLM Update)] `__ `[Video (IBM Research & torch.compile)] `__ +- `The first vLLM meetup `__, with a16z, October 5th 2023. `[Slides] `__ + +We are always looking for speakers and sponsors at San Francisco Bay Area and potentially other locations. If you are interested in speaking or sponsoring, please contact us at `vllm-questions@lists.berkeley.edu `__. diff --git a/vllm-v0.6.2/docs/source/community/sponsors.md b/vllm-v0.6.2/docs/source/community/sponsors.md new file mode 100644 index 0000000..52fbf9a --- /dev/null +++ b/vllm-v0.6.2/docs/source/community/sponsors.md @@ -0,0 +1,29 @@ +# Sponsors + +vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support! + + + + +- a16z +- AMD +- Anyscale +- AWS +- Crusoe Cloud +- Databricks +- DeepInfra +- Dropbox +- Google Cloud +- Lambda Lab +- NVIDIA +- Replicate +- Roblox +- RunPod +- Sequoia Capital +- Skywork AI +- Trainy +- UC Berkeley +- UC San Diego +- ZhenFund + +We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. diff --git a/vllm-v0.6.2/docs/source/conf.py b/vllm-v0.6.2/docs/source/conf.py new file mode 100644 index 0000000..96ad9a4 --- /dev/null +++ b/vllm-v0.6.2/docs/source/conf.py @@ -0,0 +1,156 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. 
For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. + +import logging +import os +import sys +from typing import List + +from sphinx.ext import autodoc + +logger = logging.getLogger(__name__) +sys.path.append(os.path.abspath("../..")) + +# -- Project information ----------------------------------------------------- + +project = 'vLLM' +copyright = '2024, vLLM Team' +author = 'the vLLM Team' + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", + "sphinx.ext.intersphinx", + "sphinx_copybutton", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "myst_parser", + "sphinxarg.ext", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns: List[str] = ["**/*.template.rst"] + +# Exclude the prompt "$" when copying code +copybutton_prompt_text = r"\$ " +copybutton_prompt_is_regexp = True + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. 
+# +html_title = project +html_theme = 'sphinx_book_theme' +html_logo = 'assets/logos/vllm-logo-text-light.png' +html_theme_options = { + 'path_to_docs': 'docs/source', + 'repository_url': 'https://github.com/vllm-project/vllm', + 'use_repository_button': True, + 'use_edit_page_button': True, +} +html_static_path = ["_static"] +html_js_files = ["custom.js"] + +# see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa +READTHEDOCS_VERSION_TYPE = os.environ.get('READTHEDOCS_VERSION_TYPE') +if READTHEDOCS_VERSION_TYPE == "tag": + # remove the warning banner if the version is a tagged release + header_file = os.path.join(os.path.dirname(__file__), + "_templates/sections/header.html") + # The file might be removed already if the build is triggered multiple times + # (readthedocs build both HTML and PDF versions separately) + if os.path.exists(header_file): + os.remove(header_file) + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +# html_static_path = ['_static'] + + +# Generate additional rst documentation here. +def setup(app): + from docs.source.generate_examples import generate_examples + generate_examples() + + +# Mock out external dependencies here, otherwise the autodoc pages may be blank. 
+autodoc_mock_imports = [ + "compressed_tensors", + "cpuinfo", + "cv2", + "torch", + "transformers", + "psutil", + "prometheus_client", + "sentencepiece", + "vllm._C", + "PIL", + "numpy", + 'triton', + "tqdm", + "tensorizer", + "pynvml", + "outlines", + "librosa", + "soundfile", + "gguf", + "lark", + "decord", +] + +for mock_target in autodoc_mock_imports: + if mock_target in sys.modules: + logger.info( + "Potentially problematic mock target (%s) found; " + "autodoc_mock_imports cannot mock modules that have already " + "been loaded into sys.modules when the sphinx build starts.", + mock_target) + + +class MockedClassDocumenter(autodoc.ClassDocumenter): + """Remove note about base class when a class is derived from object.""" + + def add_line(self, line: str, source: str, *lineno: int) -> None: + if line == " Bases: :py:class:`object`": + return + super().add_line(line, source, *lineno) + + +autodoc.ClassDocumenter = MockedClassDocumenter + +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "typing_extensions": + ("https://typing-extensions.readthedocs.io/en/latest", None), + "aiohttp": ("https://docs.aiohttp.org/en/stable", None), + "pillow": ("https://pillow.readthedocs.io/en/stable", None), + "numpy": ("https://numpy.org/doc/stable", None), + "torch": ("https://pytorch.org/docs/stable", None), + "psutil": ("https://psutil.readthedocs.io/en/stable", None), +} + +autodoc_preserve_defaults = True +autodoc_warningiserror = True + +navigation_with_keys = False diff --git a/vllm-v0.6.2/docs/source/contributing/dockerfile/dockerfile.rst b/vllm-v0.6.2/docs/source/contributing/dockerfile/dockerfile.rst new file mode 100644 index 0000000..9c17c27 --- /dev/null +++ b/vllm-v0.6.2/docs/source/contributing/dockerfile/dockerfile.rst @@ -0,0 +1,50 @@ +Dockerfile +==================== + +See `here `__ for the main Dockerfile to construct +the image for running an OpenAI compatible server with vLLM. 
More information about deploying with Docker can be found `here `__. + +Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: + +- All build stages +- The default build target (highlighted in grey) +- External images (with dashed borders) + +The edges of the build graph represent: + +- FROM ... dependencies (with a solid line and a full arrow head) +- COPY --from=... dependencies (with a dashed line and an empty arrow head) +- RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head) + + .. figure:: ../../assets/dev/dockerfile-stages-dependency.png + :alt: query + :width: 100% + :align: center + + Made using: https://github.com/patrickhoefler/dockerfilegraph + + Commands to regenerate the build graph (make sure to run it **from the `root` directory of the vLLM repository** where the dockerfile is present): + + .. code:: bash + + dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile + + or in case you want to run it directly with the docker image: + + .. code:: bash + + docker run \ + --rm \ + --user "$(id -u):$(id -g)" \ + --workdir /workspace \ + --volume "$(pwd)":/workspace \ + ghcr.io/patrickhoefler/dockerfilegraph:alpine \ + --output png \ + --dpi 200 \ + --max-label-length 50 \ + --filename Dockerfile \ + --legend + + (To run it for a different file, you can pass in a different argument to the flag `--filename`.) + + \ No newline at end of file diff --git a/vllm-v0.6.2/docs/source/contributing/overview.rst b/vllm-v0.6.2/docs/source/contributing/overview.rst new file mode 100644 index 0000000..ac2d2b2 --- /dev/null +++ b/vllm-v0.6.2/docs/source/contributing/overview.rst @@ -0,0 +1,70 @@ +Contributing to vLLM +===================== + +Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. 
There are several ways you can contribute to the project: + +- Identify and report any issues or bugs. +- Request or add support for a new model. +- Suggest or implement new features. +- Improve documentation or contribute a how-to guide. + +We also believe in the power of community support; thus, answering queries, offering PR reviews, and assisting others are also highly regarded and beneficial contributions. + +Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository! + +License +------- + +See `LICENSE `_. + +Developing +---------- + +Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the `building from source `_ documentation for details. + +Testing +------- + +.. code-block:: bash + + pip install -r requirements-dev.txt + + # linting and formatting + bash format.sh + # Static type checking + mypy + # Unit tests + pytest tests/ + +.. note:: Currently, the repository does not pass the ``mypy`` tests. + +Contribution Guidelines +======================= + +DCO and Signed-off-by +---------------------- + +When contributing changes to this project, you must agree to the `DCO `_. +Commits must include a ``Signed-off-by:`` header which certifies agreement with +the terms of the `DCO `_. + +Using ``-s`` with ``git commit`` will automatically add this header. + +Issues +------ + +If you encounter a bug or have a feature request, please `search existing issues `_ first to see if it has already been reported. If not, please `file a new issue `_, providing as much relevant information as possible. + +.. important:: + If you discover a security vulnerability, please follow the instructions `here `_. 
+ +Pull Requests & Code Reviews +---------------------------- + +Please check the PR checklist in the `PR template `_ for a detailed guide for contribution. + +Thank You +--------- + +Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. +All of your contributions help make vLLM a great tool and community for everyone! diff --git a/vllm-v0.6.2/docs/source/contributing/profiling/profiling_index.rst b/vllm-v0.6.2/docs/source/contributing/profiling/profiling_index.rst new file mode 100644 index 0000000..a422b1f --- /dev/null +++ b/vllm-v0.6.2/docs/source/contributing/profiling/profiling_index.rst @@ -0,0 +1,48 @@ +============== +Profiling vLLM +============== + +We support tracing vLLM workers using the ``torch.profiler`` module. You can enable tracing by setting the ``VLLM_TORCH_PROFILER_DIR`` environment variable to the directory where you want to save the traces: ``VLLM_TORCH_PROFILER_DIR=/mnt/traces/`` + +The OpenAI server also needs to be started with the ``VLLM_TORCH_PROFILER_DIR`` environment variable set. + +When using ``benchmarks/benchmark_serving.py``, you can enable profiling by passing the ``--profile`` flag. + +.. warning:: + + Only enable profiling in a development environment. + + +Traces can be visualized using https://ui.perfetto.dev/. + +.. tip:: + + Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, there is no need to untar the traces; they can be viewed directly. + +.. tip:: + + When the profiler is stopped, it flushes all the profile trace files to the directory. This takes time: for example, for about 100 requests' worth of data for a Llama 70B model, it takes about 10 minutes to flush on an H100. + Set the environment variable ``VLLM_RPC_TIMEOUT`` to a large value before you start the server, for example 30 minutes (the value is given in milliseconds):
+ ``export VLLM_RPC_TIMEOUT=1800000`` + +Example commands and usage: +=========================== + +Offline Inference: +------------------ + +Refer to `examples/offline_inference_with_profiler.py `_ for an example. + + +OpenAI Server: +-------------- + +.. code-block:: bash + + VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B + +benchmark_serving.py: + +.. code-block:: bash + + python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Meta-Llama-3-70B --dataset-name sharegpt --dataset-path sharegpt.json --profile --num-prompts 2 \ No newline at end of file diff --git a/vllm-v0.6.2/docs/source/design/class_hierarchy.rst b/vllm-v0.6.2/docs/source/design/class_hierarchy.rst new file mode 100644 index 0000000..15f0c8c --- /dev/null +++ b/vllm-v0.6.2/docs/source/design/class_hierarchy.rst @@ -0,0 +1,72 @@ +vLLM's Class Hierarchy +======================= + +This document describes the class hierarchy of vLLM. We will explain the relationships between the core classes, their responsibilities, and the design choices behind them to make vLLM more modular and extensible. + +1. **Entrypoints**: vLLM has two entrypoints: `command line usage `__ with ``vllm serve`` for launching an OpenAI-API compatible server, and `library-style usage `__ with the ``vllm.LLM`` class for running inference in a Python script. These are user-facing entrypoints that end-users interact with. Under the hood, both create an engine object to handle model inference. + +2. **Engine**: Each vLLM instance contains one engine object, orchestrating and serving as the control plane for model inference. Depending on the configuration, the engine can create multiple workers to handle the inference workload. + +3. **Worker**: A worker is a process that runs the model inference. vLLM follows the common practice of using one process to control one accelerator device, such as GPUs. 
For example, if we use tensor parallelism of size 2 and pipeline parallelism of size 2, we will have 4 workers in total. Workers are identified by their ``rank`` and ``local_rank``. ``rank`` is used for global orchestration, while ``local_rank`` is mainly used for assigning the accelerator device and accessing local resources such as the file system and shared memory. + +4. **Model Runner**: Every worker has one model runner object, responsible for loading and running the model. Much of the model execution logic resides here, such as preparing input tensors and capturing cudagraphs. + +5. **Model**: Every model runner object has one model object, which is the actual ``torch.nn.Module`` instance. See :ref:`huggingface_integration` for how various configurations affect the class we ultimately get. + +The following figure shows the class hierarchy of vLLM: + + .. figure:: ../assets/design/hierarchy.png + :alt: query + :width: 100% + :align: center + +There are several important design choices behind this class hierarchy: + +1. **Extensibility**: All classes in the hierarchy accept a configuration object containing all the necessary information. The `VllmConfig `__ class is the main configuration object that is passed around. The class hierarchy is quite deep, and every class needs to read the configuration it is interested in. By encapsulating all configurations in one object, we can easily pass the configuration object around and access the configuration we need. Suppose we want to add a new feature (this is often the case given how fast the field of LLM inference is evolving) that only touches the model runner. We will have to add a new configuration option in the `VllmConfig` class. Since we pass the whole config object around, we only need to add the configuration option to the `VllmConfig` class, and the model runner can access it directly. We don't need to change the constructor of the engine, worker, or model class to pass the new configuration option. + +2. 
**Uniformity**: The model runner needs a unified interface to create and initialize the model. vLLM supports more than 50 types of popular open-source models. Each model has its own initialization logic. If the constructor signature varies with models, the model runner does not know how to call the constructor accordingly, without complicated and error-prone inspection logic. By making the constructor of the model class uniform, the model runner can easily create and initialize the model without knowing the specific model type. This is also useful for composing models. Vision-language models often consist of a vision model and a language model. By making the constructor uniform, we can easily create a vision model and a language model and compose them into a vision-language model. + +.. note:: + + To support this change, all vLLM models' signatures have been updated to: + + .. code-block:: python + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + + To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: + + .. code-block:: python + + class MyOldModel(nn.Module): + def __init__( + self, + config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + prefix: str = "", + ) -> None: + ... 
+ + from vllm.config import VllmConfig + class MyNewModel(MyOldModel): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + super().__init__(config, cache_config, quant_config, lora_config, prefix) + + if __version__ >= "0.6.4": + MyModel = MyNewModel + else: + MyModel = MyOldModel + + This way, the model can work with both old and new versions of vLLM. + +3. **Sharding and Quantization at Initialization**: Certain features require changing the model weights. For example, tensor parallelism needs to shard the model weights, and quantization needs to quantize the model weights. There are two possible ways to implement this feature. One way is to change the model weights after the model is initialized. The other way is to change the model weights during the model initialization. vLLM chooses the latter. The first approach is not scalable to large models. Suppose we want to run a 405B model (with roughly 810GB weights) with 16 H100 80GB GPUs. Ideally, every GPU should only load 50GB weights. If we change the model weights after the model is initialized, we need to load the full 810GB weights to every GPU and then shard the weights, leading to a huge memory overhead. Instead, if we shard the weights during the model initialization, every layer will only create a shard of the weights it needs, leading to a much smaller memory overhead. The same idea applies to quantization. Note that we also add an additional argument ``prefix`` to the model's constructor so that the model can initialize itself differently based on the prefix. This is useful for non-uniform quantization, where different parts of the model are quantized differently. The ``prefix`` is usually an empty string for the top-level model and a string like ``"vision"`` or ``"language"`` for the sub-models. 
In general, it matches the name of the module's state dict in the checkpoint file. + +One disadvantage of this design is that it is hard to write unit tests for individual components in vLLM because every component needs to be initialized by a complete config object. We solve this problem by providing a default initialization function that creates a default config object with all fields set to ``None``. If the component we want to test only cares about a few fields in the config object, we can create a default config object and set the fields we care about. This way, we can test the component in isolation. Note that many tests in vLLM are end-to-end tests that test the whole system, so this is not a big problem. + +In summary, the complete config object ``VllmConfig`` can be treated as an engine-level global state that is shared among all vLLM classes. diff --git a/vllm-v0.6.2/docs/source/design/huggingface_integration.rst b/vllm-v0.6.2/docs/source/design/huggingface_integration.rst new file mode 100644 index 0000000..e6c1cea --- /dev/null +++ b/vllm-v0.6.2/docs/source/design/huggingface_integration.rst @@ -0,0 +1,40 @@ +.. _huggingface_integration: + +Integration with HuggingFace +=================================== + +This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run ``vllm serve``. + +Let's say we want to serve the popular QWen model by running ``vllm serve Qwen/Qwen2-7B``. + +1. The ``model`` argument is ``Qwen/Qwen2-7B``. vLLM determines whether this model exists by checking for the corresponding config file ``config.json``. See this `code snippet `__ for the implementation. Within this process: + + - If the ``model`` argument corresponds to an existing local path, vLLM will load the config file directly from this path. 
+ + - If the ``model`` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the ``model`` argument as the model name and the ``--revision`` argument as the revision. See `their website `__ for more information on how the HuggingFace cache works. + + - If the ``model`` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to `this function `__ for the implementation. The input arguments include the ``model`` argument as the model name, the ``--revision`` argument as the revision, and the environment variable ``HF_TOKEN`` as the token to access the model hub. In our case, vLLM will download the `config.json `__ file. + +2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this `code snippet `__ for the implementation. + +3. Next, vLLM `inspects `__ the ``model_type`` field in the config dictionary to `generate `__ the config object to use. There are some ``model_type`` values that vLLM directly supports; see `here `__ for the list. If the ``model_type`` is not in the list, vLLM will use `AutoConfig.from_pretrained `__ to load the config class, with ``model``, ``--revision``, and ``--trust_remote_code`` as the arguments. Please note that: + + - HuggingFace also has its own logic to determine the config class to use. It will again use the ``model_type`` field to search for the class name in the transformers library; see `here `__ for the list of supported models. If the ``model_type`` is not found, HuggingFace will use the ``auto_map`` field from the config JSON file to determine the class name. Specifically, it is the ``AutoConfig`` field under ``auto_map``. See `DeepSeek `__ for an example. + + - The ``AutoConfig`` field under ``auto_map`` points to a module path in the model's repository. 
To create the config class, HuggingFace will import the module and use the ``from_pretrained`` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when ``--trust_remote_code`` is enabled. + +4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see `here `__ for the implementation. + +5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the ``architectures`` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in `its registry `__. If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For ``Qwen/Qwen2-7B``, the ``architectures`` field is ``["Qwen2ForCausalLM"]``, which corresponds to the ``Qwen2ForCausalLM`` class in `vLLM's code `__. This class will initialize itself depending on various configs. + +Beyond that, there are two more things vLLM depends on HuggingFace for. + +1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using `AutoTokenizer.from_pretrained `__ with the ``model`` argument as the model name and the ``--revision`` argument as the revision. It is also possible to use a tokenizer from another model by specifying the ``--tokenizer`` argument in the ``vllm serve`` command. Other relevant arguments are ``--tokenizer-revision`` and ``--tokenizer-mode``. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the `get_tokenizer `__ function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in `get_cached_tokenizer `__. + +2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the ``model`` argument as the model name and the ``--revision`` argument as the revision. 
vLLM provides the argument ``--load-format`` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass ``--load-format dummy`` to skip downloading the weights. + + - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the `documentation `__ for more information on the safetensors format. This part of the logic can be found `here `__. + +This completes the integration between vLLM and HuggingFace. + +In summary, vLLM reads the config file ``config.json``, tokenizer, and model weights from the HuggingFace model hub or a local directory. It uses the config class from vLLM or HuggingFace transformers, or loads the config class from the model's repository. diff --git a/vllm-v0.6.2/docs/source/design/input_processing/input_processing_pipeline.rst b/vllm-v0.6.2/docs/source/design/input_processing/input_processing_pipeline.rst new file mode 100644 index 0000000..48abec8 --- /dev/null +++ b/vllm-v0.6.2/docs/source/design/input_processing/input_processing_pipeline.rst @@ -0,0 +1,20 @@ +.. _input_processing_pipeline: + +Input Processing Pipeline +========================= + +1. Input data is passed to :class:`~vllm.LLMEngine` (or :class:`~vllm.AsyncLLMEngine`). + +2. Tokenize the data if necessary. + +3. Process the inputs using :meth:`INPUT_REGISTRY.process_input `. + + - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings. + +4. Send the processed inputs to :class:`~vllm.executor.executor_base.ExecutorBase`. + +5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunnerBase`. + +6.
If the data contains multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input `. + + - For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision model. diff --git a/vllm-v0.6.2/docs/source/design/input_processing/model_inputs_index.rst b/vllm-v0.6.2/docs/source/design/input_processing/model_inputs_index.rst new file mode 100644 index 0000000..f0ec1fe --- /dev/null +++ b/vllm-v0.6.2/docs/source/design/input_processing/model_inputs_index.rst @@ -0,0 +1,39 @@ +.. _input_processing: + +Input Processing +================ + +.. currentmodule:: vllm.inputs + +Each model can override parts of vLLM's :ref:`input processing pipeline ` via +:data:`~vllm.inputs.INPUT_REGISTRY` and :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`. + +Currently, this mechanism is only utilized in :ref:`multi-modal ` models for preprocessing multi-modal input +data in addition to input prompt, but it can be extended to text-only language models when needed. + +Guides +++++++ + +.. toctree:: + :maxdepth: 1 + + input_processing_pipeline + +Module Contents ++++++++++++++++ + +LLM Engine Inputs +----------------- + +.. autoclass:: vllm.inputs.DecoderOnlyInputs + :members: + :show-inheritance: + +Registry +-------- + +.. autodata:: vllm.inputs.INPUT_REGISTRY + +.. automodule:: vllm.inputs.registry + :members: + :show-inheritance: diff --git a/vllm-v0.6.2/docs/source/design/kernel/paged_attention.rst b/vllm-v0.6.2/docs/source/design/kernel/paged_attention.rst new file mode 100644 index 0000000..ba4f7a2 --- /dev/null +++ b/vllm-v0.6.2/docs/source/design/kernel/paged_attention.rst @@ -0,0 +1,525 @@ +vLLM Paged Attention +==================== + +- Currently, vLLM utilizes its own implementation of a multi-head query + attention kernel (``csrc/attention/attention_kernels.cu``). 
+ This kernel is designed to be compatible with + vLLM's paged KV caches, where the key and value cache are stored in + separate blocks (note that this block concept differs from the GPU + thread block, so in the rest of this document I will refer to a vLLM paged + attention block as "block" and to a GPU thread block as + "thread block"). +- To achieve high performance, this kernel relies on a specially + designed memory layout and access method, specifically when threads + read data from global memory to shared memory. The purpose of this + document is to provide a high-level explanation of the kernel + implementation step by step, aiding those who wish to learn about the + vLLM multi-head query attention kernel. After going through this + document, users will likely have a better understanding and find it easier + to follow the actual implementation. +- Please note that this document may not cover all details, such as how + to calculate the correct index for the corresponding data or the dot + product implementation. However, after reading this document + and becoming familiar with the high-level logic flow, it should be + easier for you to read the actual code and understand the details. + +Inputs +------ + +- The kernel function takes a list of arguments for the current thread + to perform its assigned work. The three most important arguments are + the input pointers ``q``, ``k_cache``, and ``v_cache``, which point + to query, key, and value data in global memory that need to be read + and processed. The output pointer ``out`` points to global memory + where the result should be written. These four pointers actually + refer to multi-dimensional arrays, but each thread only accesses the + portion of data assigned to it. I have omitted all other runtime + parameters here for simplicity. + + ..
code:: cpp + + template< + typename scalar_t, + int HEAD_SIZE, + int BLOCK_SIZE, + int NUM_THREADS, + int PARTITION_SIZE = 0> + __device__ void paged_attention_kernel( + ... // Other side args. + const scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size] + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] + const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] + ... // Other side args. + ) + +- There is also a list of template arguments above the function + signature that are determined at compile time. ``scalar_t`` + represents the data type of the query, key, and value data elements, + such as FP16. ``HEAD_SIZE`` indicates the number of elements in each + head. ``BLOCK_SIZE`` refers to the number of tokens in each block. + ``NUM_THREADS`` denotes the number of threads in each thread block. + ``PARTITION_SIZE`` represents the number of tensor parallel GPUs (For + simplicity, we assume this is 0 and tensor parallel is disabled). +- With these arguments, we need to perform a sequence of preparations. + This includes calculating the current head index, block index, and + other necessary variables. However, for now, we can ignore these + preparations and proceed directly to the actual calculations. It will + be easier to understand them once we grasp the entire flow. + +Concepts +-------- + +- Just before we dive into the calculation flow, I want to describe a + few concepts that are needed for later sections. However, you may + skip this section and return later if you encounter any confusing + terminology. +- **Sequence**: A sequence represents a client request. For example, + the data pointed to by ``q`` has a shape of + ``[num_seqs, num_heads, head_size]``. That means a total of + ``num_seqs`` query sequences are pointed to by ``q``.
Since this + kernel is a single query attention kernel, each sequence only has one + query token. Hence, the ``num_seqs`` equals the total number of tokens + that are processed in the batch. +- **Context**: The context consists of the generated tokens from the + sequence. For instance, ``["What", "is", "your"]`` are the context + tokens, and the input query token is ``"name"``. The model might + generate the token ``"?"``. +- **Vec**: The vec is a list of elements that are fetched and + calculated together. For query and key data, the vec size + (``VEC_SIZE``) is determined so that each thread group can fetch and + calculate 16 bytes of data at a time. For value data, the vec size + (``V_VEC_SIZE``) is determined so that each thread can fetch and + calculate 16 bytes of data at a time. For example, if the + ``scalar_t`` is FP16 (2 bytes) and ``THREAD_GROUP_SIZE`` is 2, the + ``VEC_SIZE`` will be 4, while the ``V_VEC_SIZE`` will be 8. +- **Thread group**: The thread group is a small group of + threads(\ ``THREAD_GROUP_SIZE``) that fetches and calculates one + query token and one key token at a time. Each thread handles only a + portion of the token data. The total number of elements processed by + one thread group is referred as ``x``. For example, if the thread + group contains 2 threads and the head size is 8, then thread 0 + handles the query and key elements at index 0, 2, 4, 6, while thread + 1 handles the elements at index 1, 3, 5, 7. +- **Block**: The key and value cache data in vLLM are split into + blocks. Each block stores data for a fixed number(\ ``BLOCK_SIZE``) + of tokens at one head. Each block may contain only a portion of the + whole context tokens. For example, if the block size is 16 and the + head size is 128, then for one head, one block can store 16 \* 128 = + 2048 elements. +- **Warp**: A warp is a group of 32 threads(\ ``WARP_SIZE``) that + execute simultaneously on a stream multiprocessor (SM). 
In this + kernel, each warp processes the calculation between one query token + and key tokens of one entire block at a time (it may process multiple + blocks in multiple iterations). For example, if there are 4 warps and + 6 blocks for one context, the assignment would be like warp 0 handles + the 0th, 4th blocks, warp 1 handles the 1st, 5th blocks, warp 2 + handles the 2nd block and warp 3 handles the 3rd block. +- **Thread block**: A thread block is a group of + threads(\ ``NUM_THREADS``) that can access the same shared memory. + Each thread block contains multiple warps(\ ``NUM_WARPS``), and in + this kernel, each thread block processes the calculation between one + query token and key tokens of a whole context. +- **Grid**: A grid is a collection of thread blocks and defines the + shape of the collection. In this kernel, the shape is + ``(num_heads, num_seqs, max_num_partitions)``. Therefore, each thread + block only handles the calculation for one head, one sequence, and + one partition. + +Query +----- + +- This section will introduce how query data is stored in memory and + fetched by each thread. As mentioned above, each thread group fetches + one query token data, while each thread itself only handles a part of + one query token data. Within each warp, every thread group will fetch + the same query token data, but will multiply it with different key + token data. + + .. code:: cpp + + const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; + + .. figure:: ../../assets/kernel/query.png + :alt: query + :width: 70% + :align: center + + Query data of one token at one head + +- Each thread defines its own ``q_ptr`` which points to the assigned + query token data on global memory. For example, if ``VEC_SIZE`` is 4 + and ``HEAD_SIZE`` is 128, the ``q_ptr`` points to data that contains + total of 128 elements divided into 128 / 4 = 32 vecs. + + .. 
figure:: ../../assets/kernel/q_vecs.png + :alt: q_vecs + :width: 70% + :align: center + + ``q_vecs`` for one thread group + + .. code:: cpp + + __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; + +- Next, we need to read the global memory data pointed to by ``q_ptr`` + into shared memory as ``q_vecs``. It is important to note that each + vecs is assigned to a different row. For example, if the + ``THREAD_GROUP_SIZE`` is 2, thread 0 will handle the 0th row vecs, + while thread 1 handles the 1st row vecs. By reading the query data in + this way, neighboring threads like thread 0 and thread 1 can read + neighbor memory, achieving the memory coalescing to improve + performance. + +Key +--- + +- Similar to the "Query" section, this section introduces memory layout + and assignment for keys. While each thread group only handle one + query token one kernel run, it may handle multiple key tokens across + multiple iterations. Meanwhile, each warp will process multiple blocks + of key tokens in multiple iterations, ensuring that all context + tokens are processed by the entire thread group after the kernel run. + In this context, "handle" refers to performing the dot multiplication + between query data and key data. + + .. code:: cpp + + const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride + + kv_head_idx * kv_head_stride + + physical_block_offset * x; + +- Unlike to ``q_ptr``, ``k_ptr`` in each thread will point to different + key token at different iterations. As shown above, that ``k_ptr`` + points to key token data based on ``k_cache`` at assigned block, + assigned head and assigned token. + + .. figure:: ../../assets/kernel/key.png + :alt: key + :width: 70% + :align: center + + Key data of all context tokens at one head + +- The diagram above illustrates the memory layout for key data. It + assumes that the ``BLOCK_SIZE`` is 16, ``HEAD_SIZE`` is 128, ``x`` is + 8, ``THREAD_GROUP_SIZE`` is 2, and there are a total of 4 warps. 
Each + rectangle represents all the elements for one key token at one head, + which will be processed by one thread group. The left half shows the + total 16 blocks of key token data for warp 0, while the right half + represents the remaining key token data for other warps or + iterations. Inside each rectangle, there are a total 32 vecs (128 + elements for one token) that will be processed by 2 threads (one + thread group) separately. + + .. figure:: ../../assets/kernel/k_vecs.png + :alt: k_vecs + :width: 70% + :align: center + + ``k_vecs`` for one thread + + .. code:: cpp + + K_vec k_vecs[NUM_VECS_PER_THREAD] + +- Next, we need to read the key token data from ``k_ptr`` and store + them on register memory as ``k_vecs``. We use register memory for + ``k_vecs`` because it will only be accessed by one thread once, + whereas ``q_vecs`` will be accessed by multiple threads multiple + times. Each ``k_vecs`` will contain multiple vectors for later + calculation. Each vec will be set at each inner iteration. The + assignment of vecs allows neighboring threads in a warp to read + neighboring memory together, which again promotes the memory + coalescing. For instance, thread 0 will read vec 0, while thread 1 + will read vec 1. In the next inner loop, thread 0 will read vec 2, + while thread 1 will read vec 3, and so on. +- You may still be a little confused about the overall flow. Don't + worry, please keep reading the next "QK" section. It will illustrate + the query and key calculation flow in a clearer and higher-level + manner. + +QK +--- + +- As shown the pseudo code below, before the entire for loop block, we + fetch the query data for one token and store it in ``q_vecs``. Then, + in the outer for loop, we iterate through different ``k_ptrs`` that + point to different tokens and prepare the ``k_vecs`` in the inner for + loop. Finally, we perform the dot multiplication between the + ``q_vecs`` and each ``k_vecs``. + + .. code:: cpp + + q_vecs = ... + for ... 
{ + k_ptr = ... + for ... { + k_vecs[i] = ... + } + ... + float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs); + } + +- As mentioned before, for each thread, it only fetches part of the + query and key token data at a time. However, a cross thread group + reduction will happen in the ``Qk_dot<>::dot`` . So ``qk`` + returned here is not just between part of the query and key token dot + multiplication, but actually a full result between entire query and + key token data. +- For example, if the value of ``HEAD_SIZE`` is 128 and + ``THREAD_GROUP_SIZE`` is 2, each thread's ``k_vecs`` will contain + total 64 elements. However, the returned ``qk`` is actually the + result of dot multiplication between 128 query elements and 128 key + elements. If you want to learn more about the details of the dot + multiplication and reduction, you may refer to the implementation of + ``Qk_dot<>::dot``. However, for the sake of simplicity, I will not + cover it in this document. + +Softmax +------- + +- Next, we need to calculate the normalized softmax for all ``qk``\ s, + as shown below, where each :math:`x` represents a ``qk``. To do this, + we must obtain the reduced value of ``qk_max``\ (:math:`m(x)`) and + the ``exp_sum``\ (:math:`\ell(x)`) of all ``qk``\ s. The reduction + should be performed across the entire thread block, encompassing + results between the query token and all context key tokens. + + .. math:: + :nowrap: + + \begin{gather*} + m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\ + \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)} + \end{gather*} + +``qk_max`` and ``logits`` +~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Just right after we get the ``qk`` result, we can set the temporary + ``logits`` result with ``qk`` (In the end, the ``logits`` should + store the normalized softmax result).
Also we can compare and collect + the ``qk_max`` for all ``qk``\ s that are calculated by current + thread group. + + .. code:: cpp + + if (thread_group_offset == 0) { + const bool mask = token_idx >= context_len; + logits[token_idx - start_token_idx] = mask ? 0.f : qk; + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + } + +- Please note that the ``logits`` here is on shared memory, so each + thread group will set the fields for its own assigned context tokens. + Overall, the size of logits should be the number of context tokens. + + .. code:: cpp + + for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) { + qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); + } + + if (lane == 0) { + red_smem[warp_idx] = qk_max; + } + +- Then we need to get the reduced ``qk_max`` across each warp. The main + idea is to make the threads in a warp communicate with each other and + get the final max ``qk`` . + + .. code:: cpp + + for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); + } + qk_max = VLLM_SHFL_SYNC(qk_max, 0); + +- Finally, we can get the reduced ``qk_max`` from whole thread block by + comparing the ``qk_max`` from all warps in this thread block. Then we + need to broadcast the final result to each thread. + +``exp_sum`` +~~~~~~~~~~~ + +- Similar to ``qk_max``, we need to get the reduced sum value from the + entire thread block too. + + .. code:: cpp + + for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { + float val = __expf(logits[i] - qk_max); + logits[i] = val; + exp_sum += val; + } + ... + exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], exp_sum); + +- Firstly, sum all exp values from each thread group, and meanwhile, + convert each entry of ``logits`` from ``qk`` to ``exp(qk - qk_max)``. + Please note, the ``qk_max`` here is already the max ``qk`` across the + whole thread block. And then we can do reduction for ``exp_sum`` + across whole thread block just like the ``qk_max``. + + ..
code:: cpp + + const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f); + for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { + logits[i] *= inv_sum; + } + +- Finally, with the reduced ``qk_max`` and ``exp_sum``, we can obtain + the final normalized softmax result as ``logits``. This ``logits`` + variable will be used for dot multiplication with the value data in + later steps. Now, it should store the normalized softmax result of + ``qk`` for all assigned context tokens. + +Value +----- + +.. figure:: ../../assets/kernel/value.png + :alt: value + :width: 70% + :align: center + + Value data of all context tokens at one head + +.. figure:: ../../assets/kernel/logits_vec.png + :alt: logits_vec + :width: 50% + :align: center + + ``logits_vec`` for one thread + +.. figure:: ../../assets/kernel/v_vec.png + :alt: v_vec + :width: 70% + :align: center + + List of ``v_vec`` for one thread + +- Now we need to retrieve the value data and perform dot multiplication + with ``logits``. Unlike query and key, there is no thread group + concept for value data. As shown in the diagram, different from key token + memory layout, elements from the same column correspond to the same + value token. For one block of value data, there are ``HEAD_SIZE`` of + rows and ``BLOCK_SIZE`` of columns that are split into multiple + ``v_vecs``. +- Each thread always fetches ``V_VEC_SIZE`` elements from the same + ``V_VEC_SIZE`` of tokens at a time. As a result, a single thread + retrieves multiple ``v_vec``\ s from different rows and the same + columns through multiple inner iterations. For each ``v_vec``, it + needs to be dot multiplied with the corresponding ``logits_vec``, + which is also ``V_VEC_SIZE`` elements from ``logits``. Overall, with + multiple inner iterations, each warp will process one block of value + tokens. And with multiple outer iterations, the whole context value + tokens are processed. + + .. code:: cpp + + float accs[NUM_ROWS_PER_THREAD]; + for ...
{ // Iteration over different blocks. + logits_vec = ... + for ... { // Iteration over different rows. + v_vec = ... + ... + accs[i] += dot(logits_vec, v_vec); + } + } + +- As shown in the above pseudo code, in the outer loop, similar to + ``k_ptr``, ``logits_vec`` iterates over different blocks and reads + ``V_VEC_SIZE`` elements from ``logits``. In the inner loop, each + thread reads ``V_VEC_SIZE`` elements from the same tokens as a + ``v_vec`` and performs dot multiplication. It is important to note + that in each inner iteration, the thread fetches different head + position elements for the same tokens. The dot result is then + accumulated in ``accs``. Therefore, each entry of ``accs`` is mapped + to a head position assigned to the current thread. +- For example, if ``BLOCK_SIZE`` is 16 and ``V_VEC_SIZE`` is 8, each + thread fetches 8 value elements for 8 tokens at a time. Each element + is from different tokens at the same head position. If ``HEAD_SIZE`` + is 128 and ``WARP_SIZE`` is 32, for each inner loop, a warp needs to + fetch ``WARP_SIZE * V_VEC_SIZE = 256`` elements. This means there are + a total of 128 \* 16 / 256 = 8 inner iterations for a warp to handle + a whole block of value tokens. And each ``accs`` in each thread + contains 8 elements that are accumulated at 8 different head positions. + For thread 0, the ``accs`` variable will have 8 elements, which + are the 0th, 32nd, …, 224th elements of a value head that are accumulated + from all assigned 8 tokens. + +LV +--- +- Now, we need to perform reduction for ``accs`` within each warp. This + process allows each thread to accumulate the ``accs`` for the + assigned head positions of all tokens in one block. + + ..
code:: cpp + + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + float acc = accs[i]; + for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) { + acc += VLLM_SHFL_XOR_SYNC(acc, mask); + } + accs[i] = acc; + } + +- Next, we perform reduction for ``accs`` across all warps, allowing + each thread to have the accumulation of ``accs`` for the assigned + head positions of all context tokens. Please note that each ``accs`` + in every thread only stores the accumulation for a portion of + elements of the entire head for all context tokens. However, overall, + all results for output have been calculated but are just stored in + different thread register memory. + + .. code:: cpp + + float* out_smem = reinterpret_cast<float*>(shared_mem); + for (int i = NUM_WARPS; i > 1; i /= 2) { + // Upper warps write to shared memory. + ... + float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE]; + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + ... + dst[row_idx] = accs[i]; + } + + // Lower warps update the output. + const float* src = &out_smem[warp_idx * HEAD_SIZE]; + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + ... + accs[i] += src[row_idx]; + } + + // Write out the accs. + } + +Output +------ + +- Now we can write all of the calculated results from local register memory + to final output global memory. + + .. code:: cpp + + scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + + head_idx * max_num_partitions * HEAD_SIZE + + partition_idx * HEAD_SIZE; + +- First, we need to define the ``out_ptr`` variable, which points to + the start address of the assigned sequence and assigned head. + + ..
code:: cpp + + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; + if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { + from_float(*(out_ptr + row_idx), accs[i]); + } + } + +- Finally, we need to iterate over different assigned head positions + and write out the corresponding accumulated result based on the + ``out_ptr``. diff --git a/vllm-v0.6.2/docs/source/design/multimodal/adding_multimodal_plugin.rst b/vllm-v0.6.2/docs/source/design/multimodal/adding_multimodal_plugin.rst new file mode 100644 index 0000000..b726138 --- /dev/null +++ b/vllm-v0.6.2/docs/source/design/multimodal/adding_multimodal_plugin.rst @@ -0,0 +1,17 @@ +.. _adding_multimodal_plugin: + +Adding a Multimodal Plugin +========================== + +This document teaches you how to add a new modality to vLLM. + +Each modality in vLLM is represented by a :class:`~vllm.multimodal.MultiModalPlugin` and registered to :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`. +For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to :meth:`~vllm.multimodal.MultiModalRegistry.register_plugin`. + +The remainder of this document details how to define custom :class:`~vllm.multimodal.MultiModalPlugin` s. + +.. note:: + This article is a work in progress. + +.. + TODO: Add more instructions on how to add new plugins once embeddings is in. diff --git a/vllm-v0.6.2/docs/source/design/multimodal/multimodal_index.rst b/vllm-v0.6.2/docs/source/design/multimodal/multimodal_index.rst new file mode 100644 index 0000000..30f543a --- /dev/null +++ b/vllm-v0.6.2/docs/source/design/multimodal/multimodal_index.rst @@ -0,0 +1,69 @@ +.. _multi_modality: + +Multi-Modality +============== + +.. currentmodule:: vllm.multimodal + +vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package. 
+ +Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models ` +via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptType`. + +Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities +by following :ref:`this guide `. + +Looking to add your own multi-modal model? Please follow the instructions listed :ref:`here `. + +.. + TODO: Add usage of --limit-mm-per-prompt when multi-image input is officially supported + +Guides +++++++ + +.. toctree:: + :maxdepth: 1 + + adding_multimodal_plugin + +Module Contents ++++++++++++++++ + +.. automodule:: vllm.multimodal + +Registry +-------- + +.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY + +.. autoclass:: vllm.multimodal.MultiModalRegistry + :members: + :show-inheritance: + +Base Classes +------------ + +.. autodata:: vllm.multimodal.NestedTensors + +.. autodata:: vllm.multimodal.BatchedTensorInputs + +.. autoclass:: vllm.multimodal.MultiModalDataBuiltins + :members: + :show-inheritance: + +.. autodata:: vllm.multimodal.MultiModalDataDict + +.. autoclass:: vllm.multimodal.MultiModalKwargs + :members: + :show-inheritance: + +.. autoclass:: vllm.multimodal.MultiModalPlugin + :members: + :show-inheritance: + +Image Classes +------------- + +.. automodule:: vllm.multimodal.image + :members: + :show-inheritance: diff --git a/vllm-v0.6.2/docs/source/dev/engine/async_llm_engine.rst b/vllm-v0.6.2/docs/source/dev/engine/async_llm_engine.rst new file mode 100644 index 0000000..93fc310 --- /dev/null +++ b/vllm-v0.6.2/docs/source/dev/engine/async_llm_engine.rst @@ -0,0 +1,6 @@ +AsyncLLMEngine +================================= + +.. 
autoclass:: vllm.AsyncLLMEngine + :members: + :show-inheritance: diff --git a/vllm-v0.6.2/docs/source/dev/engine/engine_index.rst b/vllm-v0.6.2/docs/source/dev/engine/engine_index.rst new file mode 100644 index 0000000..ba9ae55 --- /dev/null +++ b/vllm-v0.6.2/docs/source/dev/engine/engine_index.rst @@ -0,0 +1,13 @@ +vLLM Engine +================================= + +.. automodule:: vllm.engine +.. currentmodule:: vllm.engine + +.. toctree:: + :maxdepth: 2 + :caption: Engines + + llm_engine + async_llm_engine + diff --git a/vllm-v0.6.2/docs/source/dev/engine/llm_engine.rst b/vllm-v0.6.2/docs/source/dev/engine/llm_engine.rst new file mode 100644 index 0000000..0b8c1e2 --- /dev/null +++ b/vllm-v0.6.2/docs/source/dev/engine/llm_engine.rst @@ -0,0 +1,6 @@ +LLMEngine +================================= + +.. autoclass:: vllm.LLMEngine + :members: + :show-inheritance: diff --git a/vllm-v0.6.2/docs/source/dev/offline_inference/llm.rst b/vllm-v0.6.2/docs/source/dev/offline_inference/llm.rst new file mode 100644 index 0000000..83ba1b6 --- /dev/null +++ b/vllm-v0.6.2/docs/source/dev/offline_inference/llm.rst @@ -0,0 +1,6 @@ +LLM Class +========= + +.. autoclass:: vllm.LLM + :members: + :show-inheritance: diff --git a/vllm-v0.6.2/docs/source/dev/offline_inference/llm_inputs.rst b/vllm-v0.6.2/docs/source/dev/offline_inference/llm_inputs.rst new file mode 100644 index 0000000..0d47281 --- /dev/null +++ b/vllm-v0.6.2/docs/source/dev/offline_inference/llm_inputs.rst @@ -0,0 +1,14 @@ +LLM Inputs +========== + +.. autodata:: vllm.inputs.PromptType + +.. autoclass:: vllm.inputs.TextPrompt + :show-inheritance: + :members: + :member-order: bysource + +.. 
autoclass:: vllm.inputs.TokensPrompt + :show-inheritance: + :members: + :member-order: bysource diff --git a/vllm-v0.6.2/docs/source/dev/offline_inference/offline_index.rst b/vllm-v0.6.2/docs/source/dev/offline_inference/offline_index.rst new file mode 100644 index 0000000..27dfb0e --- /dev/null +++ b/vllm-v0.6.2/docs/source/dev/offline_inference/offline_index.rst @@ -0,0 +1,8 @@ +Offline Inference +================================= + +.. toctree:: + :maxdepth: 1 + + llm + llm_inputs diff --git a/vllm-v0.6.2/docs/source/dev/pooling_params.rst b/vllm-v0.6.2/docs/source/dev/pooling_params.rst new file mode 100644 index 0000000..334e028 --- /dev/null +++ b/vllm-v0.6.2/docs/source/dev/pooling_params.rst @@ -0,0 +1,5 @@ +Pooling Parameters +================== + +.. autoclass:: vllm.PoolingParams + :members: diff --git a/vllm-v0.6.2/docs/source/dev/sampling_params.rst b/vllm-v0.6.2/docs/source/dev/sampling_params.rst new file mode 100644 index 0000000..f645941 --- /dev/null +++ b/vllm-v0.6.2/docs/source/dev/sampling_params.rst @@ -0,0 +1,5 @@ +Sampling Parameters +=================== + +.. 
import re
from pathlib import Path


def fix_case(text: str) -> str:
    """Restore the canonical capitalisation of known acronyms and names.

    Each known term is matched case-insensitively, but only where it is not
    embedded in a longer run of letters. This keeps ``"Rapid"`` from turning
    into ``"RAPId"`` while still rewriting ``"Api"`` to ``"API"``.

    Args:
        text: Title text, typically already passed through ``str.title()``.

    Returns:
        ``text`` with every standalone known term replaced by its canonical
        spelling.
    """
    subs = [
        ("api", "API"),
        ("llm", "LLM"),
        ("vllm", "vLLM"),
        ("openai", "OpenAI"),
        ("multilora", "MultiLoRA"),
    ]
    for pattern, replacement in subs:
        # Letter-only lookarounds (rather than ``\b``) so that digits and
        # underscores next to a term still count as a boundary.
        standalone = rf"(?<![A-Za-z]){re.escape(pattern)}(?![A-Za-z])"
        text = re.sub(standalone, replacement, text, flags=re.IGNORECASE)
    return text


def underline(title: str, character: str = "=") -> str:
    """Return ``title`` followed by an RST underline of the same length.

    Args:
        title: Heading text.
        character: Character repeated to build the underline.

    Returns:
        The heading and its underline separated by a newline.
    """
    return f"{title}\n{character * len(title)}"


def generate_title(filename: str) -> str:
    """Build an underlined RST heading from a file stem.

    Args:
        filename: File stem such as ``"openai_api_client"``.

    Returns:
        Title-cased heading with known acronyms fixed, plus its underline.
    """
    # Turn filename into a title
    title = filename.replace("_", " ").title()
    # Handle acronyms and names
    title = fix_case(title)
    # Underline title
    return underline(title)


def generate_examples():
    """Write one ``.rst`` page per example script and the examples index.

    For every ``*.py`` file in ``<root>/examples`` this writes
    ``<root>/docs/source/getting_started/examples/<stem>.rst`` containing a
    title, a link to the script on GitHub and a ``literalinclude`` of it.
    It then reads ``examples_index.template.rst`` from the same directory
    and writes ``examples_index.rst`` with ``%EXAMPLE_DOCS%`` replaced by
    the list of example stems.
    """
    root_dir = Path(__file__).parent.parent.parent.resolve()

    # Source paths
    script_dir = root_dir / "examples"
    script_paths = sorted(script_dir.glob("*.py"))

    # Destination paths
    doc_dir = root_dir / "docs/source/getting_started/examples"

    # Generate the example docs for each example script
    for script_path in script_paths:
        doc_path = doc_dir / f"{script_path.stem}.rst"
        script_url = f"https://github.com/vllm-project/vllm/blob/main/examples/{script_path.name}"
        # Make script_path relative to doc_path and call it include_path
        include_path = '../../../..' / script_path.relative_to(root_dir)
        content = (f"{generate_title(doc_path.stem)}\n\n"
                   f"Source {script_url}.\n\n"
                   f".. literalinclude:: {include_path}\n"
                   "   :language: python\n"
                   "   :linenos:\n")
        # Explicit encoding so the output does not depend on the locale.
        with open(doc_path, "w", encoding="utf-8") as f:
            f.write(content)

    # Generate the toctree for the example scripts
    with open(doc_dir / "examples_index.template.rst",
              encoding="utf-8") as f:
        examples_index = f.read()
    with open(doc_dir / "examples_index.rst", "w", encoding="utf-8") as f:
        example_docs = "\n   ".join(path.stem for path in script_paths)
        f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs))
+It provides flexibility to customize the build of docker image using the following arguments: + +* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. +* `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For `Radeon RX 7900 series (gfx1100) `_, this should be set to 0 before flash-attention supports this target. +* `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942` +* `FA_BRANCH`: specifies the branch used to build the CK flash-attention in `ROCm's flash-attention repo `_. The default is `ae7928c` +* `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1. + +Their values can be passed in when running ``docker build`` with ``--build-arg`` options. + + +To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default: + +.. code-block:: console + + $ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . + +To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below: + +.. code-block:: console + + $ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm . + +To run the above docker image ``vllm-rocm``, use the below command: + +.. code-block:: console + + $ docker run -it \ + --network=host \ + --group-add=video \ + --ipc=host \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --device /dev/kfd \ + --device /dev/dri \ + -v :/app/model \ + vllm-rocm \ + bash + +Where the `` is the location where the model is stored, for example, the weights for llama2 or llama3 models. + + +.. _build_from_source_rocm: + +Option 2: Build from source +--------------------------- + +0. 
Install prerequisites (skip if you are already in an environment/docker with the following installed): + +- `ROCm `_ +- `PyTorch `_ + +For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`. + +Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch `Getting Started `_ + + +1. Install `Triton flash attention for ROCm `_ + +Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from `ROCm/triton `_ + + .. code-block:: console + + $ python3 -m pip install ninja cmake wheel pybind11 + $ pip uninstall -y triton + $ git clone https://github.com/OpenAI/triton.git + $ cd triton + $ git checkout e192dba + $ cd python + $ pip3 install . + $ cd ../.. + +.. note:: + - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. + + +2. Optionally, if you choose to use CK flash attention, you can install `flash attention for ROCm `_ + + +Install ROCm's flash attention (v2.5.9.post1) following the instructions from `ROCm/flash-attention `_ +Alternatively, wheels intended for vLLM use can be accessed under the releases. + +For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. +Note to get your gfx architecture, run `rocminfo |grep gfx`. + + .. code-block:: console + + $ git clone https://github.com/ROCm/flash-attention.git + $ cd flash-attention + $ git checkout 3cea2fb + $ git submodule update --init + $ GPU_ARCHS="gfx90a" python3 setup.py install + $ cd .. + +.. note:: + - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) + +3. Build vLLM. + + For example, vLLM on ROCM 6.2 can be built with the following steps: + + .. 
code-block:: console + + $ pip install --upgrade pip + + $ # Install PyTorch + $ pip uninstall torch -y + $ pip install --no-cache-dir --pre torch==2.6.0.dev20240918 --index-url https://download.pytorch.org/whl/nightly/rocm6.2 + + $ # Build & install AMD SMI + $ pip install /opt/rocm/share/amd_smi + + $ # Install dependencies + $ pip install --upgrade numba scipy huggingface-hub[cli] + $ pip install "numpy<2" + $ pip install -r requirements-rocm.txt + + $ # Build vLLM for MI210/MI250/MI300. + $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" + $ python3 setup.py develop + + + This may take 5-10 minutes. Currently, :code:`pip install .` does not work for ROCm installation. + + +.. tip:: + + - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. + - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. + - To use CK flash-attention or PyTorch naive attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention. + - The ROCm version of PyTorch, ideally, should match the ROCm driver version. + + +.. tip:: + - For MI300x (gfx942) users, to achieve optimal performance, please refer to `MI300x tuning guide `_ for performance optimization and tuning tips on system and workflow level. + For vLLM, please refer to `vLLM performance optimization `_. + + diff --git a/vllm-v0.6.2/docs/source/getting_started/cpu-installation.rst b/vllm-v0.6.2/docs/source/getting_started/cpu-installation.rst new file mode 100644 index 0000000..69530fd --- /dev/null +++ b/vllm-v0.6.2/docs/source/getting_started/cpu-installation.rst @@ -0,0 +1,164 @@ +.. _installation_cpu: + +Installation with CPU +======================== + +vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. 
vLLM CPU backend supports the following vLLM features: + +- Tensor Parallel (``-tp = N``) +- Quantization (``INT8 W8A8, AWQ``) + +.. note:: + More advanced features on `chunked-prefill`, `prefix-caching` and `FP8 KV cache` are under development and will be available soon. + +Table of contents: + +#. :ref:`Requirements <cpu_backend_requirements>` +#. :ref:`Quick start using Dockerfile <cpu_backend_quick_start_dockerfile>` +#. :ref:`Build from source <build_cpu_backend_from_source>` +#. :ref:`Related runtime environment variables <env_intro>` +#. :ref:`Intel Extension for PyTorch <ipex_guidance>` +#. :ref:`Performance tips <cpu_backend_performance_tips>` + +.. _cpu_backend_requirements: + +Requirements +------------ + +* OS: Linux +* Compiler: gcc/g++>=12.3.0 (optional, recommended) +* Instruction set architecture (ISA) requirement: AVX512 (optional, recommended) + +.. _cpu_backend_quick_start_dockerfile: + +Quick start using Dockerfile +---------------------------- + +.. code-block:: console + + $ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . + $ docker run -it \ + --rm \ + --network=host \ + --cpuset-cpus=<cpu-id-list, optional> \ + --cpuset-mems=<memory-node, optional> \ + vllm-cpu-env + +.. _build_cpu_backend_from_source: + +Build from source +----------------- + +- First, install the recommended compiler. We recommend using ``gcc/g++ >= 12.3.0`` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run: + +.. code-block:: console + + $ sudo apt-get update -y + $ sudo apt-get install -y gcc-12 g++-12 libnuma-dev + $ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 + +- Second, install Python packages for vLLM CPU backend building: + +.. code-block:: console + + $ pip install --upgrade pip + $ pip install "cmake>=3.26" wheel packaging ninja "setuptools-scm>=8" numpy + $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu + +- Finally, build and install the vLLM CPU backend: + +.. code-block:: console + + $ VLLM_TARGET_DEVICE=cpu python setup.py install + +..
note:: + - AVX512_BF16 is an ISA extension that provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. + + - If you want to force-enable AVX512_BF16 for cross-compilation, please set the environment variable VLLM_CPU_AVX512BF16=1 before building. + +.. _env_intro: + +Related runtime environment variables +------------------------------------- + +- ``VLLM_CPU_KVCACHE_SPACE``: specify the KV Cache size (e.g., ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache); a larger setting will allow vLLM to run more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. + +- ``VLLM_CPU_OMP_THREADS_BIND``: specify the CPU cores dedicated to the OpenMP threads. For example, ``VLLM_CPU_OMP_THREADS_BIND=0-31`` means there will be 32 OpenMP threads bound on 0-31 CPU cores. ``VLLM_CPU_OMP_THREADS_BIND=0-31|32-63`` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. + +.. _ipex_guidance: + +Intel Extension for PyTorch +--------------------------- + +- `Intel Extension for PyTorch (IPEX) `_ extends PyTorch with up-to-date feature optimizations for an extra performance boost on Intel hardware. + +.. _cpu_backend_performance_tips: + +Performance tips +----------------- + +- We highly recommend using TCMalloc for high-performance memory allocation and better cache locality. For example, on Ubuntu 22.04, you can run: + +..
code-block:: console + + $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library + $ find / -name *libtcmalloc* # find the dynamic link library path + $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD + $ python examples/offline_inference.py # run vLLM + +- When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: + +.. code-block:: console + + $ export VLLM_CPU_KVCACHE_SPACE=40 + $ export VLLM_CPU_OMP_THREADS_BIND=0-29 + $ vllm serve facebook/opt-125m + +- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using ``VLLM_CPU_OMP_THREADS_BIND``. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: + +.. code-block:: console + + $ lscpu -e # check the mapping between logical CPU cores and physical CPU cores + + # The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core. 
+ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ + 0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 + 1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 + 2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 + 3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 + 4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 + 5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 + 6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 + 7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 + 8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 + 9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 + 10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 + 11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 + 12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 + 13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 + 14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 + 15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 + + # On this platform, it is recommended to only bind OpenMP threads on logical CPU cores 0-7 or 8-15 + $ export VLLM_CPU_OMP_THREADS_BIND=0-7 + $ python examples/offline_inference.py + +- If using vLLM CPU backend on a multi-socket machine with NUMA, be sure to set CPU cores using ``VLLM_CPU_OMP_THREADS_BIND`` to avoid cross NUMA node memory access. + +CPU Backend Considerations +-------------------------- + +- The CPU backend significantly differs from the GPU backend since the vLLM architecture was originally optimized for GPU use. A number of optimizations are needed to enhance its performance. + +- Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance.
+ +- On a CPU-based setup with NUMA enabled, the memory access performance may be largely impacted by the `topology `_. For NUMA architecture, two optimizations are recommended: Tensor Parallel or Data Parallel. + + * Using Tensor Parallel for a latency-constrained deployment: following GPU backend design, Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With `TP feature on CPU `_ merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving: + + .. code-block:: console + + $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp + + + * Using Data Parallel for maximum throughput: launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like `Nginx <../serving/deploying_with_nginx.html>`_ or HAProxy are recommended. Anyscale Ray project provides the feature on LLM `serving `_. Here is the example to set up a scalable LLM serving with `Ray Serve `_. \ No newline at end of file diff --git a/vllm-v0.6.2/docs/source/getting_started/debugging.rst b/vllm-v0.6.2/docs/source/getting_started/debugging.rst new file mode 100644 index 0000000..77bf550 --- /dev/null +++ b/vllm-v0.6.2/docs/source/getting_started/debugging.rst @@ -0,0 +1,142 @@ +.. _debugging: + +=============== +Debugging Tips +=============== + +This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please `search existing issues `_ first to see if it has already been reported. If not, please `file a new issue `_, providing as much relevant information as possible. + +..
note:: + + Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. + +Hangs downloading a model +---------------------------------------- +If the model isn't already downloaded to disk, vLLM will download it from the internet which can take time and depend on your internet connection. +It's recommended to download the model first using the `huggingface-cli `_ and passing the local path to the model to vLLM. This way, you can isolate the issue. + +Hangs loading a model from disk +---------------------------------------- +If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. +It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. + +.. note:: + + To isolate the model downloading and loading issue, you can use the ``--load-format dummy`` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. + +Model is too large +---------------------------------------- +If the model is too large to fit in a single GPU, you might want to `consider tensor parallelism `_ to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `this example `_ . 
The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. + +Enable more logging +---------------------------------------- +If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. You can use the following environment variables to help debug the issue: + +- ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging. +- ``export CUDA_LAUNCH_BLOCKING=1`` to identify which CUDA kernel is causing the problem. +- ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL. +- ``export VLLM_TRACE_FUNCTION=1`` to record all function calls for inspection in the log files to tell which function crashes or hangs. + +Incorrect network setup +---------------------------------------- +The vLLM instance cannot get the correct IP address if you have a complicated network config. You can find a log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl`` and the IP address should be the correct one. +If it's not, override the IP address using the environment variable ``export VLLM_HOST_IP=``. + +You might also need to set ``export NCCL_SOCKET_IFNAME=`` and ``export GLOO_SOCKET_IFNAME=`` to specify the network interface for the IP address. + +Error near ``self.graph.replay()`` +---------------------------------------- +If vLLM crashes and the error trace captures it somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a CUDA error inside CUDAGraph. +To identify the particular CUDA operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. 
+ +Incorrect hardware/driver +---------------------------------------- +If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. + +.. code-block:: python + + # Test PyTorch NCCL + import torch + import torch.distributed as dist + dist.init_process_group(backend="nccl") + local_rank = dist.get_rank() % torch.cuda.device_count() + torch.cuda.set_device(local_rank) + data = torch.FloatTensor([1,] * 128).to("cuda") + dist.all_reduce(data, op=dist.ReduceOp.SUM) + torch.cuda.synchronize() + value = data.mean().item() + world_size = dist.get_world_size() + assert value == world_size, f"Expected {world_size}, got {value}" + + print("PyTorch NCCL is successful!") + + # Test PyTorch GLOO + gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo") + cpu_data = torch.FloatTensor([1,] * 128) + dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group) + value = cpu_data.mean().item() + assert value == world_size, f"Expected {world_size}, got {value}" + + print("PyTorch GLOO is successful!") + + if world_size <= 1: + exit() + + # Test vLLM NCCL, with cuda graph + from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator + + pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank) + pynccl.disabled = False + + s = torch.cuda.Stream() + with torch.cuda.stream(s): + data.fill_(1) + pynccl.all_reduce(data, stream=s) + value = data.mean().item() + assert value == world_size, f"Expected {world_size}, got {value}" + + print("vLLM NCCL is successful!") + + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(cuda_graph=g, stream=s): + pynccl.all_reduce(data, stream=torch.cuda.current_stream()) + + data.fill_(1) + g.replay() + torch.cuda.current_stream().synchronize() + value = data.mean().item() + assert value == world_size, f"Expected {world_size}, got {value}" + + print("vLLM NCCL with cuda graph is 
successful!") + + dist.destroy_process_group(gloo_group) + dist.destroy_process_group() + +If you are testing with a single node, adjust ``--nproc-per-node`` to the number of GPUs you want to use: + +.. code-block:: console + + $ NCCL_DEBUG=TRACE torchrun --nproc-per-node=<number-of-GPUs> test.py + +If you are testing with multiple nodes, adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup and set ``MASTER_ADDR`` to the correct IP address of the master node, reachable from all nodes. Then, run: + +.. code-block:: console + + $ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py + +If the script runs successfully, you should see an ``... is successful!`` message for each test, ending with ``vLLM NCCL with cuda graph is successful!`` when more than one process is used. + +If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as ``export NCCL_P2P_DISABLE=1`` to see if it helps. Please check `their documentation `__ for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully. + +.. note:: + + A multi-node environment is more complicated than a single-node one. If you see errors such as ``torch.distributed.DistNetworkError``, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: + + - In the first node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py``. + - In the second node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py``.
+ + Adjust ``--nproc-per-node``, ``--nnodes``, and ``--node-rank`` according to your setup, being sure to execute different commands (with different ``--node-rank``) on different nodes. + +Known Issues +---------------------------------------- +- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq `_ , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of ``vllm`` to include the `fix `_. diff --git a/vllm-v0.6.2/docs/source/getting_started/examples/examples_index.template.rst b/vllm-v0.6.2/docs/source/getting_started/examples/examples_index.template.rst new file mode 100644 index 0000000..1b34ccc --- /dev/null +++ b/vllm-v0.6.2/docs/source/getting_started/examples/examples_index.template.rst @@ -0,0 +1,8 @@ +Examples +================================= + +.. toctree:: + :maxdepth: 1 + :caption: Scripts + + %EXAMPLE_DOCS% diff --git a/vllm-v0.6.2/docs/source/getting_started/gaudi-installation.rst b/vllm-v0.6.2/docs/source/getting_started/gaudi-installation.rst new file mode 100644 index 0000000..68c1a56 --- /dev/null +++ b/vllm-v0.6.2/docs/source/getting_started/gaudi-installation.rst @@ -0,0 +1,402 @@ +Installation with Intel® Gaudi® AI Accelerators +=============================================== + +This README provides instructions on running vLLM with Intel Gaudi devices. + +Requirements and Installation +============================= + +Please follow the instructions provided in the `Gaudi Installation +Guide `__ +to set up the execution environment. To achieve the best performance, +please follow the methods outlined in the `Optimizing Training Platform +Guide `__. + +Requirements +------------ + +- OS: Ubuntu 22.04 LTS +- Python: 3.10 +- Intel Gaudi accelerator +- Intel Gaudi software version 1.18.0 + + +Quick start using Dockerfile +---------------------------- +.. code:: console + + $ docker build -f Dockerfile.hpu -t vllm-hpu-env . 
+ $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env + + +.. tip:: + If you're observing the following error: ``docker: Error response from daemon: Unknown runtime specified habana.``, please refer to "Install Using Containers" section of `Intel Gaudi Software Stack and Driver Installation `__. Make sure you have ``habana-container-runtime`` package installed and that ``habana`` container runtime is registered. + + +Build from source +----------------- + +Environment verification +~~~~~~~~~~~~~~~~~~~~~~~~ + +To verify that the Intel Gaudi software was correctly installed, run: + +.. code:: console + + $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible + $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed + $ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed + $ pip list | grep neural # verify that neural_compressor is installed + +Refer to `Intel Gaudi Software Stack +Verification `__ +for more details. + +Run Docker Image +~~~~~~~~~~~~~~~~ + +It is highly recommended to use the latest Docker image from Intel Gaudi +vault. Refer to the `Intel Gaudi +documentation `__ +for more details. + +Use the following commands to run a Docker image: + +.. code:: console + + $ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + +Build and Install vLLM +~~~~~~~~~~~~~~~~~~~~~~ + +To build and install vLLM from source, run: + +.. 
code:: console + + $ git clone https://github.com/vllm-project/vllm.git + $ cd vllm + $ python setup.py develop + + +Currently, the latest features and performance optimizations are developed in Gaudi's `vLLM-fork `__ and we periodically upstream them to the vLLM main repo. To install the latest `HabanaAI/vLLM-fork `__, run the following: + +.. code:: console + + $ git clone https://github.com/HabanaAI/vllm-fork.git + $ cd vllm-fork + $ git checkout habana_main + $ python setup.py develop + + +Supported Features +================== + +- `Offline batched + inference `__ +- Online inference via `OpenAI-Compatible + Server `__ +- HPU autodetection - no need to manually select device within vLLM +- Paged KV cache with algorithms enabled for Intel Gaudi accelerators +- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, + prefill attention, Root Mean Square Layer Normalization, Rotary + Positional Encoding +- Tensor parallelism support for multi-card inference +- Inference with `HPU Graphs `__ + for accelerating low-batch latency and throughput +- Attention with Linear Biases (ALiBi) + +Unsupported Features +==================== + +- Beam search +- LoRA adapters +- Quantization +- Prefill chunking (mixed-batch inferencing) + +Supported Configurations +======================== + +The following configurations have been validated to function with +Gaudi2 devices. Configurations that are not listed may or may not work.
+ +- `meta-llama/Llama-2-7b `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Llama-2-7b-chat-hf `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3-8B `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3-8B-Instruct `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3.1-8B `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3.1-8B-Instruct `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Llama-2-70b `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Llama-2-70b-chat-hf `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3-70B `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3-70B-Instruct `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3.1-70B `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3.1-70B-Instruct `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling + +Performance Tuning +================== + +Execution modes +--------------- + +Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via ``PT_HPU_LAZY_MODE`` environment variable), and ``--enforce-eager`` flag. + +.. 
list-table:: vLLM execution modes + :widths: 25 25 50 + :header-rows: 1 + + * - ``PT_HPU_LAZY_MODE`` + - ``enforce_eager`` + - execution mode + * - 0 + - 0 + - torch.compile + * - 0 + - 1 + - PyTorch eager mode + * - 1 + - 0 + - HPU Graphs + * - 1 + - 1 + - PyTorch lazy mode + +.. warning:: + In 1.18.0, all modes utilizing ``PT_HPU_LAZY_MODE=0`` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. + + +Bucketing mechanism +------------------- + +Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. `Intel Gaudi Graph Compiler `__ is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. +In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - ``batch_size`` and ``sequence_length``. + +.. note:: + Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. + +Bucketing ranges are determined with 3 parameters - ``min``, ``step`` and ``max``. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: + +.. 
code-block:: + + INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] + INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] + INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] + INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + +``min`` determines the lowest value of the bucket. ``step`` determines the interval between buckets, and ``max`` determines the upper bound of the bucket. Furthermore, interval between ``min`` and ``step`` has special handling - ``min`` gets multiplied by consecutive powers of two, until ``step`` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. + +Example (with ramp-up) + +.. code-block:: + + min = 2, step = 32, max = 64 + => ramp_up = (2, 4, 8, 16) + => stable = (32, 64) + => buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) + +Example (without ramp-up) + +.. 
code-block:: + + min = 128, step = 128, max = 512 + => ramp_up = () + => stable = (128, 256, 384, 512) + => buckets = ramp_up + stable => (128, 256, 384, 512) + + +In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. + +.. warning:: + If a request exceeds the maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such a scenario. + +As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded and executed as ``(4, 512)`` prefill bucket, as ``batch_size`` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as ``(4, 512)`` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a ``(2, 512)`` bucket, or context length increases above 512 tokens, in which case it will become ``(4, 640)`` bucket. + +.. note:: + Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. + +Warmup +------ + +Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data.
The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup: + +.. code-block:: + + INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB + INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB + INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB + ... + INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB + INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB + INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB + INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB + ... + INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB + INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB + +This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. + +.. tip:: + Compiling all the buckets might take some time and can be turned off with ``VLLM_SKIP_WARMUP=true`` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. 
+ +HPU Graph capture +----------------- + +`HPU Graphs `__ are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. + + +When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by ``gpu_memory_utilization`` flag (``0.9`` by default). +Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage. +Only after that, ``gpu_memory_utilization`` flag is utilized - at its default value, it will mark 90% of free device memory at that point as usable. +Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. +Environment variable ``VLLM_GRAPH_RESERVED_MEM`` defines the ratio of memory reserved for HPU Graphs capture. +With its default value (``VLLM_GRAPH_RESERVED_MEM=0.1``), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache. +Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.3``), 30% of usable graph memory is reserved for prefill graphs, and 70% for decode graphs. +A lower value corresponds to less usable graph memory reserved for prefill stage, e.g. ``VLLM_GRAPH_PROMPT_RATIO=0.2`` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. + +..
note:: + ``gpu_memory_utilization`` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If a device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, ``gpu_memory_utilization`` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. + +User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented: +- ``max_bs`` - graph capture queue will be sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. ``(64, 128)``, ``(64, 256)``, ``(32, 128)``, ``(32, 256)``, ``(1, 128)``, ``(1, 256)``), default strategy for decode +- ``min_tokens`` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (``batch_size*sequence_length``), default strategy for prompt + +When there is a large number of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by ``max_bs`` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in ``min_tokens`` strategy. + + +.. note:: + ``VLLM_GRAPH_PROMPT_RATIO`` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). 
vLLM will first attempt to use up the entirety of usable prefill graph memory (usable graph memory * ``VLLM_GRAPH_PROMPT_RATIO``) for capturing prefill HPU Graphs, next it will attempt to do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding the reserved memory pool. The behavior of that mechanism can be observed in the example below. + + +Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): + +.. code-block:: + + INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] + INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] + INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] + INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 
2.95 GiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) + INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache + INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 + INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) + INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB + ... + INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB + INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) + INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB + ... + INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB + INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB + ... 
+ INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB + INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB + INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB + INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB + INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB + INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] + INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory + INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) + + +Recommended vLLM Parameters +--------------------------- + +- We recommend running inference on Gaudi 2 with ``block_size`` of 128 + for BF16 data type. 
Using default values (16, 32) might lead to + sub-optimal performance due to Matrix Multiplication Engine + under-utilization (see `Gaudi + Architecture `__). +- For max throughput on Llama 7B, we recommend running with batch size + of 128 or 256 and max context length of 2048 with HPU Graphs enabled. + If you encounter out-of-memory issues, see troubleshooting section. + +Environment variables +--------------------- + +**Diagnostic and profiling knobs:** + +- ``VLLM_PROFILER_ENABLED``: if ``true``, high level profiler will be enabled. Resulting JSON traces can be viewed in `perfetto.habana.ai `__. Disabled by default. +- ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION``: if ``true``, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside ``PT_HPU_METRICS_GC_DETAILS=1``. Disabled by default. +- ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL``: if ``true``, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default. +- ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS``: if ``true``, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default. +- ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL``: if ``true``, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default. 
+ +**Performance tuning knobs:** + +- ``VLLM_SKIP_WARMUP``: if ``true``, warmup will be skipped, ``false`` by default +- ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.1`` by default +- ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.3`` by default +- ``VLLM_GRAPH_PROMPT_STRATEGY``: strategy determining order of prompt graph capture, ``min_tokens`` or ``max_bs``, ``min_tokens`` by default +- ``VLLM_GRAPH_DECODE_STRATEGY``: strategy determining order of decode graph capture, ``min_tokens`` or ``max_bs``, ``max_bs`` by default +- ``VLLM_{phase}_{dim}_BUCKET_{param}`` - collection of 12 environment variables configuring ranges of bucketing mechanism + + - ``{phase}`` is either ``PROMPT`` or ``DECODE`` + - ``{dim}`` is either ``BS``, ``SEQ`` or ``BLOCK`` + - ``{param}`` is either ``MIN``, ``STEP`` or ``MAX`` + - Default values: + + - Prompt: + - batch size min (``VLLM_PROMPT_BS_BUCKET_MIN``): ``1`` + - batch size step (``VLLM_PROMPT_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)`` + - batch size max (``VLLM_PROMPT_BS_BUCKET_MAX``): ``min(max_num_seqs, 64)`` + - sequence length min (``VLLM_PROMPT_SEQ_BUCKET_MIN``): ``block_size`` + - sequence length step (``VLLM_PROMPT_SEQ_BUCKET_STEP``): ``block_size`` + - sequence length max (``VLLM_PROMPT_SEQ_BUCKET_MAX``): ``max_model_len`` + + - Decode: + - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``1`` + - batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)`` + - batch size max (``VLLM_DECODE_BS_BUCKET_MAX``): ``max_num_seqs`` + - sequence length min (``VLLM_DECODE_BLOCK_BUCKET_MIN``): ``block_size`` + - sequence length step (``VLLM_DECODE_BLOCK_BUCKET_STEP``): ``block_size`` + - sequence length max (``VLLM_DECODE_BLOCK_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)`` + + +Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: + +- ``PT_HPU_LAZY_MODE``: if ``0``, 
PyTorch Eager backend for Gaudi will be used, if ``1`` PyTorch Lazy backend for Gaudi will be used, ``1`` is default +- ``PT_HPU_ENABLE_LAZY_COLLECTIVES``: required to be ``true`` for tensor parallel inference with HPU Graphs + +Troubleshooting: Tweaking HPU Graphs +==================================== + +If you experience device out-of-memory issues or want to attempt +inference at higher batch sizes, try tweaking HPU Graphs by following +the below: + +- Tweak ``gpu_memory_utilization`` knob. It will decrease the + allocation of KV cache, leaving some headroom for capturing graphs + with larger batch size. By default ``gpu_memory_utilization`` is set + to 0.9. It attempts to allocate ~90% of HBM left for KV cache after + short profiling run. Note that decreasing reduces the number of KV + cache blocks you have available, and therefore reduces the effective + maximum number of tokens you can handle at a given time. + +- If this method is not efficient, you can disable ``HPUGraph`` + completely. With HPU Graphs disabled, you are trading latency and + throughput at lower batches for potentially higher throughput on + higher batches. You can do that by adding ``--enforce-eager`` flag to + server (for online inference), or by passing ``enforce_eager=True`` + argument to LLM constructor (for offline inference). diff --git a/vllm-v0.6.2/docs/source/getting_started/installation.rst b/vllm-v0.6.2/docs/source/getting_started/installation.rst new file mode 100644 index 0000000..f02626b --- /dev/null +++ b/vllm-v0.6.2/docs/source/getting_started/installation.rst @@ -0,0 +1,219 @@ +.. _installation: + +============ +Installation +============ + +vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. + +Requirements +============ + +* OS: Linux +* Python: 3.9 -- 3.12 +* GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) 
+ +Install released versions +========================= + +You can install vLLM using pip: + +.. code-block:: console + + $ # (Recommended) Create a new conda environment. + $ conda create -n myenv python=3.10 -y + $ conda activate myenv + + $ # Install vLLM with CUDA 12.1. + $ pip install vllm + +.. note:: + + Although we recommend using ``conda`` to create and manage Python environments, it is highly recommended to use ``pip`` to install vLLM. This is because ``pip`` can install ``torch`` with separate library packages like ``NCCL``, while ``conda`` installs ``torch`` with statically linked ``NCCL``. This can cause issues when vLLM tries to use ``NCCL``. See `this issue `_ for more details. + +.. note:: + + As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. + We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions: + + .. code-block:: console + + $ # Install vLLM with CUDA 11.8. + $ export VLLM_VERSION=0.6.1.post1 + $ export PYTHON_VERSION=310 + $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 + + In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. + + Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions. + + +.. 
_install-the-latest-code: + +Install the latest code +======================= + +LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since ``v0.5.3``. You can download and install it with the following command: + +.. code-block:: console + + $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl + +If you want to access the wheels for previous commits, you can specify the commit hash in the URL: + +.. code-block:: console + + $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch + $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl + +Note that the wheels are built with Python 3.8 ABI (see `PEP 425 `_ for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. + +Another way to access the latest code is to use the docker images: + +.. code-block:: console + + $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch + $ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${VLLM_COMMIT} + +These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. + +The latest code can contain bugs and may not be stable. 
Please use it with caution. + +.. _build_from_source: + +Build from source +================= + +.. _python-only-build: + +Python-only build (without compilation) +--------------------------------------- + +If you only need to change Python code, you can simply build vLLM without compilation. + +The first step is to install the latest vLLM wheel: + +.. code-block:: console + + pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl + +You can find more information about vLLM's wheels `above <#install-the-latest-code>`_. + +After verifying that the installation is successful, you can use `the following script `_: + +.. code-block:: console + + $ git clone https://github.com/vllm-project/vllm.git + $ cd vllm + $ python python_only_dev.py + +The script will: + +* Find the installed vLLM package in the current environment. +* Copy built files to the current directory. +* Rename the installed vLLM package. +* Symbolically link the current directory to the installed vLLM package. + +Now, you can edit the Python code in the current directory, and the changes will be reflected when you run vLLM. + +Once you have finished editing or want to install another vLLM wheel, you should exit the development environment using `the same script `_ with the ``--quit-dev`` (or ``-q`` for short) flag: + +.. code-block:: console + + $ python python_only_dev.py --quit-dev + +The ``--quit-dev`` flag will: + +* Remove the symbolic link from the current directory to the vLLM package. +* Restore the original vLLM package from the backup. + +If you update the vLLM wheel and rebuild from the source to make further edits, you will need to repeat the `Python-only build <#python-only-build>`_ steps again. + +.. note:: + + There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. 
+ It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to `the section above <#install-the-latest-code>`_ for instructions on how to install a specified wheel. + +Full build (with compilation) +----------------------------- + +If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: + +.. code-block:: console + + $ git clone https://github.com/vllm-project/vllm.git + $ cd vllm + $ pip install -e . + +.. tip:: + + Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. + For example, you can install `ccache `_ using ``conda install ccache`` or ``apt install ccache`` . + As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. + + +Use an existing PyTorch installation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.: + +* Building vLLM with PyTorch nightly or a custom PyTorch build. +* Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run ``pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124`` to `install PyTorch nightly `_, and then build vLLM on top of it. + +To build vLLM using an existing PyTorch installation: + +.. code-block:: console + + $ git clone https://github.com/vllm-project/vllm.git + $ cd vllm + $ python use_existing_torch.py + $ pip install -r requirements-build.txt + $ pip install -e . 
--no-build-isolation + + +Troubleshooting +~~~~~~~~~~~~~~~ + +To avoid your system being overloaded, you can limit the number of compilation jobs +to be run simultaneously, via the environment variable ``MAX_JOBS``. For example: + +.. code-block:: console + + $ export MAX_JOBS=6 + $ pip install -e . + +This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default `_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory. +A side effect is a much slower build process. + +Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. + +.. code-block:: console + + $ # Use `--ipc=host` to make sure the shared memory is large enough. + $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 + +If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from `the official website `_. After installation, set the environment variable ``CUDA_HOME`` to the installation path of CUDA Toolkit, and make sure that the ``nvcc`` compiler is in your ``PATH``, e.g.: + +.. code-block:: console + + $ export CUDA_HOME=/usr/local/cuda + $ export PATH="${CUDA_HOME}/bin:$PATH" + +Here is a sanity check to verify that the CUDA Toolkit is correctly installed: + +.. code-block:: console + + $ nvcc --version # verify that nvcc is in your PATH + $ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME + + +Unsupported OS build +-------------------- + +vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. + +Simply set the ``VLLM_TARGET_DEVICE`` environment variable to ``empty`` before installing: + +.. 
code-block:: console + + $ export VLLM_TARGET_DEVICE=empty + $ pip install -e . diff --git a/vllm-v0.6.2/docs/source/getting_started/neuron-installation.rst b/vllm-v0.6.2/docs/source/getting_started/neuron-installation.rst new file mode 100644 index 0000000..025ba6e --- /dev/null +++ b/vllm-v0.6.2/docs/source/getting_started/neuron-installation.rst @@ -0,0 +1,140 @@ +.. _installation_neuron: + +Installation with Neuron +======================== + +vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching. +Paged Attention and Chunked Prefill are currently in development and will be available soon. +Data types currently supported in Neuron SDK are FP16 and BF16. + +Requirements +------------ + +* OS: Linux +* Python: 3.9 -- 3.11 +* Accelerator: NeuronCore_v2 (in trn1/inf2 instances) +* Pytorch 2.0.1/2.1.1 +* AWS Neuron SDK 2.16/2.17 (Verified on python 3.8) + +Installation steps: + +- :ref:`Build from source ` + + - :ref:`Step 0. Launch Trn1/Inf2 instances ` + - :ref:`Step 1. Install drivers and tools ` + - :ref:`Step 2. Install transformers-neuronx and its dependencies ` + - :ref:`Step 3. Install vLLM from source ` + +.. _build_from_source_neuron: + +.. note:: + + The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with vLLM >= 0.5.3. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. + +Build from source +----------------- + +Following instructions are applicable to Neuron SDK 2.16 and beyond. + +.. _launch_instances: + +Step 0. Launch Trn1/Inf2 instances +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Here are the steps to launch trn1/inf2 instances, in order to install `PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS `_. + +- Please follow the instructions at `launch an Amazon EC2 Instance `_ to launch an instance. 
When choosing the instance type at the EC2 console, please make sure to select the correct instance type. +- To get more information about instance sizes and pricing see: `Trn1 web page `_, `Inf2 web page `_ +- Select Ubuntu Server 22.04 LTS AMI +- When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB. +- After launching the instance, follow the instructions in `Connect to your instance `_ to connect to the instance + +.. _install_drivers: + +Step 1. Install drivers and tools +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The installation of drivers and tools wouldn't be necessary, if `Deep Learning AMI Neuron `_ is installed. In case the drivers and tools are not installed on the operating system, follow the steps below: + +.. code-block:: console + + # Configure Linux for Neuron repository updates + . /etc/os-release + sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <`_ will be the backend to support inference on trn1/inf2 instances. +Follow the steps below to install transformers-neuronx package and its dependencies. + +.. code-block:: console + + # Install Python venv + sudo apt-get install -y python3.10-venv g++ + + # Create Python venv + python3.10 -m venv aws_neuron_venv_pytorch + + # Activate Python venv + source aws_neuron_venv_pytorch/bin/activate + + # Install Jupyter notebook kernel + pip install ipykernel + python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)" + pip install jupyter notebook + pip install environment_kernels + + # Set pip repository pointing to the Neuron repository + python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com + + # Install wget, awscli + python -m pip install wget + python -m pip install awscli + + # Update Neuron Compiler and Framework + python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx + +.. _install_vllm: + +Step 3. 
Install vLLM from source +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows: + +.. code-block:: console + + $ git clone https://github.com/vllm-project/vllm.git + $ cd vllm + $ pip install -U -r requirements-neuron.txt + $ VLLM_TARGET_DEVICE="neuron" pip install . + +If neuron packages are detected correctly in the installation process, ``vllm-0.3.0+neuron212`` will be installed. diff --git a/vllm-v0.6.2/docs/source/getting_started/openvino-installation.rst b/vllm-v0.6.2/docs/source/getting_started/openvino-installation.rst new file mode 100644 index 0000000..5eeb7c7 --- /dev/null +++ b/vllm-v0.6.2/docs/source/getting_started/openvino-installation.rst @@ -0,0 +1,116 @@ +.. _installation_openvino: + +Installation with OpenVINO +========================== + +vLLM powered by OpenVINO supports all LLM models from :doc:`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs (`the list of supported GPUs `_). OpenVINO vLLM backend supports the following advanced vLLM features: + +- Prefix caching (``--enable-prefix-caching``) +- Chunked prefill (``--enable-chunked-prefill``) + +**Table of contents**: + +- :ref:`Requirements ` +- :ref:`Quick start using Dockerfile ` +- :ref:`Build from source ` +- :ref:`Performance tips ` +- :ref:`Limitations ` + +.. _openvino_backend_requirements: + +Requirements +------------ + +* OS: Linux +* Instruction set architecture (ISA) requirement: at least AVX2. + +.. _openvino_backend_quick_start_dockerfile: + +Quick start using Dockerfile +---------------------------- + +.. code-block:: console + + $ docker build -f Dockerfile.openvino -t vllm-openvino-env . + $ docker run -it --rm vllm-openvino-env + +.. 
_install_openvino_backend_from_source: + +Install from source +------------------- + +- First, install Python. For example, on Ubuntu 22.04, you can run: + + .. code-block:: console + + $ sudo apt-get update -y + $ sudo apt-get install python3 + +- Second, install the prerequisites for vLLM OpenVINO backend installation: + + .. code-block:: console + + $ pip install --upgrade pip + $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu + +- Finally, install vLLM with OpenVINO backend: + + .. code-block:: console + + $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v . + +- [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: `https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html `_. + +.. _openvino_backend_performance_tips: + +Performance tips +---------------- + +vLLM OpenVINO backend environment variables +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- ``VLLM_OPENVINO_DEVICE`` to specify which device to utilize for the inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g., ``VLLM_OPENVINO_DEVICE=GPU.1``). If the value is not specified, CPU device is used by default. + +- ``VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON`` to enable U8 weights compression during model loading stage. By default, compression is turned off. You can also export model with different compression techniques using `optimum-cli` and pass the exported folder as the model ID. + +CPU performance tips +~~~~~~~~~~~~~~~~~~~~ + +CPU uses the following environment variables to control behavior: + +- ``VLLM_OPENVINO_KVCACHE_SPACE`` to specify the KV Cache size (e.g., ``VLLM_OPENVINO_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. 
This parameter should be set based on the hardware configuration and memory management pattern of users. + +- ``VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8`` to control KV cache precision. By default, FP16 / BF16 is used depending on platform. + +To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (``--enable-chunked-prefill``). Based on the experiments, the recommended batch size is ``256`` (``--max-num-batched-tokens``) + +OpenVINO best known configuration for CPU is: + +.. code-block:: console + + $ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ + python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256 + +GPU performance tips +~~~~~~~~~~~~~~~~~~~~ +GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account ``gpu_memory_utilization`` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using ``VLLM_OPENVINO_KVCACHE_SPACE`` environment variable (e.g, ``VLLM_OPENVINO_KVCACHE_SPACE=8`` means 8 GB space for KV cache). + +Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`. + +OpenVINO best known configuration for GPU is: + +.. code-block:: console + + $ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ + python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json + +.. _openvino_backend_limitations: + +Limitations +----------- + +- LoRA serving is not supported. 
+ +- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration. + +- Tensor and pipeline parallelism are not currently enabled in vLLM integration. diff --git a/vllm-v0.6.2/docs/source/getting_started/quickstart.rst b/vllm-v0.6.2/docs/source/getting_started/quickstart.rst new file mode 100644 index 0000000..0c0491c --- /dev/null +++ b/vllm-v0.6.2/docs/source/getting_started/quickstart.rst @@ -0,0 +1,181 @@ +.. _quickstart: + +========== +Quickstart +========== + +This guide will help you quickly get started with vLLM to: + +* :ref:`Run offline batched inference ` +* :ref:`Run OpenAI-compatible inference ` + +Prerequisites +-------------- +- OS: Linux +- Python: 3.9 -- 3.12 +- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) + +Installation +-------------- + +You can install vLLM using pip. It's recommended to use `conda `_ to create and manage Python environments. + +.. code-block:: console + + $ conda create -n myenv python=3.10 -y + $ conda activate myenv + $ pip install vllm + +Please refer to the :ref:`installation documentation ` for more details on installing vLLM. + +.. _offline_batched_inference: + +Offline Batched Inference +------------------------- + +With vLLM installed, you can start generating text for a list of input prompts (i.e., offline batch inference). The example script for this section can be found `here `__. + +The first line of this example imports the classes :class:`~vllm.LLM` and :class:`~vllm.SamplingParams`: + +- :class:`~vllm.LLM` is the main class for running offline inference with vLLM engine. +- :class:`~vllm.SamplingParams` specifies the parameters for the sampling process. + +.. code-block:: python + + from vllm import LLM, SamplingParams + +The next section defines a list of input prompts and sampling parameters for text generation.
The `sampling temperature `_ is set to ``0.8`` and the `nucleus sampling probability `_ is set to ``0.95``. You can find more information about the sampling parameters `here `__. + +.. code-block:: python + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +The :class:`~vllm.LLM` class initializes vLLM's engine and the `OPT-125M model `_ for offline inference. The list of supported models can be found :ref:`here `. + +.. code-block:: python + + llm = LLM(model="facebook/opt-125m") + +.. note:: + + By default, vLLM downloads models from `HuggingFace `_. If you would like to use models from `ModelScope `_, set the environment variable ``VLLM_USE_MODELSCOPE`` before initializing the engine. + +Now, the fun part! The outputs are generated using ``llm.generate``. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all of the output tokens. + +.. code-block:: python + + outputs = llm.generate(prompts, sampling_params) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +.. _openai_compatible_server: + +OpenAI-Compatible Server +------------------------ + +vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API. +By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time and implements endpoints such as `list models `_, `create chat completion `_, and `create completion `_ endpoints. 
+ +Run the following command to start the vLLM server with the `Qwen2.5-1.5B-Instruct `_ model: + +.. code-block:: console + + $ vllm serve Qwen/Qwen2.5-1.5B-Instruct + +.. note:: + + By default, the server uses a predefined chat template stored in the tokenizer. You can learn about overriding it `here `__. + +This server can be queried in the same format as OpenAI API. For example, to list the models: + +.. code-block:: console + + $ curl http://localhost:8000/v1/models + +You can pass in the argument ``--api-key`` or environment variable ``VLLM_API_KEY`` to enable the server to check for API key in the header. + +OpenAI Completions API with vLLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Once your server is started, you can query the model with input prompts: + +.. code-block:: console + + $ curl http://localhost:8000/v1/completions \ + $ -H "Content-Type: application/json" \ + $ -d '{ + $ "model": "Qwen/Qwen2.5-1.5B-Instruct", + $ "prompt": "San Francisco is a", + $ "max_tokens": 7, + $ "temperature": 0 + $ }' + +Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the ``openai`` python package: + +.. code-block:: python + + from openai import OpenAI + + # Modify OpenAI's API key and API base to use vLLM's API server. + openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) + completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", + prompt="San Francisco is a") + print("Completion result:", completion) + +A more detailed client example can be found `here `__. + +OpenAI Chat Completions API with vLLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +vLLM is designed to also support the OpenAI Chat Completions API. 
The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. + +You can use the `create chat completion `_ endpoint to interact with the model: + +.. code-block:: console + + $ curl http://localhost:8000/v1/chat/completions \ + $ -H "Content-Type: application/json" \ + $ -d '{ + $ "model": "Qwen/Qwen2.5-1.5B-Instruct", + $ "messages": [ + $ {"role": "system", "content": "You are a helpful assistant."}, + $ {"role": "user", "content": "Who won the world series in 2020?"} + $ ] + $ }' + +Alternatively, you can use the ``openai`` python package: + +.. code-block:: python + + from openai import OpenAI + # Set OpenAI's API key and API base to use vLLM's API server. + openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" + + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) + + chat_response = client.chat.completions.create( + model="Qwen/Qwen2.5-1.5B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Tell me a joke."}, + ] + ) + print("Chat response:", chat_response) diff --git a/vllm-v0.6.2/docs/source/getting_started/tpu-installation.rst b/vllm-v0.6.2/docs/source/getting_started/tpu-installation.rst new file mode 100644 index 0000000..75ab2b6 --- /dev/null +++ b/vllm-v0.6.2/docs/source/getting_started/tpu-installation.rst @@ -0,0 +1,184 @@ +.. _installation_tpu: + +##################### +Installation with TPU +##################### + +Tensor Processing Units (TPUs) are Google's custom-developed application-specific +integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs +are available in different versions each with different hardware specifications. +For more information about TPUs, see `TPU System Architecture `_. 
+For more information on the TPU versions supported with vLLM, see: + +* `TPU v6e `_ +* `TPU v5e `_ +* `TPU v5p `_ +* `TPU v4 `_ + +These TPU versions allow you to configure the physical arrangements of the TPU +chips. This can improve throughput and networking performance. For more +information see: + +* `TPU v6e topologies `_ +* `TPU v5e topologies `_ +* `TPU v5p topologies `_ +* `TPU v4 topologies `_ + +In order for you to use Cloud TPUs you need to have TPU quota granted to your +Google Cloud Platform project. TPU quotas specify how many TPUs you can use in a +GCP project and are specified in terms of TPU version, the number of TPUs you +want to use, and quota type. For more information, see `TPU quota `_. + +For TPU pricing information, see `Cloud TPU pricing `_. + +You may need additional persistent storage for your TPU VMs. For more +information, see `Storage options for Cloud TPU data `_. + +Requirements +------------ + +* Google Cloud TPU VM +* TPU versions: v6e, v5e, v5p, v4 +* Python: 3.10 or newer + +Provision Cloud TPUs +==================== + +You can provision Cloud TPUs using the `Cloud TPU API `_ +or the `queued resources `_ +API. This section shows how to create TPUs using the queued resource API. +For more information about using the Cloud TPU API, see `Create a Cloud TPU using the Create Node API `_. +`Queued resources `_ +enable you to request Cloud TPU resources in a queued manner. When you request +queued resources, the request is added to a queue maintained by the Cloud TPU +service. When the requested resource becomes available, it's assigned to your +Google Cloud project for your immediate exclusive use. + +Provision a Cloud TPU with the queued resource API +-------------------------------------------------- +Create a TPU v5e with 4 TPU chips: + +..
code-block:: console + + gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ + --node-id TPU_NAME \ + --project PROJECT_ID \ + --zone ZONE \ + --accelerator-type ACCELERATOR_TYPE \ + --runtime-version RUNTIME_VERSION \ + --service-account SERVICE_ACCOUNT + +.. list-table:: Parameter descriptions + :header-rows: 1 + + * - Parameter name + - Description + * - QUEUED_RESOURCE_ID + - The user-assigned ID of the queued resource request. + * - TPU_NAME + - The user-assigned name of the TPU which is created when the queued + resource request is allocated. + * - PROJECT_ID + - Your Google Cloud project + * - ZONE + - The `zone `_ where you + want to create your Cloud TPU. + * - ACCELERATOR_TYPE + - The TPU version you want to use. Specify the TPU version, followed by a + '-' and the number of TPU cores. For example `v5e-4` specifies a v5e TPU + with 4 cores. For more information, see `TPU versions `_. + * - RUNTIME_VERSION + - The TPU VM runtime version to use. For more information see `TPU VM images `_. + * - SERVICE_ACCOUNT + - The email address for your service account. You can find it in the IAM + Cloud Console under *Service Accounts*. For example: + `tpu-service-account@.iam.gserviceaccount.com` + +Connect to your TPU using SSH: + +.. code-block:: bash + + gcloud compute tpus tpu-vm ssh TPU_NAME + +Create and activate a Conda environment for vLLM: + +.. code-block:: bash + + conda create -n vllm python=3.10 -y + conda activate vllm + +Clone the vLLM repository and go to the vLLM directory: + +.. code-block:: bash + + git clone https://github.com/vllm-project/vllm.git && cd vllm + +Uninstall the existing `torch` and `torch_xla` packages: + +.. code-block:: bash + + pip uninstall torch torch-xla -y + +Install build dependencies: + +.. code-block:: bash + + pip install -r requirements-tpu.txt + sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev + +Run the setup script: + +.. 
code-block:: bash + + VLLM_TARGET_DEVICE="tpu" python setup.py develop + + +Provision Cloud TPUs with GKE +----------------------------- + +For more information about using TPUs with GKE, see +https://cloud.google.com/kubernetes-engine/docs/how-to/tpus +https://cloud.google.com/kubernetes-engine/docs/concepts/tpus +https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus + +.. _build_docker_tpu: + +Build a docker image with :code:`Dockerfile.tpu` +------------------------------------------------ + +You can use `Dockerfile.tpu `_ +to build a Docker image with TPU support. + +.. code-block:: console + + $ docker build -f Dockerfile.tpu -t vllm-tpu . + +Run the Docker image with the following command: + +.. code-block:: console + + $ # Make sure to add `--privileged --net host --shm-size=16G`. + $ docker run --privileged --net host --shm-size=16G -it vllm-tpu + +.. note:: + + Since TPU relies on XLA which requires static shapes, vLLM bucketizes the possible input shapes and compiles an XLA graph for each different shape. + The compilation time may take 20~30 minutes in the first run. + However, the compilation time reduces to ~5 minutes afterwards because the XLA graphs are cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default). + +.. tip:: + + If you encounter the following error: + + .. code-block:: console + + from torch._C import * # noqa: F403 + ImportError: libopenblas.so.0: cannot open shared object file: No such file or directory + + + Install OpenBLAS with the following command: + + .. code-block:: console + + $ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev + diff --git a/vllm-v0.6.2/docs/source/getting_started/xpu-installation.rst b/vllm-v0.6.2/docs/source/getting_started/xpu-installation.rst new file mode 100644 index 0000000..b1868ac --- /dev/null +++ b/vllm-v0.6.2/docs/source/getting_started/xpu-installation.rst @@ -0,0 +1,80 @@ +.. 
_installation_xpu: + +Installation with XPU +======================== + +vLLM initially supports basic model inferencing and serving on Intel GPU platform. + +Table of contents: + +#. :ref:`Requirements ` +#. :ref:`Quick start using Dockerfile ` +#. :ref:`Build from source ` + +.. _xpu_backend_requirements: + +Requirements +------------ + +* OS: Linux +* Supported Hardware: Intel Data Center GPU, Intel ARC GPU +* OneAPI requirements: oneAPI 2024.2 + +.. _xpu_backend_quick_start_dockerfile: + +Quick start using Dockerfile +---------------------------- + +.. code-block:: console + + $ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . + $ docker run -it \ + --rm \ + --network=host \ + --device /dev/dri \ + -v /dev/dri/by-path:/dev/dri/by-path \ + vllm-xpu-env + +.. _build_xpu_backend_from_source: + +Build from source +----------------- + +- First, install required driver and intel OneAPI 2024.2 or later. + +- Second, install Python packages for vLLM XPU backend building: + +.. code-block:: console + + $ source /opt/intel/oneapi/setvars.sh + $ pip install --upgrade pip + $ pip install -v -r requirements-xpu.txt + +- Finally, build and install vLLM XPU backend: + +.. code-block:: console + + $ VLLM_TARGET_DEVICE=xpu python setup.py install + +.. note:: + - FP16 is the default data type in the current XPU backend. The BF16 data + type will be supported in the future. + + +Distributed inference and serving +--------------------------------- + +XPU platform supports tensor-parallel inference/serving and also supports pipeline parallel as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution looks like the following: + +..
code-block:: console + + $ python -m vllm.entrypoints.openai.api_server \ + $ --model=facebook/opt-13b \ + $ --dtype=bfloat16 \ + $ --device=xpu \ + $ --max_model_len=1024 \ + $ --distributed-executor-backend=ray \ + $ --pipeline-parallel-size=2 \ + $ -tp=8 + +By default, a Ray instance will be launched automatically if no existing one is detected in the system, with ``num-gpus`` equal to ``parallel_config.world_size``. We recommend properly starting a Ray cluster before execution, referring to the helper `script `_. diff --git a/vllm-v0.6.2/docs/source/index.rst b/vllm-v0.6.2/docs/source/index.rst new file mode 100644 index 0000000..a2abd29 --- /dev/null +++ b/vllm-v0.6.2/docs/source/index.rst @@ -0,0 +1,179 @@ +Welcome to vLLM! +================ + +.. figure:: ./assets/logos/vllm-logo-text-light.png + :width: 60% + :align: center + :alt: vLLM + :class: no-scaled-link + +.. raw:: html + +

+ Easy, fast, and cheap LLM serving for everyone + +

+ +

+ + Star + Watch + Fork +

+ + + +vLLM is a fast and easy-to-use library for LLM inference and serving. + +vLLM is fast with: + +* State-of-the-art serving throughput +* Efficient management of attention key and value memory with **PagedAttention** +* Continuous batching of incoming requests +* Fast model execution with CUDA/HIP graph +* Quantization: `GPTQ `_, `AWQ `_, INT4, INT8, and FP8 +* Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. +* Speculative decoding +* Chunked prefill + +vLLM is flexible and easy to use with: + +* Seamless integration with popular HuggingFace models +* High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more +* Tensor parallelism and pipeline parallelism support for distributed inference +* Streaming outputs +* OpenAI-compatible API server +* Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators. +* Prefix caching support +* Multi-lora support + +For more information, check out the following: + +* `vLLM announcing blog post `_ (intro to PagedAttention) +* `vLLM paper `_ (SOSP 2023) +* `How continuous batching enables 23x throughput in LLM inference while reducing p50 latency `_ by Cade Daniel et al. +* :ref:`vLLM Meetups `. + + +Documentation +------------- + +.. toctree:: + :maxdepth: 1 + :caption: Getting Started + + getting_started/installation + getting_started/amd-installation + getting_started/openvino-installation + getting_started/cpu-installation + getting_started/gaudi-installation + getting_started/neuron-installation + getting_started/tpu-installation + getting_started/xpu-installation + getting_started/quickstart + getting_started/debugging + getting_started/examples/examples_index + +.. 
toctree:: + :maxdepth: 1 + :caption: Serving + + serving/openai_compatible_server + serving/deploying_with_docker + serving/deploying_with_k8s + serving/deploying_with_nginx + serving/distributed_serving + serving/metrics + serving/env_vars + serving/usage_stats + serving/integrations + serving/tensorizer + serving/compatibility_matrix + serving/faq + +.. toctree:: + :maxdepth: 1 + :caption: Models + + models/supported_models + models/adding_model + models/enabling_multimodal_inputs + models/engine_args + models/lora + models/vlm + models/spec_decode + models/performance + +.. toctree:: + :maxdepth: 1 + :caption: Quantization + + quantization/supported_hardware + quantization/auto_awq + quantization/bnb + quantization/gguf + quantization/int8 + quantization/fp8 + quantization/fp8_e5m2_kvcache + quantization/fp8_e4m3_kvcache + +.. toctree:: + :maxdepth: 1 + :caption: Automatic Prefix Caching + + automatic_prefix_caching/apc + automatic_prefix_caching/details + +.. toctree:: + :maxdepth: 1 + :caption: Performance + + performance/benchmarks + +.. Community: User community resources + +.. toctree:: + :maxdepth: 1 + :caption: Community + + community/meetups + community/sponsors + +.. API Documentation: API reference aimed at vllm library usage + +.. toctree:: + :maxdepth: 2 + :caption: API Documentation + + dev/sampling_params + dev/pooling_params + dev/offline_inference/offline_index + dev/engine/engine_index + +.. Design: docs about vLLM internals + +.. toctree:: + :maxdepth: 2 + :caption: Design + + design/class_hierarchy + design/huggingface_integration + design/input_processing/model_inputs_index + design/kernel/paged_attention + design/multimodal/multimodal_index + +.. For Developers: contributing to the vLLM project + +.. 
toctree:: + :maxdepth: 2 + :caption: For Developers + + contributing/overview + contributing/profiling/profiling_index + contributing/dockerfile/dockerfile + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` diff --git a/vllm-v0.6.2/docs/source/models/adding_model.rst b/vllm-v0.6.2/docs/source/models/adding_model.rst new file mode 100644 index 0000000..c6d88cc --- /dev/null +++ b/vllm-v0.6.2/docs/source/models/adding_model.rst @@ -0,0 +1,141 @@ +.. _adding_a_new_model: + +Adding a New Model +================== + +This document provides a high-level guide on integrating a `HuggingFace Transformers `_ model into vLLM. + +.. note:: + The complexity of adding a new model depends heavily on the model's architecture. + The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. + However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. + +.. note:: + By default, vLLM models do not support multi-modal inputs. To enable multi-modal support, + please follow :ref:`this guide ` after implementing the model here. + +.. tip:: + If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub `_ repository. + We will be happy to help you out! + + +0. Fork the vLLM repository +-------------------------------- + +Start by forking our `GitHub`_ repository and then :ref:`build it from source `. +This gives you the ability to modify the codebase and test your model. + +.. tip:: + If you don't want to fork the repository and modify vLLM's codebase, please refer to the "Out-of-Tree Model Integration" section below. + +1. Bring your model code +------------------------ + +Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the `vllm/model_executor/models `_ directory. 
+For instance, vLLM's `OPT model `_ was adapted from the HuggingFace's `modeling_opt.py `_ file. + +.. warning:: + When copying the model code, make sure to review and adhere to the code's copyright and licensing terms. + + +2. Rewrite the :code:`forward` methods +-------------------------------------- + +Next, you need to rewrite the :meth:`~torch.nn.Module.forward` method of your model by following these steps: + +1. Remove any unnecessary code, such as the code only used for training. +2. Change the input parameters: + +.. code-block:: diff + + def forward( + self, + input_ids: torch.Tensor, + - attention_mask: Optional[torch.Tensor] = None, + - position_ids: Optional[torch.LongTensor] = None, + - past_key_values: Optional[List[torch.FloatTensor]] = None, + - inputs_embeds: Optional[torch.FloatTensor] = None, + - labels: Optional[torch.LongTensor] = None, + - use_cache: Optional[bool] = None, + - output_attentions: Optional[bool] = None, + - output_hidden_states: Optional[bool] = None, + - return_dict: Optional[bool] = None, + - ) -> Union[Tuple, CausalLMOutputWithPast]: + + positions: torch.Tensor, + + kv_caches: List[torch.Tensor], + + attn_metadata: AttentionMetadata, + + ) -> Optional[SamplerOutput]: + +1. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors. +2. Replace the attention operation with either :code:`PagedAttention`, :code:`PagedAttentionWithRoPE`, or :code:`PagedAttentionWithALiBi` depending on the model's architecture. + +.. note:: + Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. + If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. + + +3. 
(Optional) Implement tensor parallelism and quantization support +------------------------------------------------------------------- + +If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it. +To do this, substitute your model's linear and embedding layers with their tensor-parallel versions. +For the embedding layer, you can simply replace :class:`torch.nn.Embedding` with :code:`VocabParallelEmbedding`. For the output LM head, you can use :code:`ParallelLMHead`. +When it comes to the linear layers, we provide the following options to parallelize them: + +* :code:`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving. +* :code:`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer. +* :code:`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer. +* :code:`MergedColumnParallelLinear`: Column-parallel linear that merges multiple :code:`ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices. +* :code:`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When the number of key/value heads is less than the world size, this class replicates the key/value heads properly.
This class handles the weight loading and replication of the weight matrices. + +Note that all the linear layers above take :code:`linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization. + +4. Implement the weight loading logic +------------------------------------- + +You now need to implement the :code:`load_weights` method in your :code:`*ForCausalLM` class. +This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for :code:`MergedColumnParallelLinear` and :code:`QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately. + +5. Register your model +---------------------- + +Finally, register your :code:`*ForCausalLM` class to the :code:`_VLLM_MODELS` in `vllm/model_executor/models/registry.py `_. + +6. Out-of-Tree Model Integration +-------------------------------------------- + +We also provide a way to integrate a model without modifying the vLLM codebase. Step 2, 3, 4 are still required, but you can skip step 1 and 5. + +Just add the following lines in your code: + +.. code-block:: python + + from vllm import ModelRegistry + from your_code import YourModelForCausalLM + ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) + +If your model imports modules that initialize CUDA, consider instead lazy-importing it to avoid an error like :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`: + +.. code-block:: python + + from vllm import ModelRegistry + + ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") + +.. important:: + If your model is a multimodal model, make sure the model class implements the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. + Read more about that :ref:`here `. 
+ +If you are running api server with :code:`vllm serve `, you can wrap the entrypoint with the following code: + +.. code-block:: python + + from vllm import ModelRegistry + from your_code import YourModelForCausalLM + ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) + + if __name__ == '__main__': + import runpy + runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') + +Save the above code in a file and run it with :code:`python your_file.py `. diff --git a/vllm-v0.6.2/docs/source/models/enabling_multimodal_inputs.rst b/vllm-v0.6.2/docs/source/models/enabling_multimodal_inputs.rst new file mode 100644 index 0000000..49b5285 --- /dev/null +++ b/vllm-v0.6.2/docs/source/models/enabling_multimodal_inputs.rst @@ -0,0 +1,147 @@ +.. _enabling_multimodal_inputs: + +Enabling Multimodal Inputs +========================== + +This document walks you through the steps to extend a vLLM model so that it accepts :ref:`multi-modal ` inputs. + +.. seealso:: + :ref:`adding_a_new_model` + + +1. Update the base vLLM model +----------------------------- + +It is assumed that you have already implemented the model in vLLM according to :ref:`these steps `. +Further update the model as follows: + +- Implement the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. + + .. code-block:: diff + + + from vllm.model_executor.models.interfaces import SupportsMultiModal + + - class YourModelForImage2Seq(nn.Module): + + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): + + .. note:: + The model class does not have to be named :code:`*ForCausalLM`. + Check out `the HuggingFace Transformers documentation `__ for some examples. + +- If you haven't already done so, reserve a keyword parameter in :meth:`~torch.nn.Module.forward` + for each input tensor that corresponds to a multi-modal input, as shown in the following example: + + .. 
code-block:: diff + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + + pixel_values: torch.Tensor, + ) -> SamplerOutput: + + +2. Register input mappers +------------------------- + +For each modality type that the model accepts as input, decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_input_mapper `. +This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in :meth:`~torch.nn.Module.forward`. + +.. code-block:: diff + + from vllm.model_executor.models.interfaces import SupportsMultiModal + + from vllm.multimodal import MULTIMODAL_REGISTRY + + + @MULTIMODAL_REGISTRY.register_image_input_mapper() + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): + +A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function. + +.. seealso:: + :ref:`input_processing_pipeline` + + +3. Register maximum number of multi-modal tokens +------------------------------------------------ + +For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item +and register it via :meth:`MULTIMODAL_REGISTRY.register_max_multimodal_tokens `. + +.. code-block:: diff + + from vllm.inputs import INPUT_REGISTRY + from vllm.model_executor.models.interfaces import SupportsMultiModal + from vllm.multimodal import MULTIMODAL_REGISTRY + + @MULTIMODAL_REGISTRY.register_image_input_mapper() + + @MULTIMODAL_REGISTRY.register_max_image_tokens() + @INPUT_REGISTRY.register_dummy_data() + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): + +Here are some examples: + +- Image inputs (static feature size): `LLaVA-1.5 Model `__ +- Image inputs (dynamic feature size): `LLaVA-NeXT Model `__ + +.. seealso:: + :ref:`input_processing_pipeline` + + +4.
(Optional) Register dummy data +--------------------------------- + +During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models. +In such cases, you can define your own dummy data by registering a factory method via :meth:`INPUT_REGISTRY.register_dummy_data `. + +.. code-block:: diff + + from vllm.inputs import INPUT_REGISTRY + from vllm.model_executor.models.interfaces import SupportsMultiModal + from vllm.multimodal import MULTIMODAL_REGISTRY + + @MULTIMODAL_REGISTRY.register_image_input_mapper() + @MULTIMODAL_REGISTRY.register_max_image_tokens() + + @INPUT_REGISTRY.register_dummy_data() + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): + +.. note:: + The dummy data should have the maximum possible number of multi-modal tokens, as described in the previous step. + +Here are some examples: + +- Image inputs (static feature size): `LLaVA-1.5 Model `__ +- Image inputs (dynamic feature size): `LLaVA-NeXT Model `__ + +.. seealso:: + :ref:`input_processing_pipeline` + + +5. (Optional) Register input processor +-------------------------------------- + +Sometimes, there is a need to process inputs at the :class:`~vllm.LLMEngine` level before they are passed to the model executor. +This is often due to the fact that unlike implementations in HuggingFace Transformers, the reshaping and/or expansion of multi-modal embeddings needs to take place outside model's :meth:`~torch.nn.Module.forward` call. +You can register input processors via :meth:`INPUT_REGISTRY.register_input_processor `. + +.. 
code-block:: diff + + from vllm.inputs import INPUT_REGISTRY + from vllm.model_executor.models.interfaces import SupportsMultiModal + from vllm.multimodal import MULTIMODAL_REGISTRY + + @MULTIMODAL_REGISTRY.register_image_input_mapper() + @MULTIMODAL_REGISTRY.register_max_image_tokens() + @INPUT_REGISTRY.register_dummy_data() + + @INPUT_REGISTRY.register_input_processor() + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): + +A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation. +Here are some examples: + +- Insert static number of image tokens: `LLaVA-1.5 Model `__ +- Insert dynamic number of image tokens: `LLaVA-NeXT Model `__ + +.. seealso:: + :ref:`input_processing_pipeline` diff --git a/vllm-v0.6.2/docs/source/models/engine_args.rst b/vllm-v0.6.2/docs/source/models/engine_args.rst new file mode 100644 index 0000000..e7ce8cd --- /dev/null +++ b/vllm-v0.6.2/docs/source/models/engine_args.rst @@ -0,0 +1,23 @@ +.. _engine_args: + +Engine Arguments +================ + +Below, you can find an explanation of every engine argument for vLLM: + +.. argparse:: + :module: vllm.engine.arg_utils + :func: _engine_args_parser + :prog: vllm serve + :nodefaultconst: + +Async Engine Arguments +---------------------- + +Below are the additional arguments related to the asynchronous engine: + +.. argparse:: + :module: vllm.engine.arg_utils + :func: _async_engine_args_parser + :prog: vllm serve + :nodefaultconst: \ No newline at end of file diff --git a/vllm-v0.6.2/docs/source/models/lora.rst b/vllm-v0.6.2/docs/source/models/lora.rst new file mode 100644 index 0000000..ef0177e --- /dev/null +++ b/vllm-v0.6.2/docs/source/models/lora.rst @@ -0,0 +1,225 @@ +.. _lora: + +Using LoRA adapters +=================== + +This document shows you how to use `LoRA adapters `_ with vLLM on top of a base model. 
+ +LoRA adapters can be used with any vLLM model that implements :class:`~vllm.model_executor.models.interfaces.SupportsLoRA`. + +Adapters can be efficiently served on a per request basis with minimal overhead. First we download the adapter(s) and save +them locally with + +.. code-block:: python + + from huggingface_hub import snapshot_download + + sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") + + +Then we instantiate the base model and pass in the ``enable_lora=True`` flag: + +.. code-block:: python + + from vllm import LLM, SamplingParams + from vllm.lora.request import LoRARequest + + llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True) + + +We can now submit the prompts and call ``llm.generate`` with the ``lora_request`` parameter. The first parameter +of ``LoRARequest`` is a human identifiable name, the second parameter is a globally unique ID for the adapter and +the third parameter is the path to the LoRA adapter. + +.. code-block:: python + + sampling_params = SamplingParams( + temperature=0, + max_tokens=256, + stop=["[/assistant]"] + ) + + prompts = [ + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", + ] + + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) + ) + + +Check out `examples/multilora_inference.py `_ +for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. 
+ +Serving LoRA Adapters +--------------------- +LoRA adapted models can also be served with the Open-AI compatible vLLM server. To do so, we use +``--lora-modules {name}={path} {name}={path}`` to specify each LoRA module when we kickoff the server: + +.. code-block:: bash + + vllm serve meta-llama/Llama-2-7b-hf \ + --enable-lora \ + --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ + +.. note:: + The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one. + +The server entrypoint accepts all other LoRA configuration parameters (``max_loras``, ``max_lora_rank``, ``max_cpu_loras``, +etc.), which will apply to all forthcoming requests. Upon querying the ``/models`` endpoint, we should see our LoRA along +with its base model: + +.. code-block:: bash + + curl localhost:8000/v1/models | jq . + { + "object": "list", + "data": [ + { + "id": "meta-llama/Llama-2-7b-hf", + "object": "model", + ... + }, + { + "id": "sql-lora", + "object": "model", + ... + } + ] + } + +Requests can specify the LoRA adapter as if it were any other model via the ``model`` request parameter. The requests will be +processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other +LoRA adapter requests if they were provided and ``max_loras`` is set high enough). + +The following is an example request + +.. 
code-block:: bash + + curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "sql-lora", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 + }' | jq + + +Dynamically serving LoRA Adapters +--------------------------------- + +In addition to serving LoRA adapters at server startup, the vLLM server now supports dynamically loading and unloading +LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility +to change models on-the-fly is needed. + +Note: Enabling this feature in production environments is risky, as users may then participate in model adapter management. + +To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING` +is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active. + +.. code-block:: bash + + export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True + + +Loading a LoRA Adapter: + +To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary +details of the adapter to be loaded. The request payload should include the name and path to the LoRA adapter. + +Example request to load a LoRA adapter: + +.. code-block:: bash + + curl -X POST http://localhost:8000/v1/load_lora_adapter \ + -H "Content-Type: application/json" \ + -d '{ + "lora_name": "sql_adapter", + "lora_path": "/path/to/sql-lora-adapter" + }' + +Upon a successful request, the API will respond with a 200 OK status code. If an error occurs, such as if the adapter +cannot be found or loaded, an appropriate error message will be returned. + +Unloading a LoRA Adapter: + +To unload a LoRA adapter that has been previously loaded, send a POST request to the `/v1/unload_lora_adapter` endpoint +with the name or ID of the adapter to be unloaded. + +Example request to unload a LoRA adapter: + +.. 
code-block:: bash + + curl -X POST http://localhost:8000/v1/unload_lora_adapter \ + -H "Content-Type: application/json" \ + -d '{ + "lora_name": "sql_adapter" + }' + + +New format for `--lora-modules` +------------------------------- + +In previous versions, users provided LoRA modules as key-value pairs in the `name=path` format. For example: + +.. code-block:: bash + + --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ + +This would only include the `name` and `path` for each LoRA module, but did not provide a way to specify a `base_model_name`. +Now, you can specify a `base_model_name` alongside the `name` and `path` using JSON format. For example: + +.. code-block:: bash + + --lora-modules '{"name": "sql-lora", "path": "/path/to/lora", "base_model_name": "meta-llama/Llama-2-7b"}' + +For backward compatibility, you can still use the old key-value format (`name=path`), but the `base_model_name` will remain unspecified in that case. + + +LoRA model lineage in model card +-------------------------------- + +The new format of `--lora-modules` is mainly to support the display of parent model information in the model card. The example response of the `/v1/models` endpoint shown below reflects this as follows: + +- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter. +- The `root` field points to the artifact location of the LoRA adapter. + +.. 
code-block:: bash + + $ curl http://localhost:8000/v1/models + + { + "object": "list", + "data": [ + { + "id": "meta-llama/Llama-2-7b-hf", + "object": "model", + "created": 1715644056, + "owned_by": "vllm", + "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/", + "parent": null, + "permission": [ + { + ..... + } + ] + }, + { + "id": "sql-lora", + "object": "model", + "created": 1715644056, + "owned_by": "vllm", + "root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/", + "parent": "meta-llama/Llama-2-7b-hf", + "permission": [ + { + .... + } + ] + } + ] + } diff --git a/vllm-v0.6.2/docs/source/models/performance.rst b/vllm-v0.6.2/docs/source/models/performance.rst new file mode 100644 index 0000000..23b5ab7 --- /dev/null +++ b/vllm-v0.6.2/docs/source/models/performance.rst @@ -0,0 +1,65 @@ +.. _performance: + +Performance and Tuning +====================== + +Preemption +---------- +Due to the auto-regressive nature of the transformer architecture, there are times when KV cache space is insufficient to handle all batched requests. +vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes +available again. When this occurs, the following warning is printed: + +``` +WARNING 05-09 00:49:33 scheduler.py:1057] Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1 +``` + +While this mechanism ensures system robustness, preemption and recomputation can adversely affect end-to-end latency. +If you frequently encounter preemptions from the vLLM engine, consider the following actions: + +- Increase `gpu_memory_utilization`. 
vLLM pre-allocates GPU cache using the fraction of GPU memory given by `gpu_memory_utilization`. By increasing this utilization, you can provide more KV cache space. +- Decrease `max_num_seqs` or `max_num_batched_tokens`. This can reduce the number of concurrent requests in a batch, thereby requiring less KV cache space. +- Increase `tensor_parallel_size`. This approach shards model weights, so each GPU has more memory available for KV cache. + +You can also monitor the number of preemption requests through Prometheus metrics exposed by vLLM. Additionally, you can log the cumulative number of preemption requests by setting `disable_log_stats=False`. + +.. _chunked-prefill: + +Chunked Prefill +--------------- +vLLM supports an experimental feature called chunked prefill. Chunked prefill allows large prefills to be split into smaller chunks and batched together with decode requests. + +You can enable the feature by specifying ``--enable-chunked-prefill`` in the command line or setting ``enable_chunked_prefill=True`` in the LLM constructor. + +.. code-block:: python + + llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True) + # Set max_num_batched_tokens to tune performance. + # NOTE: 512 is the default max_num_batched_tokens for chunked prefill. + # llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=512) + +By default, the vLLM scheduler prioritizes prefills and does not batch prefill and decode requests in the same batch. +This policy optimizes the TTFT (time to the first token), but incurs slower ITL (inter token latency) and inefficient GPU utilization. + +Once chunked prefill is enabled, the policy is changed to prioritize decode requests. +It adds all pending decode requests to the batch before scheduling any prefill. +When there is available token budget (``max_num_batched_tokens``), it schedules pending prefills. +If the last pending prefill request cannot fit into ``max_num_batched_tokens``, it chunks it. 
+ +This policy has two benefits: + +- It improves ITL and decode performance because decode requests are prioritized. +- It helps achieve better GPU utilization by placing compute-bound (prefill) and memory-bound (decode) requests in the same batch. + +You can tune the performance by changing ``max_num_batched_tokens``. +By default, it is set to 512, which has the best ITL on A100 in the initial benchmark (llama 70B and mixtral 8x22B). +Smaller ``max_num_batched_tokens`` achieves better ITL because there are fewer prefills interrupting decodes. +Higher ``max_num_batched_tokens`` achieves better TTFT as you can fit more prefill tokens into the batch. + +- If ``max_num_batched_tokens`` is the same as ``max_model_len``, that is almost equivalent to the default scheduling policy (except that it still prioritizes decodes). +- Note that the default value (512) of ``max_num_batched_tokens`` is optimized for ITL, and it may have lower throughput than the default scheduler. + +We recommend you set ``max_num_batched_tokens > 2048`` for throughput. + +See related papers for more details (https://arxiv.org/pdf/2401.08671 or https://arxiv.org/pdf/2308.16369). + +Please try out this feature and let us know your feedback via GitHub issues! \ No newline at end of file diff --git a/vllm-v0.6.2/docs/source/models/spec_decode.rst b/vllm-v0.6.2/docs/source/models/spec_decode.rst new file mode 100644 index 0000000..d57ffec --- /dev/null +++ b/vllm-v0.6.2/docs/source/models/spec_decode.rst @@ -0,0 +1,207 @@ +.. _spec_decode: + +Speculative decoding in vLLM +============================ + +.. warning:: + Please note that speculative decoding in vLLM is not yet optimized and does + not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. The work + to optimize it is ongoing and can be followed in `this issue. `_ + +This document shows how to use `Speculative Decoding `_ with vLLM. 
+Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference. + +Speculating with a draft model +------------------------------ + +The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time. + +.. code-block:: python + + from vllm import LLM, SamplingParams + + prompts = [ + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + llm = LLM( + model="facebook/opt-6.7b", + tensor_parallel_size=1, + speculative_model="facebook/opt-125m", + num_speculative_tokens=5, + ) + outputs = llm.generate(prompts, sampling_params) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +To perform the same with an online mode launch the server: + +.. code-block:: bash + + python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \ + --seed 42 -tp 1 --speculative_model facebook/opt-125m --use-v2-block-manager \ + --num_speculative_tokens 5 --gpu_memory_utilization 0.8 + +Then use a client: + +.. code-block:: python + + from openai import OpenAI + + # Modify OpenAI's API key and API base to use vLLM's API server. 
+ openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" + + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) + + models = client.models.list() + model = models.data[0].id + + # Completion API + stream = False + completion = client.completions.create( + model=model, + prompt="The future of AI is", + echo=False, + n=1, + stream=stream, + ) + + print("Completion results:") + if stream: + for c in completion: + print(c) + else: + print(completion) + +Speculating by matching n-grams in the prompt +--------------------------------------------- + +The following code configures vLLM to use speculative decoding where proposals are generated by +matching n-grams in the prompt. For more information read `this thread. `_ + +.. code-block:: python + + from vllm import LLM, SamplingParams + + prompts = [ + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + llm = LLM( + model="facebook/opt-6.7b", + tensor_parallel_size=1, + speculative_model="[ngram]", + num_speculative_tokens=5, + ngram_prompt_lookup_max=4, + ) + outputs = llm.generate(prompts, sampling_params) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +Speculating using MLP speculators +--------------------------------- + +The following code configures vLLM to use speculative decoding where proposals are generated by +draft models that condition draft predictions on both context vectors and sampled tokens. +For more information see `this blog `_ or +`this technical report `_. + +.. 
code-block:: python + + from vllm import LLM, SamplingParams + + prompts = [ + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + llm = LLM( + model="meta-llama/Meta-Llama-3.1-70B-Instruct", + tensor_parallel_size=4, + speculative_model="ibm-fms/llama3-70b-accelerator", + speculative_draft_tensor_parallel_size=1, + ) + outputs = llm.generate(prompts, sampling_params) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +Note that these speculative models currently need to be run without tensor parallelism, although +it is possible to run the main model using tensor parallelism (see example above). Since the +speculative models are relatively small, we still see significant speedups. However, this +limitation will be fixed in a future release. + +A variety of speculative models of this type are available on HF hub: + +* `llama-13b-accelerator `_ +* `llama3-8b-accelerator `_ +* `codellama-34b-accelerator `_ +* `llama2-70b-accelerator `_ +* `llama3-70b-accelerator `_ +* `granite-3b-code-instruct-accelerator `_ +* `granite-8b-code-instruct-accelerator `_ +* `granite-7b-instruct-accelerator `_ +* `granite-20b-code-instruct-accelerator `_ + +Lossless guarantees of Speculative Decoding +------------------------------------------- +In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of +speculative decoding, breaking down the guarantees into three key areas: + +1. **Theoretical Losslessness** + - Speculative decoding sampling is theoretically lossless up to the precision limits of hardware numerics. Floating-point errors might + cause slight variations in output distributions, as discussed + in `Accelerating Large Language Model Decoding with Speculative Sampling `_ + +2. 
**Algorithmic Losslessness** + - vLLM’s implementation of speculative decoding is algorithmically validated to be lossless. Key validation tests include: + + - **Rejection Sampler Convergence**: Ensures that samples from vLLM’s rejection sampler align with the target + distribution. `View Test Code `_ + + - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling + without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler, + provides a lossless guarantee. Almost all of the tests in `this directory `_ + verify this property using `this assertion implementation `_ + +3. **vLLM Logprob Stability** + - vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the + same request across runs. For more details, see the FAQ section + titled *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq>`_. + + +**Conclusion** + +While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding +can occur due to the following factors: + +- **Floating-Point Precision**: Differences in hardware numerical precision may lead to slight discrepancies in the output distribution. + +- **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially + due to non-deterministic behavior in batched operations or numerical instability. + +**Mitigation Strategies** + +For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq>`_. + +Resources for vLLM contributors +------------------------------- +* `A Hacker's Guide to Speculative Decoding in vLLM `_ +* `What is Lookahead Scheduling in vLLM? 
`_ +* `Information on batch expansion `_ +* `Dynamic speculative decoding `_ diff --git a/vllm-v0.6.2/docs/source/models/supported_models.rst b/vllm-v0.6.2/docs/source/models/supported_models.rst new file mode 100644 index 0000000..96a513d --- /dev/null +++ b/vllm-v0.6.2/docs/source/models/supported_models.rst @@ -0,0 +1,633 @@ +.. _supported_models: + +Supported Models +================ + +vLLM supports a variety of generative and embedding models from `HuggingFace (HF) Transformers `_. +This page lists the model architectures that are currently supported by vLLM. +Alongside each architecture, we include some popular models that use it. + +For other models, you can check the :code:`config.json` file inside the model repository. +If the :code:`"architectures"` field contains a model architecture listed below, then it should be supported in theory. + +.. tip:: + The easiest way to check if your model is really supported at runtime is to run the program below: + + .. code-block:: python + + from vllm import LLM + + llm = LLM(model=...) # Name or path of your model + output = llm.generate("Hello, my name is") + print(output) + + If vLLM successfully generates text, it indicates that your model is supported. + +Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` +for instructions on how to implement your model in vLLM. +Alternatively, you can `open an issue on GitHub `_ to request vLLM support. + +.. note:: + To use models from `ModelScope `_ instead of HuggingFace Hub, set an environment variable: + + .. code-block:: shell + + $ export VLLM_USE_MODELSCOPE=True + + And use with :code:`trust_remote_code=True`. + + .. code-block:: python + + from vllm import LLM + + llm = LLM(model=..., revision=..., trust_remote_code=True) # Name or path of your model + output = llm.generate("Hello, my name is") + print(output) + +Text-only Language Models +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Text Generation +--------------- + +.. 
list-table:: + :widths: 25 25 50 5 5 + :header-rows: 1 + + * - Architecture + - Models + - Example HF Models + - :ref:`LoRA ` + - :ref:`PP ` + * - :code:`AquilaForCausalLM` + - Aquila, Aquila2 + - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. + - ✅︎ + - ✅︎ + * - :code:`ArcticForCausalLM` + - Arctic + - :code:`Snowflake/snowflake-arctic-base`, :code:`Snowflake/snowflake-arctic-instruct`, etc. + - + - ✅︎ + * - :code:`BaiChuanForCausalLM` + - Baichuan2, Baichuan + - :code:`baichuan-inc/Baichuan2-13B-Chat`, :code:`baichuan-inc/Baichuan-7B`, etc. + - ✅︎ + - ✅︎ + * - :code:`BloomForCausalLM` + - BLOOM, BLOOMZ, BLOOMChat + - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. + - + - ✅︎ + * - :code:`BartForConditionalGeneration` + - BART + - :code:`facebook/bart-base`, :code:`facebook/bart-large-cnn`, etc. + - + - + * - :code:`ChatGLMModel` + - ChatGLM + - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. + - ✅︎ + - ✅︎ + * - :code:`CohereForCausalLM` + - Command-R + - :code:`CohereForAI/c4ai-command-r-v01`, etc. + - ✅︎ + - ✅︎ + * - :code:`DbrxForCausalLM` + - DBRX + - :code:`databricks/dbrx-base`, :code:`databricks/dbrx-instruct`, etc. + - + - ✅︎ + * - :code:`DeciLMForCausalLM` + - DeciLM + - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc. + - + - ✅︎ + * - :code:`DeepseekForCausalLM` + - DeepSeek + - :code:`deepseek-ai/deepseek-llm-67b-base`, :code:`deepseek-ai/deepseek-llm-7b-chat` etc. + - + - ✅︎ + * - :code:`DeepseekV2ForCausalLM` + - DeepSeek-V2 + - :code:`deepseek-ai/DeepSeek-V2`, :code:`deepseek-ai/DeepSeek-V2-Chat` etc. + - + - ✅︎ + * - :code:`ExaoneForCausalLM` + - EXAONE-3 + - :code:`LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. + - ✅︎ + - ✅︎ + * - :code:`FalconForCausalLM` + - Falcon + - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. + - + - ✅︎ + * - :code:`FalconMambaForCausalLM` + - FalconMamba + - :code:`tiiuae/falcon-mamba-7b`, :code:`tiiuae/falcon-mamba-7b-instruct`, etc. 
+ - ✅︎ + - + * - :code:`GemmaForCausalLM` + - Gemma + - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc. + - ✅︎ + - ✅︎ + * - :code:`Gemma2ForCausalLM` + - Gemma2 + - :code:`google/gemma-2-9b`, :code:`google/gemma-2-27b`, etc. + - ✅︎ + - ✅︎ + * - :code:`GPT2LMHeadModel` + - GPT-2 + - :code:`gpt2`, :code:`gpt2-xl`, etc. + - + - ✅︎ + * - :code:`GPTBigCodeForCausalLM` + - StarCoder, SantaCoder, WizardCoder + - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. + - ✅︎ + - ✅︎ + * - :code:`GPTJForCausalLM` + - GPT-J + - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. + - + - ✅︎ + * - :code:`GPTNeoXForCausalLM` + - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM + - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc. + - + - ✅︎ + * - :code:`GraniteForCausalLM` + - Granite 3.0, PowerLM + - :code:`ibm-granite/granite-3.0-2b-base`, :code:`ibm-granite/granite-3.0-8b-instruct`, :code:`ibm/PowerLM-3b`, etc. + - ✅︎ + - ✅︎ + * - :code:`GraniteMoeForCausalLM` + - Granite 3.0 MoE, PowerMoE + - :code:`ibm-granite/granite-3.0-1b-a400m-base`, :code:`ibm-granite/granite-3.0-3b-a800m-instruct`, :code:`ibm/PowerMoE-3b`, etc. + - ✅︎ + - ✅︎ + * - :code:`InternLMForCausalLM` + - InternLM + - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc. + - ✅︎ + - ✅︎ + * - :code:`InternLM2ForCausalLM` + - InternLM2 + - :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc. + - + - ✅︎ + * - :code:`JAISLMHeadModel` + - Jais + - :code:`inceptionai/jais-13b`, :code:`inceptionai/jais-13b-chat`, :code:`inceptionai/jais-30b-v3`, :code:`inceptionai/jais-30b-chat-v3`, etc. + - + - ✅︎ + * - :code:`JambaForCausalLM` + - Jamba + - :code:`ai21labs/AI21-Jamba-1.5-Large`, :code:`ai21labs/AI21-Jamba-1.5-Mini`, :code:`ai21labs/Jamba-v0.1`, etc. 
+ - ✅︎ + - + * - :code:`LlamaForCausalLM` + - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi + - :code:`meta-llama/Meta-Llama-3.1-405B-Instruct`, :code:`meta-llama/Meta-Llama-3.1-70B`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-70b-hf`, :code:`01-ai/Yi-34B`, etc. + - ✅︎ + - ✅︎ + * - :code:`MambaForCausalLM` + - Mamba + - :code:`state-spaces/mamba-130m-hf`, :code:`state-spaces/mamba-790m-hf`, :code:`state-spaces/mamba-2.8b-hf`, etc. + - + - + * - :code:`MiniCPMForCausalLM` + - MiniCPM + - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, :code:`openbmb/MiniCPM-S-1B-sft`, etc. + - ✅︎ + - ✅︎ + * - :code:`MiniCPM3ForCausalLM` + - MiniCPM3 + - :code:`openbmb/MiniCPM3-4B`, etc. + - ✅︎ + - ✅︎ + * - :code:`MistralForCausalLM` + - Mistral, Mistral-Instruct + - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc. + - ✅︎ + - ✅︎ + * - :code:`MixtralForCausalLM` + - Mixtral-8x7B, Mixtral-8x7B-Instruct + - :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, :code:`mistral-community/Mixtral-8x22B-v0.1`, etc. + - ✅︎ + - ✅︎ + * - :code:`MPTForCausalLM` + - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter + - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. + - + - ✅︎ + * - :code:`NemotronForCausalLM` + - Nemotron-3, Nemotron-4, Minitron + - :code:`nvidia/Minitron-8B-Base`, :code:`mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. + - ✅︎ + - ✅︎ + * - :code:`OLMoForCausalLM` + - OLMo + - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc. + - + - ✅︎ + * - :code:`OLMoEForCausalLM` + - OLMoE + - :code:`allenai/OLMoE-1B-7B-0924`, :code:`allenai/OLMoE-1B-7B-0924-Instruct`, etc. + - ✅︎ + - ✅︎ + * - :code:`OPTForCausalLM` + - OPT, OPT-IML + - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. + - + - ✅︎ + * - :code:`OrionForCausalLM` + - Orion + - :code:`OrionStarAI/Orion-14B-Base`, :code:`OrionStarAI/Orion-14B-Chat`, etc. 
+ - + - ✅︎ + * - :code:`PhiForCausalLM` + - Phi + - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc. + - ✅︎ + - ✅︎ + * - :code:`Phi3ForCausalLM` + - Phi-3 + - :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, :code:`microsoft/Phi-3-medium-128k-instruct`, etc. + - ✅︎ + - ✅︎ + * - :code:`Phi3SmallForCausalLM` + - Phi-3-Small + - :code:`microsoft/Phi-3-small-8k-instruct`, :code:`microsoft/Phi-3-small-128k-instruct`, etc. + - + - ✅︎ + * - :code:`PhiMoEForCausalLM` + - Phi-3.5-MoE + - :code:`microsoft/Phi-3.5-MoE-instruct`, etc. + - ✅︎ + - ✅︎ + * - :code:`PersimmonForCausalLM` + - Persimmon + - :code:`adept/persimmon-8b-base`, :code:`adept/persimmon-8b-chat`, etc. + - + - ✅︎ + * - :code:`QWenLMHeadModel` + - Qwen + - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. + - ✅︎ + - ✅︎ + * - :code:`Qwen2ForCausalLM` + - Qwen2 + - :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc. + - ✅︎ + - ✅︎ + * - :code:`Qwen2MoeForCausalLM` + - Qwen2MoE + - :code:`Qwen/Qwen1.5-MoE-A2.7B`, :code:`Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. + - + - ✅︎ + * - :code:`StableLmForCausalLM` + - StableLM + - :code:`stabilityai/stablelm-3b-4e1t`, :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc. + - + - ✅︎ + * - :code:`Starcoder2ForCausalLM` + - Starcoder2 + - :code:`bigcode/starcoder2-3b`, :code:`bigcode/starcoder2-7b`, :code:`bigcode/starcoder2-15b`, etc. + - + - ✅︎ + * - :code:`SolarForCausalLM` + - Solar Pro + - :code:`upstage/solar-pro-preview-instruct`, etc. + - ✅︎ + - ✅︎ + * - :code:`XverseForCausalLM` + - XVERSE + - :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc. + - ✅︎ + - ✅︎ + +.. note:: + Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. + +Text Embedding +-------------- + +.. 
list-table:: + :widths: 25 25 50 5 5 + :header-rows: 1 + + * - Architecture + - Models + - Example HF Models + - :ref:`LoRA ` + - :ref:`PP ` + * - :code:`Gemma2Model` + - Gemma2-based + - :code:`BAAI/bge-multilingual-gemma2`, etc. + - + - ✅︎ + * - :code:`LlamaModel`, :code:`LlamaForCausalLM`, :code:`MistralModel`, etc. + - Llama-based + - :code:`intfloat/e5-mistral-7b-instruct`, etc. + - ✅︎ + - ✅︎ + * - :code:`Qwen2Model`, :code:`Qwen2ForCausalLM` + - Qwen2-based + - :code:`ssmits/Qwen2-7B-Instruct-embed-base`, :code:`Alibaba-NLP/gte-Qwen2-1.5B-instruct`, etc. + - ✅︎ + - ✅︎ + +.. important:: + Some model architectures support both generation and embedding tasks. + In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. + +.. tip:: + You can override the model's pooling method by passing :code:`--override-pooler-config`. + +Reward Modeling +--------------- + +.. list-table:: + :widths: 25 25 50 5 5 + :header-rows: 1 + + * - Architecture + - Models + - Example HF Models + - :ref:`LoRA ` + - :ref:`PP ` + * - :code:`Qwen2ForRewardModel` + - Qwen2-based + - :code:`Qwen/Qwen2.5-Math-RM-72B`, etc. + - ✅︎ + - ✅︎ + +.. note:: + As an interim measure, these models are supported in both offline and online inference via Embeddings API. + +Classification +--------------- + +.. list-table:: + :widths: 25 25 50 5 5 + :header-rows: 1 + + * - Architecture + - Models + - Example HF Models + - :ref:`LoRA ` + - :ref:`PP ` + * - :code:`Qwen2ForSequenceClassification` + - Qwen2-based + - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc. + - ✅︎ + - ✅︎ + +.. note:: + As an interim measure, these models are supported in both offline and online inference via Embeddings API. + + +Multimodal Language Models +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The following modalities are supported depending on the model: + +- **T**\ ext +- **I**\ mage +- **V**\ ideo +- **A**\ udio + +Any combination of modalities joined by :code:`+` are supported. 
+ +- e.g.: :code:`T + I` means that the model supports text-only, image-only, and text-with-image inputs. + +On the other hand, modalities separated by :code:`/` are mutually exclusive. + +- e.g.: :code:`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. + +.. _supported_vlms: + +Text Generation +--------------- + +.. list-table:: + :widths: 25 25 15 25 5 5 + :header-rows: 1 + + * - Architecture + - Models + - Inputs + - Example HF Models + - :ref:`LoRA ` + - :ref:`PP ` + * - :code:`Blip2ForConditionalGeneration` + - BLIP-2 + - T + I\ :sup:`E` + - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc. + - + - ✅︎ + * - :code:`ChameleonForConditionalGeneration` + - Chameleon + - T + I + - :code:`facebook/chameleon-7b` etc. + - + - ✅︎ + * - :code:`FuyuForCausalLM` + - Fuyu + - T + I + - :code:`adept/fuyu-8b` etc. + - + - ✅︎ + * - :code:`ChatGLMModel` + - GLM-4V + - T + I + - :code:`THUDM/glm-4v-9b` etc. + - + - ✅︎ + * - :code:`H2OVLChatModel` + - H2OVL + - T + I\ :sup:`E+` + - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc. + - + - ✅︎ + * - :code:`Idefics3ForConditionalGeneration` + - Idefics3 + - T + I + - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc. + - ✅︎ + - + * - :code:`InternVLChatModel` + - InternVL2 + - T + I\ :sup:`E+` + - :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc. + - + - ✅︎ + * - :code:`LlavaForConditionalGeneration` + - LLaVA-1.5 + - T + I\ :sup:`E+` + - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc. + - + - ✅︎ + * - :code:`LlavaNextForConditionalGeneration` + - LLaVA-NeXT + - T + I\ :sup:`E+` + - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc. + - + - ✅︎ + * - :code:`LlavaNextVideoForConditionalGeneration` + - LLaVA-NeXT-Video + - T + V + - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. 
+ - + - ✅︎ + * - :code:`LlavaOnevisionForConditionalGeneration` + - LLaVA-Onevision + - T + I\ :sup:`+` + V\ :sup:`+` + - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. + - + - ✅︎ + * - :code:`MiniCPMV` + - MiniCPM-V + - T + I\ :sup:`E+` + - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc. + - ✅︎ + - ✅︎ + * - :code:`MllamaForConditionalGeneration` + - Llama 3.2 + - T + I\ :sup:`+` + - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc. + - + - + * - :code:`MolmoForCausalLM` + - Molmo + - T + I + - :code:`allenai/Molmo-7B-D-0924`, :code:`allenai/Molmo-72B-0924`, etc. + - + - ✅︎ + * - :code:`NVLM_D_Model` + - NVLM-D 1.0 + - T + I\ :sup:`E+` + - :code:`nvidia/NVLM-D-72B`, etc. + - + - ✅︎ + * - :code:`PaliGemmaForConditionalGeneration` + - PaliGemma + - T + I\ :sup:`E` + - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc. + - + - ✅︎ + * - :code:`Phi3VForCausalLM` + - Phi-3-Vision, Phi-3.5-Vision + - T + I\ :sup:`E+` + - :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc. + - + - ✅︎ + * - :code:`PixtralForConditionalGeneration` + - Pixtral + - T + I\ :sup:`+` + - :code:`mistralai/Pixtral-12B-2409`, :code:`mistral-community/pixtral-12b` etc. + - + - ✅︎ + * - :code:`QWenLMHeadModel` + - Qwen-VL + - T + I\ :sup:`E+` + - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc. + - ✅︎ + - ✅︎ + * - :code:`Qwen2AudioForConditionalGeneration` + - Qwen2-Audio + - T + A\ :sup:`+` + - :code:`Qwen/Qwen2-Audio-7B-Instruct` + - + - ✅︎ + * - :code:`Qwen2VLForConditionalGeneration` + - Qwen2-VL + - T + I\ :sup:`E+` + V\ :sup:`E+` + - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. 
+ - ✅︎ + - ✅︎ + * - :code:`UltravoxModel` + - Ultravox + - T + A\ :sup:`E+` + - :code:`fixie-ai/ultravox-v0_3` + - + - ✅︎ + +| :sup:`E` Pre-computed embeddings can be inputted for this modality. +| :sup:`+` Multiple items can be inputted per text prompt for this modality. + +.. note:: + vLLM currently only supports adding LoRA to the language backbone of multimodal models. + +.. note:: + For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. + For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 + +Multimodal Embedding +-------------------- + +.. list-table:: + :widths: 25 25 15 25 5 5 + :header-rows: 1 + + * - Architecture + - Models + - Inputs + - Example HF Models + - :ref:`LoRA ` + - :ref:`PP ` + * - :code:`LlavaNextForConditionalGeneration` + - LLaVA-NeXT-based + - T / I + - :code:`royokong/e5-v` + - + - ✅︎ + * - :code:`Phi3VForCausalLM` + - Phi-3-Vision-based + - T + I + - :code:`TIGER-Lab/VLM2Vec-Full` + - 🚧 + - ✅︎ + * - :code:`Qwen2VLForConditionalGeneration` + - Qwen2-VL-based + - T + I + - :code:`MrLight/dse-qwen2-2b-mrl-v1` + - + - ✅︎ + +.. important:: + Some model architectures support both generation and embedding tasks. + In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. + +.. tip:: + You can override the model's pooling method by passing :code:`--override-pooler-config`. + +Model Support Policy +===================== + +At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support: + +1. **Community-Driven Support**: We encourage community contributions for adding new models. 
When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated! + +2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. + +3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. + +4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use. + +5. **Selective Focus**: Our resources are primarily directed towards models with significant user interest and impact. Models that are less frequently used may receive less attention, and we rely on the community to play a more active role in their upkeep and improvement. 
+ +Through this approach, vLLM fosters a collaborative environment where both the core development team and the broader community contribute to the robustness and diversity of the third-party models supported in our ecosystem. + +Note that, as an inference engine, vLLM does not introduce new models. Therefore, all models supported by vLLM are third-party models in this regard. + +We have the following levels of testing for models: + +1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `models tests `_ for the models that have passed this test. +2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test. +3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to `functionality tests `_ and `examples `_ for the models that have passed this test. +4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category. diff --git a/vllm-v0.6.2/docs/source/models/vlm.rst b/vllm-v0.6.2/docs/source/models/vlm.rst new file mode 100644 index 0000000..bcbe50a --- /dev/null +++ b/vllm-v0.6.2/docs/source/models/vlm.rst @@ -0,0 +1,330 @@ +.. _vlm: + +Using VLMs +========== + +vLLM provides experimental support for Vision Language Models (VLMs). See the :ref:`list of supported VLMs here `. +This document shows you how to run and serve these models using vLLM. + +.. note:: + We are actively iterating on VLM support. 
See `this RFC `_ for upcoming changes, + and `open an issue on GitHub `_ if you have any feedback or feature requests. + +Offline Inference +----------------- + +Single-image input +^^^^^^^^^^^^^^^^^^ + +The :class:`~vllm.LLM` class can be instantiated in much the same way as language-only models. + +.. code-block:: python + + llm = LLM(model="llava-hf/llava-1.5-7b-hf") + +To pass an image to the model, note the following in :class:`vllm.inputs.PromptType`: + +* ``prompt``: The prompt should follow the format that is documented on HuggingFace. +* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`. + +.. code-block:: python + + # Refer to the HuggingFace repo for the correct format to use + prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" + + # Load the image using PIL.Image + image = PIL.Image.open(...) + + # Single prompt inference + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": image}, + }) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + # Inference with image embeddings as input + image_embeds = torch.load(...) # torch.Tensor of shape (1, image_feature_size, hidden_size of LM) + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": image_embeds}, + }) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + # Inference with image embeddings as input with additional parameters + # Specifically, we are conducting a trial run of Qwen2VL and MiniCPM-V with the new input format, which utilizes additional parameters. + mm_data = {} + + image_embeds = torch.load(...) # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM) + # For Qwen2VL, image_grid_thw is needed to calculate positional encoding. + mm_data['image'] = { + "image_embeds": image_embeds, + "image_grid_thw": torch.load(...) 
# torch.Tensor of shape (1, 3), + } + # For MiniCPM-V, image_size_list is needed to calculate details of the sliced image. + mm_data['image'] = { + "image_embeds": image_embeds, + "image_size_list": [image.size] # list of image sizes + } + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": mm_data, + }) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + # Batch inference + image_1 = PIL.Image.open(...) + image_2 = PIL.Image.open(...) + outputs = llm.generate( + [ + { + "prompt": "USER: \nWhat is the content of this image?\nASSISTANT:", + "multi_modal_data": {"image": image_1}, + }, + { + "prompt": "USER: \nWhat's the color of this image?\nASSISTANT:", + "multi_modal_data": {"image": image_2}, + } + ] + ) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + +A code example can be found in `examples/offline_inference_vision_language.py `_. + +Multi-image input +^^^^^^^^^^^^^^^^^ + +Multi-image input is only supported for a subset of VLMs, as shown :ref:`here `. + +To enable multiple multi-modal items per text prompt, you have to set ``limit_mm_per_prompt`` for the :class:`~vllm.LLM` class. + +.. code-block:: python + + llm = LLM( + model="microsoft/Phi-3.5-vision-instruct", + trust_remote_code=True, # Required to load Phi-3.5-vision + max_model_len=4096, # Otherwise, it may not fit in smaller GPUs + limit_mm_per_prompt={"image": 2}, # The maximum number to accept + ) + +Instead of passing in a single image, you can pass in a list of images. + +.. code-block:: python + + # Refer to the HuggingFace repo for the correct format to use + prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n" + + # Load the images using PIL.Image + image1 = PIL.Image.open(...) + image2 = PIL.Image.open(...) 
+ + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": { + "image": [image1, image2] + }, + }) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + +A code example can be found in `examples/offline_inference_vision_language_multi_image.py `_. + +Multi-image input can be extended to perform video captioning. We show this with `Qwen2-VL `_ as it supports videos: + +.. code-block:: python + + # Specify the maximum number of frames per video to be 4. This can be changed. + llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) + + # Create the request payload. + video_frames = ... # load your video making sure it only has the number of frames specified earlier. + message = { + "role": "user", + "content": [ + {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."}, + ], + } + for i in range(len(video_frames)): + base64_image = encode_image(video_frames[i]) # base64 encoding. + new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}} + message["content"].append(new_image) + + # Perform inference and log output. + outputs = llm.chat([message]) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + +Online Inference +---------------- + +OpenAI Vision API +^^^^^^^^^^^^^^^^^ + +You can serve vision language models with vLLM's HTTP server that is compatible with `OpenAI Vision API `_. + +Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruct`` with vLLM's OpenAI-compatible API server. + +.. code-block:: bash + + vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ + --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 + +.. important:: + Since OpenAI Vision API is based on `Chat Completions API `_, + a chat template is **required** to launch the API server. 
+ + Although Phi-3.5-Vision comes with a chat template, for other models you may have to provide one if the model's tokenizer does not come with it. + The chat template can be inferred based on the documentation on the model's HuggingFace repo. + For example, LLaVA-1.5 (``llava-hf/llava-1.5-7b-hf``) requires a chat template that can be found `here `_. + +To consume the server, you can use the OpenAI client like in the example below: + +.. code-block:: python + + from openai import OpenAI + + openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" + + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) + + # Single-image input inference + image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + + chat_response = client.chat.completions.create( + model="microsoft/Phi-3.5-vision-instruct", + messages=[{ + "role": "user", + "content": [ + # NOTE: The prompt formatting with the image token `` is not needed + # since the prompt will be processed automatically by the API server. 
+ {"type": "text", "text": "What’s in this image?"}, + {"type": "image_url", "image_url": {"url": image_url}}, + ], + }], + ) + print("Chat completion output:", chat_response.choices[0].message.content) + + # Multi-image input inference + image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" + image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" + + chat_response = client.chat.completions.create( + model="microsoft/Phi-3.5-vision-instruct", + messages=[{ + "role": "user", + "content": [ + {"type": "text", "text": "What are the animals in these images?"}, + {"type": "image_url", "image_url": {"url": image_url_duck}}, + {"type": "image_url", "image_url": {"url": image_url_lion}}, + ], + }], + ) + print("Chat completion output:", chat_response.choices[0].message.content) + +A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py `_. + +.. tip:: + Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via ``--allowed-local-media-path`` when launching the API server/engine, + and pass the file path as ``url`` in the API request. + +.. tip:: + There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. + In fact, you can place image placeholders in the middle of the text by interleaving text and image content. + +.. note:: + + By default, the timeout for fetching images through http url is ``5`` seconds. You can override this by setting the environment variable: + + .. code-block:: console + + $ export VLLM_IMAGE_FETCH_TIMEOUT= + +Chat Embeddings API +^^^^^^^^^^^^^^^^^^^ + +vLLM's Chat Embeddings API is a superset of OpenAI's `Embeddings API `_, +where a list of ``messages`` can be passed instead of batched ``inputs``. 
This enables multi-modal inputs to be passed to embedding models. + +.. tip:: + The schema of ``messages`` is exactly the same as in Chat Completions API. + +In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model. + +.. code-block:: bash + + vllm serve TIGER-Lab/VLM2Vec-Full --task embedding \ + --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja + +.. important:: + + Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embedding`` + to run this model in embedding mode instead of text generation mode. + +.. important:: + + VLM2Vec does not expect chat-based input. We use a `custom chat template `_ + to combine the text and images together. + +Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library: + +.. code-block:: python + + import requests + + image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + + response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": "TIGER-Lab/VLM2Vec-Full", + "messages": [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Represent the given image."}, + ], + }], + "encoding_format": "float", + }, + ) + response.raise_for_status() + response_json = response.json() + print("Embedding output:", response_json["data"][0]["embedding"]) + +Here is an example for serving the ``MrLight/dse-qwen2-2b-mrl-v1`` model. + +.. code-block:: bash + + vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embedding \ + --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja + +.. important:: + + Like with VLM2Vec, we have to explicitly pass ``--task embedding``. 
Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings, + which is handled by the jinja template. + +.. important:: + + Also important, ``MrLight/dse-qwen2-2b-mrl-v1`` requires a placeholder image of the minimum image size for text query embeddings. See the full code + example below for details. + +A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py `_. diff --git a/vllm-v0.6.2/docs/source/performance/benchmarks.rst b/vllm-v0.6.2/docs/source/performance/benchmarks.rst new file mode 100644 index 0000000..6d4d7b5 --- /dev/null +++ b/vllm-v0.6.2/docs/source/performance/benchmarks.rst @@ -0,0 +1,33 @@ +.. _benchmarks: + +================ +Benchmark Suites +================ + +vLLM contains two sets of benchmarks: + ++ :ref:`Performance benchmarks ` ++ :ref:`Nightly benchmarks ` + + +.. _performance_benchmarks: + +Performance Benchmarks +---------------------- + +The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the ``perf-benchmarks`` and ``ready`` labels, and when a PR is merged into vLLM. + +The latest performance results are hosted on the public `vLLM Performance Dashboard `_. + +More information on the performance benchmarks and their parameters can be found `here `__. + +.. _nightly_benchmarks: + +Nightly Benchmarks +------------------ + +These compare vLLM's performance against alternatives (``tgi``, ``trt-llm``, and ``lmdeploy``) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the ``perf-benchmarks`` and ``nightly-benchmarks`` labels. + +The latest nightly benchmark results are shared in major release blog posts such as `vLLM v0.6.0 `_. 
+ +More information on the nightly benchmarks and their parameters can be found `here `__. \ No newline at end of file diff --git a/vllm-v0.6.2/docs/source/quantization/auto_awq.rst b/vllm-v0.6.2/docs/source/quantization/auto_awq.rst new file mode 100644 index 0000000..8eb6fa2 --- /dev/null +++ b/vllm-v0.6.2/docs/source/quantization/auto_awq.rst @@ -0,0 +1,79 @@ +.. _auto_awq: + +AutoAWQ +================== + +.. warning:: + + Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better + accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency + inference with a small number of concurrent requests. vLLM's AWQ implementation has lower throughput than the unquantized version. + +To create a new 4-bit quantized model, you can leverage `AutoAWQ `_. +Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%. +The main benefits are lower latency and memory usage. + +You can quantize your own models by installing AutoAWQ or picking one of the `400+ models on Huggingface `_. + +.. code-block:: console + + $ pip install autoawq + +After installing AutoAWQ, you are ready to quantize a model. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: + +.. 
code-block:: python + + from awq import AutoAWQForCausalLM + from transformers import AutoTokenizer + + model_path = 'mistralai/Mistral-7B-Instruct-v0.2' + quant_path = 'mistral-instruct-v0.2-awq' + quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } + + # Load model + model = AutoAWQForCausalLM.from_pretrained( + model_path, **{"low_cpu_mem_usage": True, "use_cache": False} + ) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + # Quantize + model.quantize(tokenizer, quant_config=quant_config) + + # Save quantized model + model.save_quantized(quant_path) + tokenizer.save_pretrained(quant_path) + + print(f'Model is quantized and saved at "{quant_path}"') + +To run an AWQ model with vLLM, you can use `TheBloke/Llama-2-7b-Chat-AWQ `_ with the following command: + +.. code-block:: console + + $ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq + +AWQ models are also supported directly through the LLM entrypoint: + +.. code-block:: python + + from vllm import LLM, SamplingParams + + # Sample prompts. + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + # Create an LLM. + llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ") + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. 
+ for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/vllm-v0.6.2/docs/source/quantization/bnb.rst b/vllm-v0.6.2/docs/source/quantization/bnb.rst new file mode 100644 index 0000000..682938c --- /dev/null +++ b/vllm-v0.6.2/docs/source/quantization/bnb.rst @@ -0,0 +1,43 @@ +.. _bits_and_bytes: + +BitsAndBytes +================== + +vLLM now supports `BitsAndBytes `_ for more efficient model inference. +BitsAndBytes quantizes models to reduce memory usage and enhance performance without significantly sacrificing accuracy. +Compared to other quantization methods, BitsAndBytes eliminates the need for calibrating the quantized model with input data. + +Below are the steps to utilize BitsAndBytes with vLLM. + +.. code-block:: console + + $ pip install "bitsandbytes>=0.44.0" + +vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoints. + +You can find bitsandbytes quantized models on https://huggingface.co/models?other=bitsandbytes. +And usually, these repositories have a config.json file that includes a quantization_config section. + +Read quantized checkpoint. +-------------------------- + +.. code-block:: python + + from vllm import LLM + import torch + # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint. + model_id = "unsloth/tinyllama-bnb-4bit" + llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ + quantization="bitsandbytes", load_format="bitsandbytes") + +Inflight quantization: load as 4bit quantization +------------------------------------------------ + +.. 
code-block:: python + + from vllm import LLM + import torch + model_id = "huggyllama/llama-7b" + llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ + quantization="bitsandbytes", load_format="bitsandbytes") + diff --git a/vllm-v0.6.2/docs/source/quantization/fp8.rst b/vllm-v0.6.2/docs/source/quantization/fp8.rst new file mode 100644 index 0000000..aacd07a --- /dev/null +++ b/vllm-v0.6.2/docs/source/quantization/fp8.rst @@ -0,0 +1,204 @@ +.. _fp8: + +FP8 W8A8 +================== + +vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as NVIDIA H100 and AMD MI300x. +Currently, only Hopper and Ada Lovelace GPUs are officially supported for W8A8. +Ampere GPUs are supported for W8A16 (weight-only FP8) utilizing Marlin kernels. +Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy. + +Please visit the HF collection of `quantized FP8 checkpoints of popular LLMs ready to use with vLLM `_. + +The FP8 types typically supported in hardware have two distinct representations, each useful in different scenarios: + +- **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and ``nan``. +- **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- ``inf``, and ``nan``. The tradeoff for the increased dynamic range is lower precision of the stored values. + +.. note:: + + FP8 computation is supported on NVIDIA GPUs with compute capability >= 8.9 (Ada Lovelace, Hopper). + FP8 models will run on compute capability >= 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin. 
+ +Quick Start with Online Dynamic Quantization +-------------------------------------------- + +Dynamic quantization of an original precision BF16/FP16 model to FP8 can be achieved with vLLM without any calibration data required. You can enable the feature by specifying ``--quantization="fp8"`` in the command line or setting ``quantization="fp8"`` in the LLM constructor. + +In this mode, all Linear modules (except for the final ``lm_head``) have their weights quantized down to FP8_E4M3 precision with a per-tensor scale. Activations have their minimum and maximum values calculated during each forward pass to provide a dynamic per-tensor scale for high accuracy. As a result, latency improvements are limited in this mode. + +.. code-block:: python + + from vllm import LLM + model = LLM("facebook/opt-125m", quantization="fp8") + # INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB + result = model.generate("Hello, my name is") + +.. warning:: + + Currently, we load the model at original precision before quantizing down to 8 bits, so you need enough memory to load the whole model. + +Installation +------------ + +To produce performant FP8 quantized models with vLLM, you'll need to install the `llm-compressor `_ library: + +.. code-block:: console + + $ pip install llmcompressor==0.1.0 + +Quantization Process +-------------------- + +The quantization process involves three main steps: + +1. Loading the model +2. Applying quantization +3. Evaluating accuracy in vLLM + +1. Loading the Model +^^^^^^^^^^^^^^^^^^^^ + +Use ``SparseAutoModelForCausalLM``, which wraps ``AutoModelForCausalLM``, for saving and loading quantized models: + +.. 
code-block:: python + + from llmcompressor.transformers import SparseAutoModelForCausalLM + from transformers import AutoTokenizer + + MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" + + model = SparseAutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto") + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +2. Applying Quantization +^^^^^^^^^^^^^^^^^^^^^^^^ + +For FP8 quantization, we can recover accuracy with simple RTN quantization. We recommend targeting all ``Linear`` layers using the ``FP8_DYNAMIC`` scheme, which uses: + +- Static, per-channel quantization on the weights +- Dynamic, per-token quantization on the activations + +Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow. + +.. code-block:: python + + from llmcompressor.transformers import oneshot + from llmcompressor.modifiers.quantization import QuantizationModifier + + # Configure the simple PTQ quantization + recipe = QuantizationModifier( + targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]) + + # Apply the quantization algorithm. + oneshot(model=model, recipe=recipe) + + # Save the model. + SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" + model.save_pretrained(SAVE_DIR) + tokenizer.save_pretrained(SAVE_DIR) + +3. Evaluating Accuracy +^^^^^^^^^^^^^^^^^^^^^^ + +Install ``vllm`` and ``lm-evaluation-harness``: + +.. code-block:: console + + $ pip install vllm lm-eval==0.4.4 + +Load and run the model in ``vllm``: + +.. code-block:: python + + from vllm import LLM + model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic") + model.generate("Hello my name is") + +Evaluate accuracy with ``lm_eval`` (for example on 250 samples of ``gsm8k``): + +.. note:: + + Quantized models can be sensitive to the presence of the ``bos`` token. 
``lm_eval`` does not add a ``bos`` token by default, so make sure to include the ``add_bos_token=True`` argument when running your evaluations. + +.. code-block:: console + + $ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic + $ lm_eval \ + --model vllm \ + --model_args pretrained=$MODEL,add_bos_token=True \ + --tasks gsm8k --num_fewshot 5 --batch_size auto --limit 250 + +Here's an example of the resulting scores: + +.. code-block:: text + + |Tasks|Version| Filter |n-shot| Metric | |Value| |Stderr| + |-----|------:|----------------|-----:|-----------|---|----:|---|-----:| + |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.768|± |0.0268| + | | |strict-match | 5|exact_match|↑ |0.768|± |0.0268| + +Troubleshooting and Support +--------------------------- + +If you encounter any issues or have feature requests, please open an issue on the ``vllm-project/llm-compressor`` GitHub repository. + + +Deprecated Flow +------------------ + +.. note:: + + The following information is preserved for reference and search purposes. + The quantization method described below is deprecated in favor of the ``llmcompressor`` method described above. + +For static per-tensor offline quantization to FP8, please install the `AutoFP8 library `_. + +.. code-block:: bash + + git clone https://github.com/neuralmagic/AutoFP8.git + pip install -e AutoFP8 + +This package introduces the ``AutoFP8ForCausalLM`` and ``BaseQuantizeConfig`` objects for managing how your model will be compressed. + +Offline Quantization with Static Activation Scaling Factors +----------------------------------------------------------- + +You can use AutoFP8 with calibration data to produce per-tensor static scales for both the weights and activations by enabling the ``activation_scheme="static"`` argument. + +.. 
code-block:: python + + from datasets import load_dataset + from transformers import AutoTokenizer + from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig + + pretrained_model_dir = "meta-llama/Meta-Llama-3-8B-Instruct" + quantized_model_dir = "Meta-Llama-3-8B-Instruct-FP8" + + tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) + tokenizer.pad_token = tokenizer.eos_token + + # Load and tokenize 512 dataset samples for calibration of activation scales + ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512)) + examples = [tokenizer.apply_chat_template(batch["messages"], tokenize=False) for batch in ds] + examples = tokenizer(examples, padding=True, truncation=True, return_tensors="pt").to("cuda") + + # Define quantization config with static activation scales + quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static") + + # Load the model, quantize, and save checkpoint + model = AutoFP8ForCausalLM.from_pretrained(pretrained_model_dir, quantize_config) + model.quantize(examples) + model.save_quantized(quantized_model_dir) + +Your model checkpoint with quantized weights and activations should be available at ``Meta-Llama-3-8B-Instruct-FP8/``. +Finally, you can load the quantized model checkpoint directly in vLLM. + +.. code-block:: python + + from vllm import LLM + model = LLM(model="Meta-Llama-3-8B-Instruct-FP8/") + # INFO 06-10 21:15:41 model_runner.py:159] Loading model weights took 8.4596 GB + result = model.generate("Hello, my name is") + diff --git a/vllm-v0.6.2/docs/source/quantization/fp8_e4m3_kvcache.rst b/vllm-v0.6.2/docs/source/quantization/fp8_e4m3_kvcache.rst new file mode 100644 index 0000000..cc52d8f --- /dev/null +++ b/vllm-v0.6.2/docs/source/quantization/fp8_e4m3_kvcache.rst @@ -0,0 +1,47 @@ +.. _fp8_e4m3_kvcache: + +FP8 E4M3 KV Cache +================== + +Quantizing the KV cache to FP8 reduces its memory footprint. 
This increases the number of tokens that can be stored in the cache, +improving throughput. OCP (Open Compute Project www.opencompute.org) specifies two common 8-bit floating point data formats: E5M2 +(5 exponent bits and 2 mantissa bits) and E4M3FN (4 exponent bits and 3 mantissa bits), often shortened as E4M3. One benefit of +the E4M3 format over E5M2 is that floating point numbers are represented in higher precision. However, the small dynamic range of +FP8 E4M3 (±240.0 can be represented) typically necessitates the use of a higher-precision (typically FP32) scaling factor alongside +each quantized tensor. For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling +factors of a finer granularity (e.g. per-channel). + +These scaling factors can be specified by passing an optional quantization param JSON to the LLM engine at load time. If +this JSON is not specified, scaling factors default to 1.0. These scaling factors are typically obtained when running an +unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO). + +To install AMMO (AlgorithMic Model Optimization): + +.. code-block:: console + + $ pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo + +Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon +offerings e.g. AMD MI300, NVIDIA Hopper or later support native hardware conversion to and from fp32, fp16, bf16, etc. +Thus, LLM inference is greatly accelerated with minimal accuracy loss. + + +Here is an example of how to enable this feature: + +.. code-block:: python + + # two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to + # https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own. 
+ + from vllm import LLM, SamplingParams + sampling_params = SamplingParams(temperature=1.3, top_p=0.8) + llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", + kv_cache_dtype="fp8", + quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json") + prompt = "London is the capital of" + out = llm.generate(prompt, sampling_params)[0].outputs[0].text + print(out) + + # output w/ scaling factors: England, the United Kingdom, and one of the world's leading financial, + # output w/o scaling factors: England, located in the southeastern part of the country. It is known + diff --git a/vllm-v0.6.2/docs/source/quantization/fp8_e5m2_kvcache.rst b/vllm-v0.6.2/docs/source/quantization/fp8_e5m2_kvcache.rst new file mode 100644 index 0000000..9ae07bc --- /dev/null +++ b/vllm-v0.6.2/docs/source/quantization/fp8_e5m2_kvcache.rst @@ -0,0 +1,34 @@ +.. _fp8_kv_cache: + +FP8 E5M2 KV Cache +================== + +The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. +The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bfloat16 and fp8 to each other. + +Here is an example of how to enable this feature: + +.. code-block:: python + + from vllm import LLM, SamplingParams + # Sample prompts. + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + # Create an LLM. + llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8") + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + # Print the outputs.
+ for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + diff --git a/vllm-v0.6.2/docs/source/quantization/gguf.rst b/vllm-v0.6.2/docs/source/quantization/gguf.rst new file mode 100644 index 0000000..9f00dc5 --- /dev/null +++ b/vllm-v0.6.2/docs/source/quantization/gguf.rst @@ -0,0 +1,73 @@ +.. _gguf: + +GGUF +================== + +.. warning:: + + Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team. + +.. warning:: + + Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use `gguf-split `_ tool to merge them to a single-file model. + +To run a GGUF model with vLLM, you can download and use the local GGUF model from `TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF `_ with the following command: + +.. code-block:: console + + $ wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf + $ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. + $ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 + +You can also add ``--tensor-parallel-size 2`` to enable tensor parallelism inference with 2 GPUs: + +.. code-block:: console + + $ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. + $ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 + +.. warning:: + + We recommend using the tokenizer from base model instead of GGUF model. 
Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size. + +You can also use the GGUF model directly through the LLM entrypoint: + +.. code-block:: python + + from vllm import LLM, SamplingParams + + # In this script, we demonstrate how to pass input to the chat method: + conversation = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "content": "Hello! How can I assist you today?" + }, + { + "role": "user", + "content": "Write an essay about the importance of higher education.", + }, + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + # Create an LLM. + llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", + tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0") + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.chat(conversation, sampling_params) + + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/vllm-v0.6.2/docs/source/quantization/int8.rst b/vllm-v0.6.2/docs/source/quantization/int8.rst new file mode 100644 index 0000000..04fa308 --- /dev/null +++ b/vllm-v0.6.2/docs/source/quantization/int8.rst @@ -0,0 +1,145 @@ +.. _int8: + +INT8 W8A8 +================== + +vLLM supports quantizing weights and activations to INT8 for memory savings and inference acceleration. +This quantization method is particularly useful for reducing model size while maintaining good performance. + +Please visit the HF collection of `quantized INT8 checkpoints of popular LLMs ready to use with vLLM `_. + +.. 
note:: + + INT8 computation is supported on NVIDIA GPUs with compute capability >= 7.5 (Turing, Ampere, Ada Lovelace, Hopper). + +Prerequisites +------------- + +To use INT8 quantization with vLLM, you'll need to install the `llm-compressor `_ library: + +.. code-block:: console + + $ pip install llmcompressor==0.1.0 + +Quantization Process +-------------------- + +The quantization process involves four main steps: + +1. Loading the model +2. Preparing calibration data +3. Applying quantization +4. Evaluating accuracy in vLLM + +1. Loading the Model +^^^^^^^^^^^^^^^^^^^^ + +Use ``SparseAutoModelForCausalLM``, which wraps ``AutoModelForCausalLM``, for saving and loading quantized models: + +.. code-block:: python + + from llmcompressor.transformers import SparseAutoModelForCausalLM + from transformers import AutoTokenizer + + MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" + model = SparseAutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto", + ) + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +2. Preparing Calibration Data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When quantizing activations to INT8, you need sample data to estimate the activation scales. +It's best to use calibration data that closely matches your deployment data. +For a general-purpose instruction-tuned model, you can use a dataset like ``ultrachat``: + +.. 
code-block:: python + + from datasets import load_dataset + + NUM_CALIBRATION_SAMPLES = 512 + MAX_SEQUENCE_LENGTH = 2048 + + # Load and preprocess the dataset + ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") + ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) + + def preprocess(example): + return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} + ds = ds.map(preprocess) + + def tokenize(sample): + return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) + ds = ds.map(tokenize, remove_columns=ds.column_names) + +3. Applying Quantization +^^^^^^^^^^^^^^^^^^^^^^^^ + +Now, apply the quantization algorithms: + +.. code-block:: python + + from llmcompressor.transformers import oneshot + from llmcompressor.modifiers.quantization import GPTQModifier + from llmcompressor.modifiers.smoothquant import SmoothQuantModifier + + # Configure the quantization algorithms + recipe = [ + SmoothQuantModifier(smoothing_strength=0.8), + GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), + ] + + # Apply quantization + oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + ) + + # Save the compressed model + SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" + model.save_pretrained(SAVE_DIR, save_compressed=True) + tokenizer.save_pretrained(SAVE_DIR) + +This process creates a W8A8 model with weights and activations quantized to 8-bit integers. + +4. Evaluating Accuracy +^^^^^^^^^^^^^^^^^^^^^^ + +After quantization, you can load and run the model in vLLM: + +.. code-block:: python + + from vllm import LLM + model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token") + +To evaluate accuracy, you can use ``lm_eval``: + +.. 
code-block:: console + + $ lm_eval --model vllm \ + --model_args pretrained="./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token",add_bos_token=true \ + --tasks gsm8k \ + --num_fewshot 5 \ + --limit 250 \ + --batch_size 'auto' + +.. note:: + + Quantized models can be sensitive to the presence of the ``bos`` token. Make sure to include the ``add_bos_token=True`` argument when running evaluations. + +Best Practices +-------------- + +- Start with 512 samples for calibration data (increase if accuracy drops) +- Use a sequence length of 2048 as a starting point +- Employ the chat template or instruction template that the model was trained with +- If you've fine-tuned a model, consider using a sample of your training data for calibration + +Troubleshooting and Support +--------------------------- + +If you encounter any issues or have feature requests, please open an issue on the ``vllm-project/llm-compressor`` GitHub repository. \ No newline at end of file diff --git a/vllm-v0.6.2/docs/source/quantization/supported_hardware.rst b/vllm-v0.6.2/docs/source/quantization/supported_hardware.rst new file mode 100644 index 0000000..9bf0cdb --- /dev/null +++ b/vllm-v0.6.2/docs/source/quantization/supported_hardware.rst @@ -0,0 +1,132 @@ +.. _supported_hardware_for_quantization: + +Supported Hardware for Quantization Kernels +=========================================== + +The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: + +.. 
list-table:: + :header-rows: 1 + :widths: 20 8 8 8 8 8 8 8 8 8 8 + + * - Implementation + - Volta + - Turing + - Ampere + - Ada + - Hopper + - AMD GPU + - Intel GPU + - x86 CPU + - AWS Inferentia + - Google TPU + * - AWQ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✅︎ + - ✗ + - ✗ + * - GPTQ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - Marlin (GPTQ/AWQ/FP8) + - ✗ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - INT8 (W8A8) + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✅︎ + - ✗ + - ✗ + * - FP8 (W8A8) + - ✗ + - ✗ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + * - AQLM + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - bitsandbytes + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - DeepSpeedFP + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - GGUF + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + +Notes: +^^^^^^ + +- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. +- "✅︎" indicates that the quantization method is supported on the specified hardware. +- "✗" indicates that the quantization method is not supported on the specified hardware. + +Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. + +For the most up-to-date information on hardware support and quantization methods, please check the `quantization directory `_ or consult with the vLLM development team. \ No newline at end of file diff --git a/vllm-v0.6.2/docs/source/serving/compatibility_matrix.rst b/vllm-v0.6.2/docs/source/serving/compatibility_matrix.rst new file mode 100644 index 0000000..f629b3c --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/compatibility_matrix.rst @@ -0,0 +1,427 @@ +.. 
_compatibility_matrix: + +Compatibility Matrix +==================== + +The tables below show mutually exclusive features and the support on some hardware. + +.. note:: + + Check the '✗' with links to see tracking issue for unsupported feature/hardware combination. + +Feature x Feature +----------------- + + +.. raw:: html + + + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - Feature + - :ref:`CP ` + - :ref:`APC ` + - :ref:`LoRA ` + - :abbr:`prmpt adptr (Prompt Adapter)` + - :ref:`SD ` + - CUDA graph + - :abbr:`enc-dec (Encoder-Decoder Models)` + - :abbr:`logP (Logprobs)` + - :abbr:`prmpt logP (Prompt Logprobs)` + - :abbr:`async output (Async Output Processing)` + - multi-step + - :abbr:`MM (Multimodal)` + - best-of + - beam-search + - :abbr:`guided dec (Guided Decoding)` + * - :ref:`CP ` + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + * - :ref:`APC ` + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + - + - + * - :ref:`LoRA ` + - `✗ `__ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + - + * - :abbr:`prmpt adptr (Prompt Adapter)` + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + * - :ref:`SD ` + - ✗ + - ✅ + - ✗ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + * - CUDA graph + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + - + - + - + * - :abbr:`enc-dec (Encoder-Decoder Models)` + - ✗ + - `✗ `__ + - ✗ + - ✗ + - `✗ `__ + - ✅ + - + - + - + - + - + - + - + - + - + * - :abbr:`logP (Logprobs)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + - + * - :abbr:`prmpt logP (Prompt Logprobs)` + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + * - :abbr:`async output (Async Output Processing)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + - ✗ + - ✅ + - ✅ + - + - + - + - + - + - + * - multi-step + - ✗ + - ✅ + - ✗ + - ✅ + - ✗ + - ✅ + - ✗ + - ✅ + - `✗ `__ + - ✅ + - + - + - + - + - + * - :abbr:`MM (Multimodal)` + - `✗ `__ + - `✗ `__ + - `✗ `__ + - ? + - ? 
+ - ✅ + - ✗ + - ✅ + - ✅ + - ✅ + - ? + - + - + - + - + * - best-of + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + - ✅ + - ✅ + - ✅ + - ? + - `✗ `__ + - ✅ + - + - + - + * - beam-search + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + - ✅ + - ✅ + - ✅ + - ? + - `✗ `__ + - ? + - ✅ + - + - + * - :abbr:`guided dec (Guided Decoding)` + - ✅ + - ✅ + - ? + - ? + - ✅ + - ✅ + - ? + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ? + - ✅ + - ✅ + - + + +Feature x Hardware +^^^^^^^^^^^^^^^^^^ + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - Feature + - Volta + - Turing + - Ampere + - Ada + - Hopper + - CPU + - AMD + * - :ref:`CP ` + - `✗ `__ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + * - :ref:`APC ` + - `✗ `__ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + * - :ref:`LoRA ` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + * - :abbr:`prmpt adptr (Prompt Adapter)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + * - :ref:`SD ` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - CUDA graph + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + * - :abbr:`enc-dec (Encoder-Decoder Models)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + * - :abbr:`logP (Logprobs)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - :abbr:`prmpt logP (Prompt Logprobs)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - :abbr:`async output (Async Output Processing)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✗ + * - multi-step + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + * - :abbr:`MM (Multimodal)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - best-of + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - beam-search + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - :abbr:`guided dec (Guided Decoding)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ diff --git a/vllm-v0.6.2/docs/source/serving/deploying_with_bentoml.rst b/vllm-v0.6.2/docs/source/serving/deploying_with_bentoml.rst new file mode 100644 index 0000000..4b9d19f --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/deploying_with_bentoml.rst @@ -0,0 +1,8 @@ +.. 
_deploying_with_bentoml: + +Deploying with BentoML +====================== + +`BentoML `_ allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes. + +For details, see the tutorial `vLLM inference in the BentoML documentation `_. \ No newline at end of file diff --git a/vllm-v0.6.2/docs/source/serving/deploying_with_cerebrium.rst b/vllm-v0.6.2/docs/source/serving/deploying_with_cerebrium.rst new file mode 100644 index 0000000..9585b6e --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/deploying_with_cerebrium.rst @@ -0,0 +1,112 @@ +.. _deploying_with_cerebrium: + +Deploying with Cerebrium +============================ + +.. raw:: html + +

+ vLLM_plus_cerebrium +

+ +vLLM can be run on a cloud based GPU machine with `Cerebrium `__, a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications. + +To install the Cerebrium client, run: + +.. code-block:: console + + $ pip install cerebrium + $ cerebrium login + +Next, to create your Cerebrium project, run: + +.. code-block:: console + + $ cerebrium init vllm-project + +Next, to install the required packages, add the following to your cerebrium.toml: + +.. code-block:: toml + + [cerebrium.deployment] + docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04" + + [cerebrium.dependencies.pip] + vllm = "latest" + +Next, let us add our code to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example). Add the following code to your `main.py`: + +.. code-block:: python + + from vllm import LLM, SamplingParams + + llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1") + + def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): + + sampling_params = SamplingParams(temperature=temperature, top_p=top_p) + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. + results = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + results.append({"prompt": prompt, "generated_text": generated_text}) + + return {"results": results} + + +Then, run the following code to deploy it to the cloud: + +.. code-block:: console + + $ cerebrium deploy + +If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case /run). + +.. 
code-block:: python + + curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ + -H 'Content-Type: application/json' \ + -H 'Authorization: ' \ + --data '{ + "prompts": [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is" + ] + }' + +You should get a response like: + +.. code-block:: python + + { + "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", + "result": { + "result": [ + { + "prompt": "Hello, my name is", + "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of" + }, + { + "prompt": "The president of the United States is", + "generated_text": " elected every four years. This is a democratic system.\n\n5. What" + }, + { + "prompt": "The capital of France is", + "generated_text": " Paris.\n" + }, + { + "prompt": "The future of AI is", + "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective." + } + ] + }, + "run_time_ms": 152.53663063049316 + } + +You now have an autoscaling endpoint where you only pay for the compute you use! + diff --git a/vllm-v0.6.2/docs/source/serving/deploying_with_docker.rst b/vllm-v0.6.2/docs/source/serving/deploying_with_docker.rst new file mode 100644 index 0000000..14d94b0 --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/deploying_with_docker.rst @@ -0,0 +1,53 @@ +.. _deploying_with_docker: + +Deploying with Docker +============================ + +vLLM offers an official Docker image for deployment. +The image can be used to run OpenAI compatible server and is available on Docker Hub as `vllm/vllm-openai `_. + +.. code-block:: console + + $ docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 8000:8000 \ + --ipc=host \ + vllm/vllm-openai:latest \ + --model mistralai/Mistral-7B-v0.1 + + +.. 
note:: + + You can either use the ``--ipc=host`` flag or ``--shm-size`` flag to allow the + container to access the host's shared memory. vLLM uses PyTorch, which uses shared + memory to share data between processes under the hood, particularly for tensor parallel inference. + + +You can build and run vLLM from source via the provided `Dockerfile `_. To build vLLM: + +.. code-block:: console + + $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai # optionally specify: --build-arg max_jobs=8 --build-arg nvcc_threads=2 + + +.. note:: + + By default vLLM will build for all GPU types for widest distribution. If you are just building for the + current GPU type the machine is running on, you can add the argument ``--build-arg torch_cuda_arch_list=""`` + for vLLM to find the current GPU type and build for that. + + +To run vLLM: + +.. code-block:: console + + $ docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + -p 8000:8000 \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + vllm/vllm-openai + +.. note:: + + **For `v0.4.1` and `v0.4.2` only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. ``/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable ``VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` . diff --git a/vllm-v0.6.2/docs/source/serving/deploying_with_dstack.rst b/vllm-v0.6.2/docs/source/serving/deploying_with_dstack.rst new file mode 100644 index 0000000..e1eb45b --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/deploying_with_dstack.rst @@ -0,0 +1,103 @@ +.. 
_deploying_with_dstack: + +Deploying with dstack +============================ + +.. raw:: html + +

+ vLLM_plus_dstack +

+ +vLLM can be run on a cloud based GPU machine with `dstack `__, an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment. + +To install the dstack client, run: + +.. code-block:: console + + $ pip install "dstack[all]" + $ dstack server + +Next, to configure your dstack project, run: + +.. code-block:: console + + $ mkdir -p vllm-dstack + $ cd vllm-dstack + $ dstack init + +Next, to provision a VM instance with the LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`: + +.. code-block:: yaml + + type: service + + python: "3.11" + env: + - MODEL=NousResearch/Llama-2-7b-chat-hf + port: 8000 + resources: + gpu: 24GB + commands: + - pip install vllm + - vllm serve $MODEL --port 8000 + model: + format: openai + type: chat + name: NousResearch/Llama-2-7b-chat-hf + +Then, run the following CLI for provisioning: + +.. code-block:: console + + $ dstack run . -f serve.dstack.yml + + ⠸ Getting run plan... + Configuration serve.dstack.yml + Project deep-diver-main + User deep-diver + Min resources 2..xCPU, 8GB.., 1xGPU (24GB) + Max price - + Max duration - + Spot policy auto + Retry policy no + + # BACKEND REGION INSTANCE RESOURCES SPOT PRICE + 1 gcp us-central1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 + 2 gcp us-east1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 + 3 gcp us-west1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 + ... + Shown 3 of 193 offers, $5.876 max + + Continue? [y/n]: y + ⠙ Submitting run... + ⠏ Launching spicy-treefrog-1 (pulling) + spicy-treefrog-1 provisioning completed (running) + Service is published at ... + +After the provisioning, you can interact with the model by using the OpenAI SDK: + +.. 
code-block:: python + + from openai import OpenAI + + client = OpenAI( + base_url="https://gateway.", + api_key="" + ) + + completion = client.chat.completions.create( + model="NousResearch/Llama-2-7b-chat-hf", + messages=[ + { + "role": "user", + "content": "Compose a poem that explains the concept of recursion in programming.", + } + ] + ) + + print(completion.choices[0].message.content) + +.. note:: + + dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out `this repository `__ diff --git a/vllm-v0.6.2/docs/source/serving/deploying_with_k8s.rst b/vllm-v0.6.2/docs/source/serving/deploying_with_k8s.rst new file mode 100644 index 0000000..7dc076d --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/deploying_with_k8s.rst @@ -0,0 +1,175 @@ +.. _deploying_with_k8s: + +Deploying with Kubernetes +========================== + +Using Kubernetes to deploy vLLM is a scalable and efficient way to serve machine learning models. This guide will walk you through the process of deploying vLLM with Kubernetes, including the necessary prerequisites, steps for deployment, and testing. + +Prerequisites +------------- +Before you begin, ensure that you have the following: + +- A running Kubernetes cluster +- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at `https://github.com/NVIDIA/k8s-device-plugin/` +- Available GPU resources in your cluster + +Deployment Steps +---------------- + +1. **Create a PVC , Secret and Deployment for vLLM** + + +PVC is used to store the model cache and it is optional, you can use hostPath or other storage options + +.. 
code-block:: yaml + + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: mistral-7b + namespace: default + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: default + volumeMode: Filesystem + +Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models + +.. code-block:: yaml + + apiVersion: v1 + kind: Secret + metadata: + name: hf-token-secret + namespace: default + type: Opaque + data: + token: "REPLACE_WITH_TOKEN" + + +Create a deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model: + +.. code-block:: yaml + + apiVersion: apps/v1 + kind: Deployment + metadata: + name: mistral-7b + namespace: default + labels: + app: mistral-7b + spec: + replicas: 1 + selector: + matchLabels: + app: mistral-7b + template: + metadata: + labels: + app: mistral-7b + spec: + volumes: + - name: cache-volume + persistentVolumeClaim: + claimName: mistral-7b + # vLLM needs to access the host's shared memory for tensor parallel inference. 
+ - name: shm + emptyDir: + medium: Memory + sizeLimit: "2Gi" + containers: + - name: mistral-7b + image: vllm/vllm-openai:latest + command: ["/bin/sh", "-c"] + args: [ + "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" + ] + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + ports: + - containerPort: 8000 + resources: + limits: + cpu: "10" + memory: 20G + nvidia.com/gpu: "1" + requests: + cpu: "2" + memory: 6G + nvidia.com/gpu: "1" + volumeMounts: + - mountPath: /root/.cache/huggingface + name: cache-volume + - name: shm + mountPath: /dev/shm + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 5 + +2. **Create a Kubernetes Service for vLLM** + +Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: + +.. code-block:: yaml + + apiVersion: v1 + kind: Service + metadata: + name: mistral-7b + namespace: default + spec: + ports: + - name: http-mistral-7b + port: 80 + protocol: TCP + targetPort: 8000 + # The label selector should match the deployment labels & it is useful for prefix caching feature + selector: + app: mistral-7b + sessionAffinity: None + type: ClusterIP + +3. **Deploy and Test** + +Apply the deployment and service configurations using ``kubectl apply -f <filename>``: + +.. code-block:: console + + kubectl apply -f deployment.yaml + kubectl apply -f service.yaml + +To test the deployment, run the following ``curl`` command: + +.. code-block:: console + + curl http://mistral-7b.default.svc.cluster.local/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "mistralai/Mistral-7B-Instruct-v0.3", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 + }' + +If the service is correctly deployed, you should receive a response from the vLLM model.
+ +Conclusion +---------- +Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. If you encounter any issues or have suggestions, please feel free to contribute to the documentation. \ No newline at end of file diff --git a/vllm-v0.6.2/docs/source/serving/deploying_with_kserve.rst b/vllm-v0.6.2/docs/source/serving/deploying_with_kserve.rst new file mode 100644 index 0000000..01d7ccc --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/deploying_with_kserve.rst @@ -0,0 +1,8 @@ +.. _deploying_with_kserve: + +Deploying with KServe +============================ + +vLLM can be deployed with `KServe `_ on Kubernetes for highly scalable distributed model serving. + +Please see `this guide `_ for more details on using vLLM with KServe. diff --git a/vllm-v0.6.2/docs/source/serving/deploying_with_lws.rst b/vllm-v0.6.2/docs/source/serving/deploying_with_lws.rst new file mode 100644 index 0000000..b63a432 --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/deploying_with_lws.rst @@ -0,0 +1,12 @@ +.. _deploying_with_lws: + +Deploying with LWS +============================ + +LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads. +A major use case is for multi-host/multi-node distributed inference. + +vLLM can be deployed with `LWS `_ on Kubernetes for distributed model serving. + +Please see `this guide `_ for more details on +deploying vLLM on Kubernetes using LWS. diff --git a/vllm-v0.6.2/docs/source/serving/deploying_with_nginx.rst b/vllm-v0.6.2/docs/source/serving/deploying_with_nginx.rst new file mode 100644 index 0000000..b5dff02 --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/deploying_with_nginx.rst @@ -0,0 +1,142 @@ +.. 
_nginxloadbalancer: + +Deploying with Nginx Loadbalancer +================================= + +This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers. + +Table of contents: + +#. :ref:`Build Nginx Container <nginxloadbalancer_nginx_build>` +#. :ref:`Create Simple Nginx Config file <nginxloadbalancer_nginx_conf>` +#. :ref:`Build vLLM Container <nginxloadbalancer_nginx_vllm_container>` +#. :ref:`Create Docker Network <nginxloadbalancer_nginx_docker_network>` +#. :ref:`Launch vLLM Containers <nginxloadbalancer_nginx_launch_container>` +#. :ref:`Launch Nginx <nginxloadbalancer_nginx_launch_nginx>` +#. :ref:`Verify That vLLM Servers Are Ready <nginxloadbalancer_nginx_verify_nginx>` + +.. _nginxloadbalancer_nginx_build: + +Build Nginx Container +--------------------- + +This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory. + +.. code-block:: console + + export vllm_root=`pwd` + +Create a file named ``Dockerfile.nginx``: + +.. code-block:: console + + FROM nginx:latest + RUN rm /etc/nginx/conf.d/default.conf + EXPOSE 80 + CMD ["nginx", "-g", "daemon off;"] + +Build the container: + +.. code-block:: console + + docker build . -f Dockerfile.nginx --tag nginx-lb + +.. _nginxloadbalancer_nginx_conf: + +Create Simple Nginx Config file +------------------------------- + +Create a file named ``nginx_conf/nginx.conf``. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another ``server vllmN:8000 max_fails=3 fail_timeout=10000s;`` entry to ``upstream backend``. + +.. code-block:: console + + upstream backend { + least_conn; + server vllm0:8000 max_fails=3 fail_timeout=10000s; + server vllm1:8000 max_fails=3 fail_timeout=10000s; + } + server { + listen 80; + location / { + proxy_pass http://backend; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + } + +.. _nginxloadbalancer_nginx_vllm_container: + +Build vLLM Container +-------------------- + +..
code-block:: console + + cd $vllm_root + docker build -f Dockerfile . --tag vllm + + +If you are behind a proxy, you can pass the proxy settings to the docker build command as shown below: + +.. code-block:: console + + cd $vllm_root + docker build -f Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy + +.. _nginxloadbalancer_nginx_docker_network: + +Create Docker Network +--------------------- + +.. code-block:: console + + docker network create vllm_nginx + + +.. _nginxloadbalancer_nginx_launch_container: + +Launch vLLM Containers +---------------------- + +Notes: + +* If you have your HuggingFace models cached somewhere else, update ``hf_cache_dir`` below. +* If you don't have an existing HuggingFace cache you will want to start ``vllm0`` and wait for the model to complete downloading and the server to be ready. This will ensure that ``vllm1`` can leverage the model you just downloaded and it won't have to be downloaded again. +* The example below assumes that the GPU backend is used. If you are using the CPU backend, remove ``--gpus all`` and add the ``VLLM_CPU_KVCACHE_SPACE`` and ``VLLM_CPU_OMP_THREADS_BIND`` environment variables to the docker run command. +* Adjust the model name that you want to use in your vLLM servers if you don't want to use ``Llama-2-7b-chat-hf``. + +.. code-block:: console + + mkdir -p ~/.cache/huggingface/hub/ + hf_cache_dir=~/.cache/huggingface/ + docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8081:8000 --name vllm0 vllm --model meta-llama/Llama-2-7b-chat-hf + docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf + +.. note:: + If you are behind a proxy, you can pass the proxy settings to the docker run command via ``-e http_proxy=$http_proxy -e https_proxy=$https_proxy``. + +..
_nginxloadbalancer_nginx_launch_nginx: + +Launch Nginx +------------ + +.. code-block:: console + + docker run -itd -p 8000:80 --network vllm_nginx -v ./nginx_conf/:/etc/nginx/conf.d/ --name nginx-lb nginx-lb:latest + +.. _nginxloadbalancer_nginx_verify_nginx: + +Verify That vLLM Servers Are Ready +---------------------------------- + +.. code-block:: console + + docker logs vllm0 | grep Uvicorn + docker logs vllm1 | grep Uvicorn + +Both outputs should look like this: + +.. code-block:: console + + INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) diff --git a/vllm-v0.6.2/docs/source/serving/deploying_with_triton.rst b/vllm-v0.6.2/docs/source/serving/deploying_with_triton.rst new file mode 100644 index 0000000..5ce7c3d --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/deploying_with_triton.rst @@ -0,0 +1,6 @@ +.. _deploying_with_triton: + +Deploying with NVIDIA Triton +============================ + +The `Triton Inference Server `_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m `_ model using vLLM. Please see `Deploying a vLLM model in Triton `_ for more details. diff --git a/vllm-v0.6.2/docs/source/serving/distributed_serving.rst b/vllm-v0.6.2/docs/source/serving/distributed_serving.rst new file mode 100644 index 0000000..4d57206 --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/distributed_serving.rst @@ -0,0 +1,107 @@ +.. _distributed_serving: + +Distributed Inference and Serving +================================= + +How to decide the distributed inference strategy? +------------------------------------------------- + +Before going into the details of distributed inference and serving, let's first make it clear when to use distributed inference and what are the strategies available. The common practice is: + +- **Single GPU (no distributed inference)**: If your model fits in a single GPU, you probably don't need to use distributed inference. Just use the single GPU to run the inference. 
+- **Single-Node Multi-GPU (tensor parallel inference)**: If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you have 4 GPUs in a single node, you can set the tensor parallel size to 4. +- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2. + +In short, you should increase the number of GPUs and the number of nodes until you have enough GPU memory to hold the model. The tensor parallel size should be the number of GPUs in each node, and the pipeline parallel size should be the number of nodes. + +After adding enough GPUs and nodes to hold the model, you can run vLLM first, which will print some logs like ``# GPU blocks: 790``. Multiply the number by ``16`` (the block size), and you can get roughly the maximum number of tokens that can be served on the current configuration. If this number is not satisfying, e.g. you want higher throughput, you can further increase the number of GPUs or nodes, until the number of blocks is enough. + +.. note:: + There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs. 
+ +Details for Distributed Inference and Serving +---------------------------------------------- + +vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm `_. We manage the distributed runtime with either `Ray `_ or Python native multiprocessing. Multiprocessing can be used when deploying on a single node; multi-node inference currently requires Ray. + +Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured :code:`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the :code:`LLM` class :code:`distributed_executor_backend` argument or :code:`--distributed-executor-backend` API server argument. Set it to :code:`mp` for multiprocessing or :code:`ray` for Ray. Ray does not need to be installed for the multiprocessing case. + +To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: + +.. code-block:: python + + from vllm import LLM + llm = LLM("facebook/opt-13b", tensor_parallel_size=4) + output = llm.generate("San Francisco is a") + +To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument when starting the server. For example, to run the API server on 4 GPUs: + +.. code-block:: console + + $ vllm serve facebook/opt-13b \ + $ --tensor-parallel-size 4 + +You can also additionally specify :code:`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run the API server on 8 GPUs with pipeline parallelism and tensor parallelism: + +..
code-block:: console + + $ vllm serve gpt2 \ + $ --tensor-parallel-size 4 \ + $ --pipeline-parallel-size 2 + +Multi-Node Inference and Serving +-------------------------------- + +If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path and the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration. + +The first step is to start containers and organize them into a cluster. We have provided a helper `script `_ to start the cluster. + +Pick a node as the head node, and run the following command: + +.. code-block:: console + + $ bash run_cluster.sh \ + $ vllm/vllm-openai \ + $ ip_of_head_node \ + $ --head \ + $ /path/to/the/huggingface/home/in/this/node + +On the rest of the worker nodes, run the following command: + +.. code-block:: console + + $ bash run_cluster.sh \ + $ vllm/vllm-openai \ + $ ip_of_head_node \ + $ --worker \ + $ /path/to/the/huggingface/home/in/this/node + +Then you get a Ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument ``ip_of_head_node`` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct. + +Then, on any node, use ``docker exec -it node /bin/bash`` to enter the container, and execute ``ray status`` to check the status of the Ray cluster. You should see the right number of nodes and GPUs. + +After that, on any node, you can use vLLM as usual, just as if you had all the GPUs on one node.
The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: + +.. code-block:: console + + $ vllm serve /path/to/the/model/in/the/container \ + $ --tensor-parallel-size 8 \ + $ --pipeline-parallel-size 2 + +You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 16: + +.. code-block:: console + + $ vllm serve /path/to/the/model/in/the/container \ + $ --tensor-parallel-size 16 + +To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like ``--privileged -e NCCL_IB_HCA=mlx5`` to the ``run_cluster.sh`` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with ``NCCL_DEBUG=TRACE`` environment variable set, e.g. ``NCCL_DEBUG=TRACE vllm serve ...`` and check the logs for the NCCL version and the network used. If you find ``[send] via NET/Socket`` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find ``[send] via NET/IB/GDRDMA`` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. + +.. warning:: + After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the `sanity check script `_ for more information. 
If you need to set some environment variables for the communication configuration, you can append them to the ``run_cluster.sh`` script, e.g. ``-e NCCL_SOCKET_IFNAME=eth0``. Note that setting environment variables in the shell (e.g. ``NCCL_SOCKET_IFNAME=eth0 vllm serve ...``) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See the `discussion `_ for more information. + +.. warning:: + + Please make sure you downloaded the model to all the nodes (with the same path), or the model is downloaded to some distributed file system that is accessible by all nodes. + + When you use huggingface repo id to refer to the model, you should append your huggingface token to the ``run_cluster.sh`` script, e.g. ``-e HF_TOKEN=``. The recommended way is to download the model first, and then use the path to refer to the model. diff --git a/vllm-v0.6.2/docs/source/serving/env_vars.rst b/vllm-v0.6.2/docs/source/serving/env_vars.rst new file mode 100644 index 0000000..ff2259c --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/env_vars.rst @@ -0,0 +1,14 @@ +Environment Variables +======================== + +vLLM uses the following environment variables to configure the system: + +.. warning:: + Please note that ``VLLM_PORT`` and ``VLLM_HOST_IP`` set the port and ip for vLLM's **internal usage**. It is not the port and ip for the API server. If you use ``--host $VLLM_HOST_IP`` and ``--port $VLLM_PORT`` to start the API server, it will not work. + + All environment variables used by vLLM are prefixed with ``VLLM_``. **Special care should be taken for Kubernetes users**: please do not name the service as ``vllm``, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because `Kubernetes sets environment variables for each service with the capitalized service name as the prefix `_. + +.. 
literalinclude:: ../../../vllm/envs.py + :language: python + :start-after: begin-env-vars-definition + :end-before: end-env-vars-definition diff --git a/vllm-v0.6.2/docs/source/serving/faq.rst b/vllm-v0.6.2/docs/source/serving/faq.rst new file mode 100644 index 0000000..9e858e6 --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/faq.rst @@ -0,0 +1,31 @@ +Frequently Asked Questions +=========================== + + Q: How can I serve multiple models on a single port using the OpenAI API? + +A: Assuming that you're referring to using the OpenAI-compatible server to serve multiple models at once, that is not currently supported. Instead, you can run multiple instances of the server (each serving a different model) at the same time, and have another layer to route the incoming request to the correct server accordingly. + +---------------------------------------- + + Q: Which model to use for offline inference embedding? + +A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. In contrast, models such as Llama-3-8b and Mistral-7B-Instruct-v0.3 are generation models rather than embedding models. + +---------------------------------------- + + Q: Can the output of a prompt vary across runs in vLLM? + +A: Yes, it can. vLLM does not guarantee stable log probabilities (logprobs) for the output tokens. Variations in logprobs may occur due to +numerical instability in Torch operations or non-deterministic behavior in batched Torch operations when batching changes. For more details, +see the `Numerical Accuracy section `_. + +In vLLM, the same requests might be batched differently due to factors such as other concurrent requests, +changes in batch size, or batch expansion in speculative decoding. These batching variations, combined with numerical instability of Torch operations, +can lead to slightly different logit/logprob values at each step. Such differences can accumulate, potentially resulting in +different tokens being sampled.
Once a different token is sampled, further divergence is likely. + +**Mitigation Strategies** + +- For improved stability and reduced variance, use `float32`. Note that this will require more memory. +- If using `bfloat16`, switching to `float16` can also help. +- Using request seeds can aid in achieving more stable generation for temperature > 0, but discrepancies due to precision differences may still occur. diff --git a/vllm-v0.6.2/docs/source/serving/integrations.rst b/vllm-v0.6.2/docs/source/serving/integrations.rst new file mode 100644 index 0000000..f39997e --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/integrations.rst @@ -0,0 +1,16 @@ +Integrations +------------ + +.. toctree:: + :maxdepth: 1 + + run_on_sky + deploying_with_kserve + deploying_with_triton + deploying_with_bentoml + deploying_with_cerebrium + deploying_with_lws + deploying_with_dstack + serving_with_langchain + serving_with_llamaindex + serving_with_llamastack diff --git a/vllm-v0.6.2/docs/source/serving/metrics.rst b/vllm-v0.6.2/docs/source/serving/metrics.rst new file mode 100644 index 0000000..15e57bd --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/metrics.rst @@ -0,0 +1,13 @@ +Production Metrics +================== + +vLLM exposes a number of metrics that can be used to monitor the health of the +system. These metrics are exposed via the `/metrics` endpoint on the vLLM +OpenAI compatible API server. + +The following metrics are exposed: + +.. 
literalinclude:: ../../../vllm/engine/metrics.py + :language: python + :start-after: begin-metrics-definitions + :end-before: end-metrics-definitions diff --git a/vllm-v0.6.2/docs/source/serving/openai_compatible_server.md b/vllm-v0.6.2/docs/source/serving/openai_compatible_server.md new file mode 100644 index 0000000..7896581 --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/openai_compatible_server.md @@ -0,0 +1,430 @@ +# OpenAI Compatible Server + +vLLM provides an HTTP server that implements OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API. + +You can start the server using Python, or using [Docker](deploying_with_docker.rst): +```bash +vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 +``` + +To call the server, you can use the official OpenAI Python client library, or any other HTTP client. +```python +from openai import OpenAI +client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="token-abc123", +) + +completion = client.chat.completions.create( + model="NousResearch/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": "Hello!"} + ] +) + +print(completion.choices[0].message) +``` + +## API Reference + +We currently support the following OpenAI APIs: + +- [Completions API](https://platform.openai.com/docs/api-reference/completions) + - *Note: `suffix` parameter is not supported.* +- [Chat Completions API](https://platform.openai.com/docs/api-reference/chat) + - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Using VLMs](../models/vlm.rst). + - *Note: `image_url.detail` parameter is not supported.* + - We also support `audio_url` content type for audio files. + - Refer to [vllm.entrypoints.chat_utils](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py) for the exact schema. 
+ - *TODO: Support `input_audio` content type as defined [here](https://github.com/openai/openai-python/blob/v1.52.2/src/openai/types/chat/chat_completion_content_part_input_audio_param.py).* + - *Note: `parallel_tool_calls` and `user` parameters are ignored.* +- [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) + - Instead of `inputs`, you can pass in a list of `messages` (same schema as Chat Completions API), + which will be treated as a single prompt to the model according to its chat template. + - This enables multi-modal inputs to be passed to embedding models, see [Using VLMs](../models/vlm.rst). + - *Note: You should run `vllm serve` with `--task embedding` to ensure that the model is being run in embedding mode.* + +## Extra Parameters + +vLLM supports a set of parameters that are not part of the OpenAI API. +In order to use them, you can pass them as extra parameters in the OpenAI client. +Or directly merge them into the JSON payload if you are using HTTP call directly. + +```python +completion = client.chat.completions.create( + model="NousResearch/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + ], + extra_body={ + "guided_choice": ["positive", "negative"] + } +) +``` + +### Extra HTTP Headers + +Only `X-Request-Id` HTTP request header is supported for now. 
+ +```python +completion = client.chat.completions.create( + model="NousResearch/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + ], + extra_headers={ + "x-request-id": "sentiment-classification-00001", + } +) +print(completion._request_id) + +completion = client.completions.create( + model="NousResearch/Meta-Llama-3-8B-Instruct", + prompt="A robot may not injure a human being", + extra_headers={ + "x-request-id": "completion-test", + } +) +print(completion._request_id) +``` + +### Extra Parameters for Completions API + +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-completion-sampling-params +:end-before: end-completion-sampling-params +``` + +The following extra parameters are supported: + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-completion-extra-params +:end-before: end-completion-extra-params +``` + +### Extra Parameters for Chat Completions API + +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-chat-completion-sampling-params +:end-before: end-chat-completion-sampling-params +``` + +The following extra parameters are supported: + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-chat-completion-extra-params +:end-before: end-chat-completion-extra-params +``` + +### Extra Parameters for Embeddings API + +The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported. 
+ +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-embedding-pooling-params +:end-before: end-embedding-pooling-params +``` + +The following extra parameters are supported: + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-embedding-extra-params +:end-before: end-embedding-extra-params +``` + +## Chat Template + +In order for the language model to support the chat protocol, vLLM requires the model to include +a chat template in its tokenizer configuration. The chat template is a Jinja2 template that +specifies how roles, messages, and other chat-specific tokens are encoded in the input. + +An example chat template for `NousResearch/Meta-Llama-3-8B-Instruct` can be found [here](https://github.com/meta-llama/llama3?tab=readme-ov-file#instruction-tuned-models) + +Some models do not provide a chat template even though they are instruction/chat fine-tuned. For those models, +you can manually specify their chat template in the `--chat-template` parameter with the file path to the chat +template, or the template in string form. Without a chat template, the server will not be able to process chat +and all chat requests will error. + +```bash +vllm serve <model> --chat-template ./path-to-chat-template.jinja +``` + +The vLLM community provides a set of chat templates for popular models. You can find them in the examples +directory [here](https://github.com/vllm-project/vllm/tree/main/examples/) + +With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies +both a `type` and a `text` field.
An example is provided below: +```python +completion = client.chat.completions.create( + model="NousResearch/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]} + ] +) +``` +Most chat templates for LLMs expect the `content` to be a `string` but there are some newer models like +`meta-llama/Llama-Guard-3-1B` that expect the content to be parsed with the new OpenAI spec. In order to choose which +format the content needs to be parsed in by vLLM, please use the `--chat-template-text-format` argument to specify +between `string` or `openai`. The default value is `string` and vLLM internally converts both spec formats to match +this, unless explicitly specified. + + +## Command line arguments for the server + +```{argparse} +:module: vllm.entrypoints.openai.cli_args +:func: create_parser_for_docs +:prog: vllm serve +``` + + +### Config file + +The `serve` module can also accept arguments from a config file in +`yaml` format. The arguments in the yaml must be specified using the +long form of the argument outlined [here](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server): + +For example: + +```yaml +# config.yaml + +host: "127.0.0.1" +port: 6379 +uvicorn-log-level: "info" +``` + +```bash +$ vllm serve SOME_MODEL --config config.yaml +``` +--- +**NOTE** +In case an argument is supplied simultaneously using command line and the config file, the value from the commandline will take precedence. +The order of priorities is `command line > config file values > defaults`. + +--- + +## Tool calling in the chat completion API +vLLM currently supports named function calling, as well as the `auto` and `none` options for the `tool_choice` field in the chat completion API. The `tool_choice` option `required` is **not yet supported** but on the roadmap. 
+ +It is the caller's responsibility to prompt the model with the tool information; vLLM will not automatically manipulate the prompt. +Please see below for recommended configuration and chat templates to use when function calling is to be used with the different models. + + +### Named Function Calling +vLLM supports named function calling in the chat completion API by default. It does so using Outlines, so this is +enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a +high-quality one. + +vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. + +To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and +specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. + + +### Automatic Function Calling +To enable this feature, you should set the following flags: +* `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. Tells vLLM that you want to enable the model to generate its own tool calls when it +deems appropriate. +* `--tool-call-parser` -- select the tool parser to use (listed below). Additional tool parsers +will continue to be added in the future, and you can also register your own tool parsers via `--tool-parser-plugin`. +* `--tool-parser-plugin` -- **optional** tool parser plugin used to register user-defined tool parsers into vLLM; the registered tool parser name can be specified in `--tool-call-parser`. +* `--chat-template` -- **optional** for auto tool choice. The path to the chat template which handles `tool`-role messages and `assistant`-role messages +that contain previously generated tool calls. Hermes, Mistral and Llama models have tool-compatible chat templates in their +`tokenizer_config.json` files, but you can specify a custom template. 
This argument can be set to `tool_use` if your model has a tool use-specific chat +template configured in the `tokenizer_config.json`. In this case, it will be used per the `transformers` specification. More on this [here](https://huggingface.co/docs/transformers/en/chat_templating#why-do-some-models-have-multiple-templates) +from HuggingFace; and you can find an example of this in a `tokenizer_config.json` [here](https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B/blob/main/tokenizer_config.json) + +If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template! + + +#### Hermes Models (`hermes`) + +All Nous Research Hermes-series models newer than Hermes 2 Pro should be supported. +* `NousResearch/Hermes-2-Pro-*` +* `NousResearch/Hermes-2-Theta-*` +* `NousResearch/Hermes-3-*` + + +_Note that the Hermes 2 **Theta** models are known to have degraded tool call quality & capabilities due to the merge +step in their creation_. + +Flags: `--tool-call-parser hermes` + + +#### Mistral Models (`mistral`) + +Supported models: +* `mistralai/Mistral-7B-Instruct-v0.3` (confirmed) +* Additional mistral function-calling models are compatible as well. + +Known issues: +1. Mistral 7B struggles to generate parallel tool calls correctly. +2. Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is +much shorter than what vLLM generates. 
Since an exception is thrown when this condition +is not met, the following additional chat templates are provided: + +* `examples/tool_chat_template_mistral.jinja` - this is the "official" Mistral chat template, but tweaked so that +it works with vLLM's tool call IDs (provided `tool_call_id` fields are truncated to the last 9 digits) +* `examples/tool_chat_template_mistral_parallel.jinja` - this is a "better" version that adds a tool-use system prompt +when tools are provided, that results in much better reliability when working with parallel tool calling. + + +Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` + + +#### Llama Models (`llama3_json`) + +Supported models: +* `meta-llama/Meta-Llama-3.1-8B-Instruct` +* `meta-llama/Meta-Llama-3.1-70B-Instruct` +* `meta-llama/Meta-Llama-3.1-405B-Instruct` +* `meta-llama/Meta-Llama-3.1-405B-Instruct-FP8` + +The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) in Llama-3.2 models, see the `pythonic` tool parser below. +Other tool calling formats like the built in python tool calling or custom tool calling are not supported. + +Known issues: +1. Parallel tool calls are not supported. +2. The model can generate parameters with a wrong format, such as generating + an array serialized as string instead of an array. + +The `tool_chat_template_llama3_json.jinja` file contains the "official" Llama chat template, but tweaked so that +it works better with vLLM. 
+ +Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool_chat_template_llama3_json.jinja` + +#### IBM Granite + +Supported models: +* `ibm-granite/granite-3.0-8b-instruct` + +Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja` + +`examples/tool_chat_template_granite.jinja`: this is a modified chat template from the original on Huggingface. Parallel function calls are supported. + +* `ibm-granite/granite-20b-functioncalling` + +Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja` + +`examples/tool_chat_template_granite_20b_fc.jinja`: this is a modified chat template from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported. + + +#### InternLM Models (`internlm`) + +Supported models: +* `internlm/internlm2_5-7b-chat` (confirmed) +* Additional internlm2.5 function-calling models are compatible as well + +Known issues: +* Although this implementation also supports InternLM2, the tool call results are not stable when testing with the `internlm/internlm2-chat-7b` model. + +Recommended flags: `--tool-call-parser internlm --chat-template examples/tool_chat_template_internlm2_tool.jinja` + + +#### Jamba Models (`jamba`) +AI21's Jamba-1.5 models are supported. +* `ai21labs/AI21-Jamba-1.5-Mini` +* `ai21labs/AI21-Jamba-1.5-Large` + + +Flags: `--tool-call-parser jamba` + + +#### Models with Pythonic Tool Calls (`pythonic`) + +A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models. 
+ +As a concrete example, these models may look up the weather in San Francisco and Seattle by generating: +```python +[get_weather(city='San Francisco', metric='celsius'), get_weather(city='Seattle', metric='celsius')] +``` + +Limitations: +* The model must not generate both text and tool calls in the same generation. This may not be hard to change for a specific model, but the community currently lacks consensus on which tokens to emit when starting and ending tool calls. (In particular, the Llama 3.2 models emit no such tokens.) +* Llama's smaller models struggle to use tools effectively. + +Example supported models: +* `meta-llama/Llama-3.2-1B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) +* `meta-llama/Llama-3.2-3B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) +* `Team-ACE/ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) +* `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) + +Flags: `--tool-call-parser pythonic --chat-template {see_above}` + +--- +**WARNING** +Llama's smaller models frequently fail to emit tool calls in the correct format. Your mileage may vary. + +--- + + +### How to write a tool parser plugin + +A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py. + +Here is a summary of a plugin file: + +```python + +# import the required packages + +# define a tool parser and register it to vllm +# the name list in register_module can be used +# in --tool-call-parser. you can define as many +# tool parsers as you want here. +@ToolParserManager.register_module(["example"]) +class ExampleToolParser(ToolParser): + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + # adjust request. e.g.: set skip special tokens + # to False for tool call output. 
+ def adjust_request( + self, request: ChatCompletionRequest) -> ChatCompletionRequest: + return request + + # implement the tool call parse for stream call + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> Union[DeltaMessage, None]: + return delta + + # implement the tool parse for non-stream call + def extract_tool_calls( + self, + model_output: str, + request: ChatCompletionRequest, + ) -> ExtractedToolCallInformation: + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=text) + + +``` + +Then you can use this plugin in the command line like this. +``` + --enable-auto-tool-choice \ + --tool-parser-plugin + --tool-call-parser example \ + --chat-template \ +``` + diff --git a/vllm-v0.6.2/docs/source/serving/run_on_sky.rst b/vllm-v0.6.2/docs/source/serving/run_on_sky.rst new file mode 100644 index 0000000..227e6fd --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/run_on_sky.rst @@ -0,0 +1,366 @@ +.. _on_cloud: + +Deploying and scaling up with SkyPilot +================================================ + +.. raw:: html + +

+ vLLM +

+ +vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with `SkyPilot `__, an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in `SkyPilot AI gallery `__. + + +Prerequisites +------------- + +- Go to the `HuggingFace model page `__ and request access to the model :code:`meta-llama/Meta-Llama-3-8B-Instruct`. +- Check that you have installed SkyPilot (`docs `__). +- Check that :code:`sky check` shows clouds or Kubernetes are enabled. + +.. code-block:: console + + pip install skypilot-nightly + sky check + + +Run on a single instance +------------------------ + +See the vLLM SkyPilot YAML for serving, `serving.yaml `__. + +.. code-block:: yaml + + resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + + run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log & + + echo 'Waiting for vllm api server to start...' + while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done + + echo 'Starting gradio server...' 
+ git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://localhost:8081/v1 \ + --stop-token-ids 128009,128001 + +Start serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...): + +.. code-block:: console + + HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN + +Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion. + +.. code-block:: console + + (task, pid=7431) Running on public URL: https://.gradio.live + +**Optional**: Serve the 70B model instead of the default 8B and use more GPUs: + +.. code-block:: console + + HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct + + +Scale up to multiple replicas +----------------------------- + +SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a service section to the YAML file. + +.. code-block:: yaml + + service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_completion_tokens: 1 + +.. raw:: html + +
+ Click to see the full recipe YAML + + +.. code-block:: yaml + + service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_completion_tokens: 1 + + resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + + run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log + +.. raw:: html + +
+ +Start serving the Llama-3 8B model on multiple replicas: + +.. code-block:: console + + HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN + + +Wait until the service is ready: + +.. code-block:: console + + watch -n10 sky serve status vllm + + +.. raw:: html + +
+ Example outputs: + +.. code-block:: console + + Services + NAME VERSION UPTIME STATUS REPLICAS ENDPOINT + vllm 1 35s READY 2/2 xx.yy.zz.100:30001 + + Service Replicas + SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION + vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 + vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 + +.. raw:: html + +
+ +After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: + +.. code-block:: console + + ENDPOINT=$(sky serve status --endpoint 8081 vllm) + curl -L http://$ENDPOINT/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Who are you?" + } + ], + "stop_token_ids": [128009, 128001] + }' + +To enable autoscaling, you could replace the `replicas` with the following configs in `service`: + +.. code-block:: yaml + + service: + replica_policy: + min_replicas: 2 + max_replicas: 4 + target_qps_per_replica: 2 + +This will scale the service up when the QPS exceeds 2 for each replica. + + +.. raw:: html + +
+ Click to see the full recipe YAML + + +.. code-block:: yaml + + service: + replica_policy: + min_replicas: 2 + max_replicas: 4 + target_qps_per_replica: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_completion_tokens: 1 + + resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + + run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log + + +.. raw:: html + +
+ +To update the service with the new config: + +.. code-block:: console + + HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN + + +To stop the service: + +.. code-block:: console + + sky serve down vllm + + +**Optional**: Connect a GUI to the endpoint +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + +It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests sent to the GUI will be load-balanced across replicas. + +.. raw:: html + +
+ Click to see the full GUI YAML + +.. code-block:: yaml + + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. + + resources: + cpus: 2 + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + # Install Gradio for web UI. + pip install gradio openai + + run: | + conda activate vllm + export PATH=$PATH:/sbin + + echo 'Starting gradio server...' + git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://$ENDPOINT/v1 \ + --stop-token-ids 128009,128001 | tee ~/gradio.log + + +.. raw:: html + +
+ +1. Start the chat web UI: + +.. code-block:: console + + sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) + + +2. Then, we can access the GUI at the returned gradio link: + +.. code-block:: console + + | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live + + diff --git a/vllm-v0.6.2/docs/source/serving/serving_with_langchain.rst b/vllm-v0.6.2/docs/source/serving/serving_with_langchain.rst new file mode 100644 index 0000000..6440c8a --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/serving_with_langchain.rst @@ -0,0 +1,31 @@ +.. _run_on_langchain: + +Serving with Langchain +============================ + +vLLM is also available via `Langchain `_ . + +To install langchain, run + +.. code-block:: console + + $ pip install langchain langchain_community -q + +To run inference on a single or multiple GPUs, use ``VLLM`` class from ``langchain``. + +.. code-block:: python + + from langchain_community.llms import VLLM + + llm = VLLM(model="mosaicml/mpt-7b", + trust_remote_code=True, # mandatory for hf models + max_new_tokens=128, + top_k=10, + top_p=0.95, + temperature=0.8, + # tensor_parallel_size=... # for distributed inference + ) + + print(llm("What is the capital of France ?")) + +Please refer to this `Tutorial `_ for more details. diff --git a/vllm-v0.6.2/docs/source/serving/serving_with_llamaindex.rst b/vllm-v0.6.2/docs/source/serving/serving_with_llamaindex.rst new file mode 100644 index 0000000..038e961 --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/serving_with_llamaindex.rst @@ -0,0 +1,27 @@ +.. _run_on_llamaindex: + +Serving with llama_index +============================ + +vLLM is also available via `llama_index `_ . + +To install llamaindex, run + +.. code-block:: console + + $ pip install llama-index-llms-vllm -q + +To run inference on a single or multiple GPUs, use ``Vllm`` class from ``llamaindex``. + +.. 
code-block:: python + + from llama_index.llms.vllm import Vllm + + llm = Vllm( + model="microsoft/Orca-2-7b", + tensor_parallel_size=4, + max_new_tokens=100, + vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5}, + ) + +Please refer to this `Tutorial `_ for more details. diff --git a/vllm-v0.6.2/docs/source/serving/serving_with_llamastack.rst b/vllm-v0.6.2/docs/source/serving/serving_with_llamastack.rst new file mode 100644 index 0000000..8ef96c4 --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/serving_with_llamastack.rst @@ -0,0 +1,42 @@ +.. _run_on_llamastack: + +Serving with Llama Stack +============================ + +vLLM is also available via `Llama Stack `_ . + +To install Llama Stack, run + +.. code-block:: console + + $ pip install llama-stack -q + +Inference using OpenAI Compatible API +------------------------------------- + +Then start Llama Stack server pointing to your vLLM server with the following configuration: + +.. code-block:: yaml + + inference: + - provider_id: vllm0 + provider_type: remote::vllm + config: + url: http://127.0.0.1:8000 + +Please refer to `this guide `_ for more details on this remote vLLM provider. + +Inference via Embedded vLLM +--------------------------- + +An `inline vLLM provider +`_ +is also available. This is a sample configuration using that method: + +.. code-block:: yaml + + inference: + - provider_type: vllm + config: + model: Llama3.1-8B-Instruct + tensor_parallel_size: 4 diff --git a/vllm-v0.6.2/docs/source/serving/tensorizer.rst b/vllm-v0.6.2/docs/source/serving/tensorizer.rst new file mode 100644 index 0000000..96a93db --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/tensorizer.rst @@ -0,0 +1,15 @@ +.. _tensorizer: + +Loading Models with CoreWeave's Tensorizer +========================================== +vLLM supports loading models with `CoreWeave's Tensorizer `_. 
+vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized +at runtime extremely quickly directly to the GPU, resulting in significantly +shorter Pod startup times and lower CPU memory usage. Tensor encryption is also supported. + +For more information on CoreWeave's Tensorizer, please refer to +`CoreWeave's Tensorizer documentation `_. For more information on serializing a vLLM model, as well as a general usage guide to using Tensorizer with vLLM, see +the `vLLM example script `_. + +.. note:: + Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. diff --git a/vllm-v0.6.2/docs/source/serving/usage_stats.md b/vllm-v0.6.2/docs/source/serving/usage_stats.md new file mode 100644 index 0000000..a1e4b1c --- /dev/null +++ b/vllm-v0.6.2/docs/source/serving/usage_stats.md @@ -0,0 +1,57 @@ +# Usage Stats Collection + +vLLM collects anonymous usage data by default to help the engineering team better understand which hardware and model configurations are widely used. This data allows them to prioritize their efforts on the most common workloads. The collected data is transparent, does not contain any sensitive information, and will be publicly released for the community's benefit. + +## What data is collected? + +You can see the up-to-date list of data collected by vLLM in the [usage_lib.py](https://github.com/vllm-project/vllm/blob/main/vllm/usage/usage_lib.py). 
+ +Here is an example as of v0.4.0: + +```json +{ + "uuid": "fbe880e9-084d-4cab-a395-8984c50f1109", + "provider": "GCP", + "num_cpu": 24, + "cpu_type": "Intel(R) Xeon(R) CPU @ 2.20GHz", + "cpu_family_model_stepping": "6,85,7", + "total_memory": 101261135872, + "architecture": "x86_64", + "platform": "Linux-5.10.0-28-cloud-amd64-x86_64-with-glibc2.31", + "gpu_count": 2, + "gpu_type": "NVIDIA L4", + "gpu_memory_per_device": 23580639232, + "model_architecture": "OPTForCausalLM", + "vllm_version": "0.3.2+cu123", + "context": "LLM_CLASS", + "log_time": 1711663373492490000, + "source": "production", + "dtype": "torch.float16", + "tensor_parallel_size": 1, + "block_size": 16, + "gpu_memory_utilization": 0.9, + "quantization": null, + "kv_cache_dtype": "auto", + "enable_lora": false, + "enable_prefix_caching": false, + "enforce_eager": false, + "disable_custom_all_reduce": true +} +``` + +You can preview the collected data by running the following command: + +```bash +tail ~/.config/vllm/usage_stats.json +``` + +## Opt-out of Usage Stats Collection + +You can opt-out of usage stats collection by setting the VLLM_NO_USAGE_STATS or DO_NOT_TRACK environment variable, or by creating a ~/.config/vllm/do_not_track file: + +```bash +# Any of the following methods can disable usage stats collection +export VLLM_NO_USAGE_STATS=1 +export DO_NOT_TRACK=1 +mkdir -p ~/.config/vllm && touch ~/.config/vllm/do_not_track +``` diff --git a/vllm-v0.6.2/examples/__init__.py b/vllm-v0.6.2/examples/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vllm-v0.6.2/examples/api_client.py b/vllm-v0.6.2/examples/api_client.py new file mode 100644 index 0000000..aa8e56e --- /dev/null +++ b/vllm-v0.6.2/examples/api_client.py @@ -0,0 +1,82 @@ +"""Example Python client for vllm.entrypoints.api_server +server command: +python -m vllm.entrypoints.api_server --model ${MODEL_PATH} --swap-space 16 --disable-log-requests --port 8000 +""" + +import argparse +import json +from typing import 
Iterable, List + +import requests + + +def clear_line(n: int = 1) -> None: + LINE_UP = '\033[1A' + LINE_CLEAR = '\x1b[2K' + for _ in range(n): + print(LINE_UP, end=LINE_CLEAR, flush=True) + + +def post_http_request(prompt: str, + api_url: str, + n: int = 1, + stream: bool = False) -> requests.Response: + headers = {"User-Agent": "Test Client"} + pload = { + "prompt": prompt, + "n": n, + "temperature": 0.0, + "max_tokens": 16, + "stream": stream, + } + response = requests.post(api_url, + headers=headers, + json=pload, + stream=stream) + return response + + +def get_streaming_response(response: requests.Response) -> Iterable[List[str]]: + for chunk in response.iter_lines(chunk_size=8192, + decode_unicode=False, + delimiter=b"\0"): + if chunk: + data = json.loads(chunk.decode("utf-8")) + output = data["text"] + yield output + + +def get_response(response: requests.Response) -> List[str]: + data = json.loads(response.content) + output = data["text"] + return output + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--n", type=int, default=1) + parser.add_argument("--prompt", type=str, default="San Francisco is a") + parser.add_argument("--stream", action="store_true") + args = parser.parse_args() + prompt = args.prompt + api_url = f"http://{args.host}:{args.port}/generate" + n = args.n + stream = args.stream + + print(f"Prompt: {prompt!r}\n", flush=True) + response = post_http_request(prompt, api_url, n, stream) + + if stream: + num_printed_lines = 0 + for h in get_streaming_response(response): + clear_line(num_printed_lines) + num_printed_lines = 0 + for i, line in enumerate(h): + num_printed_lines += 1 + print(f"Beam candidate {i}: {line!r}", flush=True) + else: + output = get_response(response) + for i, line in enumerate(output): + print(f"Beam candidate {i}: {line!r}", flush=True) diff --git 
a/vllm-v0.6.2/examples/aqlm_example.py b/vllm-v0.6.2/examples/aqlm_example.py new file mode 100644 index 0000000..40f9a21 --- /dev/null +++ b/vllm-v0.6.2/examples/aqlm_example.py @@ -0,0 +1,45 @@ +from vllm import LLM, SamplingParams +from vllm.utils import FlexibleArgumentParser + + +def main(): + + parser = FlexibleArgumentParser(description='AQLM examples') + + parser.add_argument('--model', + '-m', + type=str, + default=None, + help='model path, as for HF') + parser.add_argument('--choice', + '-c', + type=int, + default=0, + help='known good models by index, [0-4]') + parser.add_argument('--tensor-parallel-size', + '-t', + type=int, + default=1, + help='tensor parallel size') + + args = parser.parse_args() + + models = [ + "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", + "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf", + "ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf", + "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf", + "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", + ] + + model = LLM(args.model if args.model is not None else models[args.choice], + tensor_parallel_size=args.tensor_parallel_size) + + sampling_params = SamplingParams(max_tokens=100, temperature=0) + outputs = model.generate("Hello my name is", + sampling_params=sampling_params) + print(outputs[0].outputs[0].text) + + +if __name__ == '__main__': + main() diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/chunked_pipeline_parallel/README.md b/vllm-v0.6.2/examples/cambricon_custom_func/chunked_pipeline_parallel/README.md new file mode 100644 index 0000000..f6d00ad --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/chunked_pipeline_parallel/README.md @@ -0,0 +1,48 @@ +# 背景 + +此示例用于在vLLM中演示chunked parallel pipeline功能,通过mlu_hijck机制将需要修改的代码劫持到当前目录,避免修改主仓库代码。 + +# 支持模型 + +- LlamaForCausalLM +- CustomForCausalLM + +# Demo运行方式 + +当前Chunked Parallel Pipeline仅支持通过AsyncLLMEngine方式用paged mode运行。 + +- 设置环境变量 + +```bash +export CHUNKED_PIPELINE_PARALLEL_EN=true +``` + +- 启动server进程 +```bash +# 
设置engine超时阈值。 +export VLLM_ENGINE_ITERATION_TIMEOUT_S=180 + +python -m vllm.entrypoints.openai.api_server \ + --port ${PORT} \ + --model ${MODEL_PATH} \ + --swap-space 16 \ + --pipeline-parallel-size ${PP_SIZE} \ + --max-num-batched-tokens ${MAX_TOKENS_NUM} \ + --enable-chunked-prefill \ + --worker-use-ray \ + --enforce-eager +``` + +- 启动client进程 +这里以随机数为例,可以选用真实数据集。 +```bash +python benchmarks/benchmark_serving.py \ + --backend vllm \ + --model ${MODEL_PATH} \ + --dataset-name random \ + --num-prompts ${NUM_PROMPT} \ + --port ${PORT} \ + --random-input-len ${INPUT_LEN} \ + --random-output-len 1 \ + --request-rate inf +``` diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/chunked_pipeline_parallel/mlu_hijack/distributed/__init__.py b/vllm-v0.6.2/examples/cambricon_custom_func/chunked_pipeline_parallel/mlu_hijack/distributed/__init__.py new file mode 100644 index 0000000..dbf4c32 --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/chunked_pipeline_parallel/mlu_hijack/distributed/__init__.py @@ -0,0 +1 @@ +from . import parallel_state diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/chunked_pipeline_parallel/mlu_hijack/distributed/parallel_state.py b/vllm-v0.6.2/examples/cambricon_custom_func/chunked_pipeline_parallel/mlu_hijack/distributed/parallel_state.py new file mode 100644 index 0000000..15f7fb7 --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/chunked_pipeline_parallel/mlu_hijack/distributed/parallel_state.py @@ -0,0 +1,223 @@ +# Copyright 2023 The vLLM team. +# Adapted from +# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +"""vLLM distributed state. +It takes over the control of the distributed environment from PyTorch. +The typical workflow is: + +- call `init_distributed_environment` to initialize the distributed environment. 
def vllm__distributed__GroupCoordinator__send_tensor_dict(
    self,
    tensor_dict: Dict[str, Union[torch.Tensor, Any]],
    dst: Optional[int] = None,
    all_gather_group: Optional["GroupCoordinator"] = None,
) -> Optional[Dict[str, Union[torch.Tensor, Any]]]:
    """Send the tensors of ``tensor_dict`` to another rank of this group.

    Only tensor payloads are transmitted; the metadata list produced by
    ``_split_tensor_dict`` is discarded and never sent.

    Args:
        tensor_dict: Mapping to send. Must be a ``dict``.
        dst: Rank of the destination *within this group*. When omitted,
            the next rank (``rank_in_group + 1`` modulo ``world_size``)
            is used.
        all_gather_group: Optional group used for send-allgather. When
            given, a tensor whose element count is divisible by that
            group's world size is reduced to this rank's slice before
            being sent.

    Returns:
        ``tensor_dict`` itself when ``torch.distributed`` is not
        initialized or the group has a single rank; ``None`` otherwise.
    """
    # Nothing to communicate with a single rank or without a process group.
    if not torch.distributed.is_initialized() or self.world_size == 1:
        return tensor_dict

    if all_gather_group is None:
        shard_count = 1
        shard_index = 0
    else:
        shard_count = all_gather_group.world_size
        shard_index = all_gather_group.rank_in_group

    device_pg = self.device_group
    cpu_pg = self.cpu_group

    if dst is None:
        dst = (self.rank_in_group + 1) % self.world_size
    assert dst < self.world_size, f"Invalid dst rank ({dst})"

    # =============================
    # Modifies by vllm_mlu
    # @brief: Skip send tensor metadata list.
    # =============================
    assert isinstance(
        tensor_dict,
        dict), f"Expecting a dictionary, got {type(tensor_dict)}"
    _unused_metadata, payloads = _split_tensor_dict(tensor_dict)
    # =============================
    # End of MLU Hijack
    # =============================

    for payload in payloads:
        if payload.numel() == 0:
            # Empty tensors are never put on the wire.
            continue

        # send-allgather: send only a slice, then do allgather.
        if (all_gather_group is not None
                and payload.numel() % shard_count == 0):
            payload = payload.reshape(shard_count, -1)[shard_index]

        if not payload.is_cpu:
            # =============================
            # Modifies by vllm_mlu
            # @brief: Modify send to isend.
            # =============================
            # Device tensors use the device group. The request object
            # returned by isend is intentionally not waited on here.
            torch.distributed.isend(payload,
                                    dst=self.ranks[dst],
                                    group=device_pg)
            # =============================
            # End of MLU Hijack
            # =============================
            continue

        # CPU tensors go through the metadata (CPU) group, blocking.
        torch.distributed.send(payload,
                               dst=self.ranks[dst],
                               group=cpu_pg)

    return None
# =============================
# Modifies by vllm_mlu
# @brief: Add a parameter `recv_metadata_list`.
# =============================
def vllm__distributed__GroupCoordinator__recv_tensor_dict(
    self,
    src: Optional[int] = None,
    all_gather_group: Optional["GroupCoordinator"] = None,
    recv_metadata_list: Optional[List[Tuple[str, Any]]] = None,
) -> Optional[Dict[str, Union[torch.Tensor, Any]]]:
    """Receive a tensor dictionary whose layout is already known locally.

    No metadata is received from the peer: ``recv_metadata_list``
    describes every entry, and only tensor payloads are read from the
    wire.

    Args:
        src: Rank of the sender *within this group*. When omitted, the
            previous rank (``rank_in_group - 1`` modulo ``world_size``)
            is used.
        all_gather_group: Optional group used for send-allgather. When
            given, a tensor whose element count is divisible by that
            group's world size is received as this rank's slice and then
            all-gathered back to its original shape.
        recv_metadata_list: ``(key, value)`` pairs. A ``TensorMetadata``
            value causes a tensor of that size/dtype/device to be
            allocated and received; any other value is copied into the
            result unchanged. ``None`` is treated as an empty list.

    Returns:
        ``None`` when ``torch.distributed`` is not initialized or the
        group has a single rank; otherwise the reconstructed dictionary.
    """
    # =============================
    # End of MLU Hijack
    # =============================
    # Bypass the function if we are using only 1 GPU.
    if not torch.distributed.is_initialized() or self.world_size == 1:
        return None

    # A literal `[]` default would be one list shared by every call.
    if recv_metadata_list is None:
        recv_metadata_list = []

    all_gather_size = (1 if all_gather_group is None else
                       all_gather_group.world_size)
    all_gather_rank = (0 if all_gather_group is None else
                       all_gather_group.rank_in_group)

    group = self.device_group
    metadata_group = self.cpu_group

    if src is None:
        src = (self.rank_in_group - 1) % self.world_size
    assert src < self.world_size, f"Invalid src rank ({src})"

    # =============================
    # Modifies by vllm_mlu
    # @brief: Skip receiving tensor metadata list.
    # =============================
    # End of MLU Hijack
    # =============================
    tensor_dict: Dict[str, Any] = {}
    for key, value in recv_metadata_list:
        if isinstance(value, TensorMetadata):
            tensor = torch.empty(value.size,
                                 dtype=value.dtype,
                                 device=value.device)
            if tensor.numel() == 0:
                # Empty tensors are not transmitted; keep the placeholder.
                tensor_dict[key] = tensor
                continue

            # send-allgather: send only a slice, then do allgather.
            use_all_gather = (all_gather_group is not None
                              and tensor.numel() % all_gather_size == 0)

            if use_all_gather:
                orig_shape = tensor.shape
                tensor = tensor.reshape(all_gather_size,
                                        -1)[all_gather_rank]

            if tensor.is_cpu:
                # use metadata_group for CPU tensors
                torch.distributed.recv(tensor,
                                       src=self.ranks[src],
                                       group=metadata_group)
            else:
                # =============================
                # Modifies by vllm_mlu
                # @brief: Modify recv to irecv, and wait to finish.
                # =============================
                # use group for GPU tensors
                req = torch.distributed.irecv(tensor,
                                              src=self.ranks[src],
                                              group=group)
                req.wait()
                # =============================
                # End of MLU Hijack
                # =============================
            if use_all_gather:
                # do the allgather
                tensor = all_gather_group.all_gather(  # type: ignore
                    tensor, dim=0)
                tensor = tensor.reshape(orig_shape)

            tensor_dict[key] = tensor
        else:
            tensor_dict[key] = value
    return tensor_dict
def vllm__engine__async_llm_engine___AsyncLLMEngine____init__(self, *args, **kwargs):
    """Initialize the engine and the per-virtual-engine prefill task table.

    All arguments are forwarded unchanged to ``LLMEngine.__init__``.
    """
    LLMEngine.__init__(self, *args, **kwargs)

    # =============================
    # Modifies by vllm_mlu
    # @brief: Add a member variable to record parallel chunked prefill
    # tasks, in which each member means
    # (virtual_engine -> {req_id: task_list})
    # =============================
    # One independent dict per scheduler, i.e. per virtual engine.
    self.step_tasks = [{} for _ in self.scheduler]
    # =============================
    # End of MLU Hijack
    # =============================
def _update_scheduler_status(
    self,
    scheduled_seq_groups: List[ScheduledSequenceGroup],
    ignored_seq_groups: List[SequenceGroup],
    seq_group_metadata_list: List[SequenceGroupMetadata]
) -> None:
    """Advance scheduler bookkeeping right after a prefill task is emitted.

    For chunked pipeline parallel, chunked prefill tasks run
    asynchronously, so the computed-token counters are bumped as soon as
    a task has been emitted rather than when it completes.

    Args:
        scheduled_seq_groups: Groups scheduled in this step; each one's
            ``seq_group`` is advanced by its ``token_chunk_size``.
        ignored_seq_groups: Accepted for signature parity; not read.
        seq_group_metadata_list: Only bounds the iteration: groups are
            paired with these entries, and unpaired groups are skipped.
    """
    # Pairing via zip stops at the shorter of the two sequences.
    paired = zip(scheduled_seq_groups, seq_group_metadata_list)
    for entry, _metadata in paired:
        entry.seq_group.update_num_computed_tokens(entry.token_chunk_size)

    # Free the finished sequence groups on every virtual engine.
    for sched in self.scheduler:
        sched.free_finished_seq_groups()
async def vllm__engine__async_llm_engine___AsyncLLMEngine__step_async(
    self, virtual_engine: int
) -> Optional[List[Union[RequestOutput, EmbeddingRequestOutput]]]:
    """Performs one decoding iteration and returns newly generated results.
    The workers are ran asynchronously if possible.

    This function performs one decoding iteration of the engine. It first
    schedules the sequences to be executed in the next iteration and the
    token blocks to be swapped in/out/copy. Then, it executes the model
    and updates the scheduler with the model outputs. Finally, it decodes
    the sequences and returns the newly generated results.

    Chunked pipeline parallel behavior: every prefill chunk is dispatched
    as an asyncio task. Until the last chunk of a prompt has been
    dispatched this method updates the scheduler state and returns
    ``None``; on the last chunk it awaits all of that request's tasks and
    continues with the output of the final one.

    Args:
        virtual_engine: Index selecting the scheduler, cached scheduler
            outputs, scheduler context and task table to operate on.

    Returns:
        ``None`` while a prompt's prefill chunks are still being
        dispatched, otherwise ``ctx.request_outputs`` for this virtual
        engine.
    """
    # these are cached outputs from previous iterations. None if on first
    # iteration
    cached_outputs = self.cached_scheduler_outputs[virtual_engine]
    seq_group_metadata_list = cached_outputs.seq_group_metadata_list
    scheduler_outputs = cached_outputs.scheduler_outputs
    allow_async_output_proc = cached_outputs.allow_async_output_proc

    ctx = self.scheduler_contexts[virtual_engine]

    # Clear outputs for each new scheduler iteration
    ctx.request_outputs.clear()

    # skip the scheduler if there are any remaining steps in the seq groups.
    # This ensures that the scheduler is only called again when the current
    # batch has completed.
    if not self._has_remaining_steps(seq_group_metadata_list):

        # Schedule iteration
        (seq_group_metadata_list, scheduler_outputs,
         allow_async_output_proc
         ) = self.scheduler[virtual_engine].schedule()

        ctx.seq_group_metadata_list = seq_group_metadata_list
        ctx.scheduler_outputs = scheduler_outputs

        # Maybe switch from async mode to sync mode
        if not allow_async_output_proc and len(ctx.output_queue) > 0:
            self._process_model_outputs(ctx=ctx)

        if (self.scheduler_config.is_multi_step
                and scheduler_outputs.num_lookahead_slots > 0):
            # cache the scheduler outputs for the next iteration if we have
            # lookahead slots
            self._cache_scheduler_outputs_for_multi_step(
                virtual_engine, seq_group_metadata_list, scheduler_outputs,
                allow_async_output_proc)

    assert seq_group_metadata_list is not None
    assert scheduler_outputs is not None

    if not scheduler_outputs.is_empty():
        finished_requests_ids = self.scheduler[
            virtual_engine].get_and_reset_finished_requests_ids()

        # Check if we have a cached last_output from the previous iteration.
        # For supporting PP this is probably the best way to pass the
        # sampled_token_ids, as a separate broadcast over all the PP stages
        # will cause one virtual engine's microbatch to block the pipeline.
        last_sampled_token_ids = \
            self._get_last_sampled_token_ids(virtual_engine)

        execute_model_req = ExecuteModelRequest(
            seq_group_metadata_list=seq_group_metadata_list,
            blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
            blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
            blocks_to_copy=scheduler_outputs.blocks_to_copy,
            virtual_engine=virtual_engine,
            num_lookahead_slots=scheduler_outputs.num_lookahead_slots,
            running_queue_size=scheduler_outputs.running_queue_size,
            finished_requests_ids=finished_requests_ids,
            # We use ExecuteModelRequest to pass the last sampled_token_ids
            # to each of the non-last PP stages for in-place prepare_input.
            last_sampled_token_ids=last_sampled_token_ids)

        if allow_async_output_proc:
            execute_model_req.async_callback = self.async_callbacks[
                virtual_engine]

        # Execute the model.
        # =============================
        # Modifies by vllm_mlu
        # @brief: for chunked prefill tasks except the final task for a
        # single request, create them asynchronously. And for the last
        # prefill task, gather all previous tasks and get the final output.
        # =============================
        if seq_group_metadata_list[0].is_prompt:
            assert len(seq_group_metadata_list) == 1, \
                "Currently we only support schedule single batch in " \
                "prefill stage for chunked pipeline parallel."
            token_chunk_size = seq_group_metadata_list[0].token_chunk_size
            seq_data = list(seq_group_metadata_list[0].seq_data.values())[0]
            prefill_loc = seq_data.get_num_computed_tokens()
            task = asyncio.create_task(
                self.model_executor.execute_model_async(
                    execute_model_req, [prefill_loc], [token_chunk_size]))
            request_id = seq_group_metadata_list[0].request_id
            self.step_tasks[virtual_engine].setdefault(
                request_id, []).append(task)

            # Gather point: if all prefill tasks for current sequence group
            # have been dispatched, we wait all prompt tasks and get the
            # final output.
            seq_len = seq_data.get_len()
            if token_chunk_size + prefill_loc == seq_len:
                # Drop the request's entry before awaiting so the task
                # table cannot grow without bound, and stale finished
                # tasks are never re-gathered if the same request id is
                # prefilled again.
                request_tasks = self.step_tasks[virtual_engine].pop(
                    request_id)
                outputs = await asyncio.gather(*request_tasks)
                # Only the final chunk's result is the prefill output.
                outputs = outputs[-1]
            else:
                # Since prefill stage has not been completely finished, we
                # just update scheduler and sequence status and return None.
                _update_scheduler_status(
                    self, scheduler_outputs.scheduled_seq_groups,
                    scheduler_outputs.ignored_seq_groups,
                    seq_group_metadata_list)
                return None
        else:
            # =============================
            # End of MLU Hijack
            # =============================
            outputs = await self.model_executor.execute_model_async(
                execute_model_req)

        # we need to do this here so that last step's sampled_token_ids can
        # be passed to the next iteration for PP.
        if self.scheduler_config.is_multi_step:
            self._update_cached_scheduler_output(virtual_engine, outputs)
    else:
        if len(ctx.output_queue) > 0:
            self._process_model_outputs(ctx=ctx)
        outputs = []

    # Finish the current step for all the sequence groups.
    if self.scheduler_config.is_multi_step:
        for seq_group in seq_group_metadata_list:
            seq_group.finish_step()

    if not self._has_remaining_steps(seq_group_metadata_list):
        # Clear the cache if we have finished all the steps
        if self.scheduler_config.is_multi_step:
            # This module does not import SchedulerOutputState at file
            # level; import it here so the multi-step path does not fail
            # with NameError.
            from vllm.engine.llm_engine import SchedulerOutputState
            self.cached_scheduler_outputs[
                virtual_engine] = SchedulerOutputState()

        # is_first_step_output is True only when the num_steps of all
        # the sequences are 1. When the num_steps > 1,
        # multi_step_model_runner does the first-step output append.
        is_first_step_output: bool = False if not seq_group_metadata_list \
            else seq_group_metadata_list[0].state.num_steps == 1

        ctx.append_output(outputs=outputs,
                          seq_group_metadata_list=seq_group_metadata_list,
                          scheduler_outputs=scheduler_outputs,
                          is_async=allow_async_output_proc,
                          is_last_step=True,
                          is_first_step_output=is_first_step_output)

        if outputs and allow_async_output_proc:
            assert len(
                outputs
            ) == 1, "Async postprocessor expects only a single output set"
            self._advance_to_next_step(
                outputs[0], seq_group_metadata_list,
                scheduler_outputs.scheduled_seq_groups)

        # NOTE(review): the nesting of the stats/tracing calls under this
        # branch follows upstream vLLM; the collapsed source carries no
        # indentation to confirm it.
        if not allow_async_output_proc:
            self._process_model_outputs(ctx=ctx)

            # Log stats.
            self.do_log_stats(scheduler_outputs, outputs)

            # Tracing
            self.do_tracing(scheduler_outputs)

    else:
        # Multi-step case
        return ctx.request_outputs

    if not self.has_unfinished_requests():
        # Drain async postprocessor (if exists)
        if len(ctx.output_queue) > 0:
            self._process_model_outputs(ctx=ctx)
        assert len(ctx.output_queue) == 0

    return ctx.request_outputs
async def vllm__engine__async_llm_engine__AsyncLLMEngine__engine_step(
    self, virtual_engine: int
) -> bool:
    """Kick the engine to process the waiting requests.

    New requests reported by the request tracker are added to the
    engine, aborted ones are cancelled, and then one asynchronous engine
    step is executed for ``virtual_engine``.

    Returns True if there are in-progress requests."""
    pending, aborted = self._request_tracker.get_new_and_aborted_requests()

    for request_kwargs in pending:
        # Add the request into the vLLM engine's waiting queue.
        try:
            await self.engine.add_request_async(**request_kwargs)
        except ValueError as err:
            # TODO: use a vLLM specific error for failed validation
            self._request_tracker.process_exception(
                request_kwargs["request_id"],
                err,
                verbose=self.log_requests,
            )

    if aborted:
        await self._engine_abort(aborted)

    step_outputs = await self.engine.step_async(virtual_engine)

    # ======================================
    # Modified by Chunked Parallel Pipeline.
    # ======================================
    # When the step returns None, it means prefill tasks are not finished,
    # so the request is still in progress.
    if step_outputs is None:
        return True
    # ======================================
    # End by Chunked Parallel Pipeline.
    # ======================================

    if self.use_process_request_outputs_callback:
        # The outputs were already pushed to their streams from inside
        # LLMEngine's _process_model_outputs; we only need to detect
        # whether every request has finished.
        done = all(output.finished for output in step_outputs)
    else:
        # Put the outputs into the corresponding streams.
        done = self.process_request_outputs(step_outputs)

    return not done
# =============================
# Modify by vllm_mlu
# @brief: Add two parameters, in which prefill_locs indicates the start
# location and token_chunk_sizes indicates the chunk size for each task.
# =============================
async def vllm__executor__distributed_gpu_executor__DistributedGPUExecutorAsync__execute_model_async(
    self,
    execute_model_req: ExecuteModelRequest,
    prefill_locs: Optional[List[int]] = None,
    token_chunk_sizes: Optional[List[int]] = None,
) -> List[SamplerOutput]:
    """Run one model step through the driver worker.

    Args:
        execute_model_req: Request forwarded to the driver.
        prefill_locs: Start location of each prefill task; forwarded
            unchanged.
        token_chunk_sizes: Chunk size of each prefill task; forwarded
            unchanged.

    Returns:
        Whatever ``_driver_execute_model_async`` returns; only the driver
        worker produces the sampling results.
    """
    # =============================
    # End of MLU Hijack
    # =============================
    # Lazily start the execution loop of the parallel workers the first
    # time a step is requested.
    if self.parallel_worker_tasks is None:
        worker_loop = self._start_worker_execution_loop()
        self.parallel_worker_tasks = asyncio.create_task(worker_loop)

    driver_result = await self._driver_execute_model_async(
        execute_model_req, prefill_locs, token_chunk_sizes)
    return driver_result
# =============================
# Modify by vllm_mlu
# @brief: Add two parameters, in which prefill_locs indicates the start
# location and token_chunk_sizes indicates the chunk size for each task.
# =============================
@abstractmethod
async def vllm__executor__distributed_gpu_executor__DistributedGPUExecutorAsync___driver_execute_model_async(
    self,
    execute_model_req: Optional[ExecuteModelRequest] = None,
    prefill_locs: Optional[List[int]] = None,
    token_chunk_sizes: Optional[List[int]] = None,
) -> List[SamplerOutput]:
    """Execute the model asynchronously in the driver worker.

    Abstract placeholder: concrete executors supply the implementation.

    Passing None will cause the driver to stop the model execution
    loop running in each of the remote workers.

    Raises:
        NotImplementedError: Always.
    """
    # =============================
    # End of MLU Hijack
    # =============================
    raise NotImplementedError
# =============================
# Modify by vllm_mlu
# @brief: Add two parameters, in which prefill_locs indicates the start
# location and token_chunk_sizes indicates the chunk size for each task.
# =============================
async def vllm__executor__distributed_mlu_executor__DistributedMLUExecutorAsync__execute_model_async(
    self,
    execute_model_req: ExecuteModelRequest,
    prefill_locs: Optional[List[int]] = None,
    token_chunk_sizes: Optional[List[int]] = None,
) -> List[SamplerOutput]:
    """Run one model step through the driver worker.

    Args:
        execute_model_req: Request forwarded to the driver.
        prefill_locs: Start location of each prefill task; forwarded
            unchanged.
        token_chunk_sizes: Chunk size of each prefill task; forwarded
            unchanged.

    Returns:
        Whatever ``_driver_execute_model_async`` returns; only the driver
        worker produces the sampling results.
    """
    # =============================
    # End of MLU Hijack
    # =============================
    # Lazily start the execution loop of the parallel workers the first
    # time a step is requested.
    if self.parallel_worker_tasks is None:
        worker_loop = self._start_worker_execution_loop()
        self.parallel_worker_tasks = asyncio.create_task(worker_loop)

    driver_result = await self._driver_execute_model_async(
        execute_model_req, prefill_locs, token_chunk_sizes)
    return driver_result


# =============================
# Modify by vllm_mlu
# @brief: Add two parameters, in which prefill_locs indicates the start
# location and token_chunk_sizes indicates the chunk size for each task.
# =============================
@abstractmethod
async def vllm__executor__distributed_mlu_executor__DistributedMLUExecutorAsync___driver_execute_model_async(
    self,
    execute_model_req: Optional[ExecuteModelRequest] = None,
    prefill_locs: Optional[List[int]] = None,
    token_chunk_sizes: Optional[List[int]] = None,
) -> List[SamplerOutput]:
    """Execute the model asynchronously in the driver worker.

    Abstract placeholder: concrete executors supply the implementation.

    Passing None will cause the driver to stop the model execution
    loop running in each of the remote workers.

    Raises:
        NotImplementedError: Always.
    """
    # =============================
    # End of MLU Hijack
    # =============================
    raise NotImplementedError
def vllm__executor__ray_mlu_executor__RayMLUExecutorAsync____init__(self, *args, **kwargs):
    """Run the original constructor, then add chunked-pipeline state.

    All arguments are forwarded unchanged to the saved original
    ``RayMLUExecutorAsync.__init__``.
    """
    vllm__executor__ray_mlu_executor__RayMLUExecutorAsync____init____org(
        self, *args, **kwargs)

    # ======================================
    # Modified by Chunked Parallel Pipeline.
    # ======================================
    # For the prefill stage of a request in chunked pipeline parallel,
    # tasks in the same pp_rank must be executed in order. A priority lock
    # is used to implement this.
    # To ensure different requests are executed in order, a priority
    # interval is reserved for each request. The interval length is
    # `max_model_len`, which is no less than the model execution rounds.
    # For each execution round, the priority is:
    #     `request_id * max_model_len + model_execution_time`
    self.priority = {}
    self.priority_interval = self.model_config.max_model_len
    # To ensure pp tasks for the same prefill tokens are created
    # atomically, an extra lock guards their creation.
    self.lock = asyncio.Lock()
    # ======================================
    # End by Chunked Parallel Pipeline.
    # ======================================
# =============================
# Modify by vllm_mlu
# @brief: Add two parameters, in which prefill_locs indicates the start
# location and token_chunk_sizes indicates the chunk size for each task.
# =============================
async def vllm__executor__ray_mlu_executor__RayMLUExecutorAsync__execute_model_async(
    self,
    execute_model_req: ExecuteModelRequest,
    prefill_locs: Optional[List[int]] = None,
    token_chunk_sizes: Optional[List[int]] = None,
) -> List[SamplerOutput]:
    """Delegate a model step to ``DistributedMLUExecutorAsync``.

    Args:
        execute_model_req: Request to execute.
        prefill_locs: Start location of each prefill task; forwarded
            unchanged.
        token_chunk_sizes: Chunk size of each prefill task; forwarded
            unchanged.

    Returns:
        The result of ``DistributedMLUExecutorAsync.execute_model_async``.

    Raises:
        AssertionError: If the executor is in Ray SPMD worker mode.
    """
    # =============================
    # End of MLU Hijack
    # =============================
    assert not self.use_ray_spmd_worker, (
        "RayMLUExecutorAsync is not supported for spmd mode.")

    # Call the base implementation explicitly instead of via super().
    delegate = DistributedMLUExecutorAsync.execute_model_async
    step_result = await delegate(self, execute_model_req, prefill_locs,
                                 token_chunk_sizes)
    return step_result
# =============================
# Modify by vllm_mlu
# @brief: Add two parameters, in which prefill_locs indicates the start
# location and token_chunk_sizes indicates the chunk size for each task.
# =============================
async def vllm__executor__ray_mlu_executor__RayMLUExecutorAsync___driver_execute_model_async(
    self,
    execute_model_req: Optional[ExecuteModelRequest] = None,
    prefill_locs: Optional[List[int]] = None,
    token_chunk_sizes: Optional[List[int]] = None,
) -> List[SamplerOutput]:
    """Dispatch one model step to every pipeline stage in priority order.

    Without tensor-parallel driver workers the step is executed directly
    on the driver. Otherwise one task per pipeline stage is created, each
    guarded by that stage's ``PriorityLock``, and the result of the last
    stage is returned.

    Args:
        execute_model_req: Request to execute, or ``None`` (then the
            tasks run with priority ``-1`` and no bookkeeping is updated).
        prefill_locs: Start location of the prefill chunk; element 0 is
            read when the request is a prompt.
        token_chunk_sizes: Size of the prefill chunk; element 0 is read
            when the request is a prompt.

    Returns:
        The result produced by the last pipeline stage.

    Raises:
        AssertionError: In Ray SPMD worker mode, or when the request
            carries more than one sequence group.
    """
    # =============================
    # End of MLU Hijack
    # =============================
    assert not self.use_ray_spmd_worker, (
        "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1")
    if not self.tp_driver_workers:
        return await self.driver_exec_method(
            "execute_model", execute_model_req, prefill_locs, token_chunk_sizes)

    # ======================================
    # Modified by Chunked Parallel Pipeline.
    # ======================================
    # Use PriorityLock instead of lock to ensure that tasks in the same pp
    # rank are executed with the dispatched order.
    if execute_model_req is None:
        request_id = 'dummy'
        is_prompt = False
        update_priority_threshold = False
        request_priority = -1
    else:
        assert len(execute_model_req.seq_group_metadata_list) == 1, \
            "Only single batch is supported for chunked pipeline parallel mode."
        seq_group_metadata = execute_model_req.seq_group_metadata_list[0]
        request_id = seq_group_metadata.request_id
        # First sighting of a request reserves the next priority interval;
        # later rounds reuse the (already advanced) stored value.
        request_priority = self.priority.setdefault(
            request_id, len(self.priority)*self.model_config.max_model_len)
        first_seq = list(seq_group_metadata.seq_data.values())[0]
        seq_len = first_seq.get_len()

        # Update priority threshold to schedule next request: this happens
        # once the current chunk reaches the end of the prompt.
        is_prompt = seq_group_metadata.is_prompt
        update_priority_threshold = bool(
            is_prompt and seq_len == prefill_locs[0] + token_chunk_sizes[0])

    if self.pp_locks is None:
        # This locks each pipeline parallel stage so multiple virtual
        # engines can't execute on the same stage at the same time
        # We create the locks here to avoid creating them in the constructor
        # which uses a different asyncio loop.
        stage_count = self.parallel_config.pipeline_parallel_size
        self.pp_locks = [
            PriorityLock(init_priority_threshold=self.model_config.max_model_len,
                         priority_interval=self.priority_interval)
            for _ in range(stage_count)
        ]

    def _spawn(exec_fn, stage_lock):
        # Wrap one stage's execution in a task guarded by its priority lock.
        return asyncio.create_task(
            _run_task_with_priority_lock(
                exec_fn, stage_lock, request_priority,
                update_priority_threshold,
                "execute_model", execute_model_req, prefill_locs,
                token_chunk_sizes, request_priority))

    async with self.lock:
        # Stage 0 runs on the driver itself; stages 1.. on the remote
        # tensor-parallel driver workers.
        tasks = [_spawn(self.driver_exec_method, self.pp_locks[0])]
        for pp_rank, driver_worker in enumerate(self.tp_driver_workers,
                                                start=1):
            tasks.append(
                _spawn(driver_worker.execute_method.remote,
                       self.pp_locks[pp_rank]))
        if execute_model_req is not None:
            # Advance this request's priority for its next round.
            step = token_chunk_sizes[0] if is_prompt else 1
            self.priority[request_id] += step
    # ======================================
    # End by Chunked Parallel Pipeline.
    # ======================================

    stage_results = await asyncio.gather(*tasks)

    # Only the last PP stage has the final results.
    return stage_results[-1]
+ _priority_interval : int + The value by which the priority threshold is incremented after a lock release when + `update_priority_threshold` is enabled. + """ + + def __init__(self, init_priority_threshold: int, priority_interval: int): + """ + Initializes a PriorityLock with an initial priority threshold and interval. + + Parameters: + ----------- + init_priority_threshold : int + The initial threshold for task priorities that can acquire the lock. + priority_interval : int + The interval by which the priority threshold increases after each lock release. + """ + self._lock = asyncio.Lock() # Internal asyncio lock + self._queue = asyncio.PriorityQueue() # Priority queue to manage tasks by priority + self._condition = asyncio.Condition() # Condition variable to manage waiting tasks + self._active_task = None # Keep track of the current active task holding the lock + self._current_priority_threshold = init_priority_threshold + self._priority_interval = priority_interval + + async def acquire(self, priority): + """ + Acquires the lock for a task based on its priority. + + Parameters: + ----------- + priority : int + The priority level of the task attempting to acquire the lock. + + Behavior: + --------- + - The task is enqueued based on its priority. + - The task waits until it is the highest-priority task in the queue, has a priority + below the current threshold, and the lock is available. 
+ """ + queue_item = (priority, asyncio.current_task()) + async with self._condition: + await self._queue.put(queue_item) + + # Wait until the current task is the one with the highest priority and the lock is available + while True: + # Check if the current task is at the front of the queue and the lock is available + current_priority, current_task = self._queue._queue[0] # Peek at the highest priority task + if current_priority < self._current_priority_threshold and current_task is asyncio.current_task() and not self._lock.locked(): + await self._lock.acquire() # Acquire the lock + self._active_task = current_task # Mark the current task as holding the lock + await self._queue.get() # Remove the task from the queue + break + # If not the highest priority task, wait until notified + await self._condition.wait() + + async def release(self, update_priority_threshold): + """ + Releases the lock, optionally updating the priority threshold. + + Parameters: + ----------- + update_priority_threshold : bool + If True, increments the priority threshold by the configured interval. + """ + # Notify waiting tasks that the lock has been released + async with self._condition: + self._active_task = None # Clear the reference to the current task + self._lock.release() + + if update_priority_threshold: + self._current_priority_threshold += self._priority_interval + self._condition.notify_all() # Wake up all waiting tasks to recheck their priority + + async def __aenter__(self, priority): + """ + Async context manager entry. Acquires the lock with the specified priority. + + Parameters: + ----------- + priority : int + The priority level of the task acquiring the lock. + + Returns: + -------- + self : PriorityLock + The lock instance. + """ + await self.acquire(priority) + return self + + async def __aexit__(self, exc_type, exc, tb, update_priority_threshold): + """ + Async context manager exit. Releases the lock and optionally updates the priority threshold. 
+ + Parameters: + ----------- + exc_type : Exception or None + The exception type, if any, raised in the 'async with' block. + exc : Exception or None + The exception instance, if any, raised in the 'async with' block. + tb : traceback or None + The traceback object, if any, associated with the exception. + update_priority_threshold : bool + If True, increments the priority threshold after releasing the lock. + """ + await self.release(update_priority_threshold) # Now release is async + + +class PriorityLockManager: + """ + A helper class to manage the acquisition and release of a PriorityLock using an 'async with' block. + + Attributes: + ----------- + _lock : PriorityLock + The PriorityLock instance to be managed. + _priority : int + The priority level for the current task. + _update_priority_threshold : bool + Whether to update the priority threshold after the lock is released. + """ + + def __init__(self, lock, priority, update_priority_threshold): + """ + Initializes a PriorityLockManager with a PriorityLock and task-specific parameters. + + Parameters: + ----------- + lock : PriorityLock + The lock instance to manage. + priority : int + The priority level for the current task. + update_priority_threshold : bool + Whether to update the priority threshold after releasing the lock. + """ + self._lock = lock # The lock being managed + self._priority = priority # The priority level for the current task + self._update_priority_threshold = update_priority_threshold + + async def __aenter__(self): + """ + Async context manager entry. Acquires the lock with the specified priority. + + Returns: + -------- + lock : PriorityLock + The lock instance that was acquired. + """ + await self._lock.acquire(self._priority) # Acquire the lock with priority + return self._lock + + async def __aexit__(self, exc_type, exc, tb): + """ + Async context manager exit. Releases the lock and optionally updates the priority threshold. 
+ + Parameters: + ----------- + exc_type : Exception or None + The exception type, if any, raised in the 'async with' block. + exc : Exception or None + The exception instance, if any, raised in the 'async with' block. + tb : traceback or None + The traceback object, if any, associated with the exception. + """ + await self._lock.__aexit__(exc_type, exc, tb, self._update_priority_threshold) # Release the lock + + +async def _run_task_with_priority_lock( + task: Callable, lock: asyncio.Lock, priority: int, + update_priority_threshold: bool, *args, **kwargs): + """ + Runs a task within the context of a PriorityLock, ensuring proper acquisition and release. + + Parameters: + ----------- + task : Callable + The async function representing the task to be executed. + lock : PriorityLock + The PriorityLock instance managing access. + priority : int + The priority level for the task. + update_priority_threshold : bool + Whether to update the priority threshold after releasing the lock. + *args, **kwargs: + Additional arguments to pass to the task function. + + Returns: + -------- + result : Any + The result of the task execution. + """ + async with PriorityLockManager(lock, priority, update_priority_threshold): # Acquire the lock based on priority + return await task(*args, **kwargs) diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/chunked_pipeline_parallel/mlu_hijack/mlu_hijack.py b/vllm-v0.6.2/examples/cambricon_custom_func/chunked_pipeline_parallel/mlu_hijack/mlu_hijack.py new file mode 100644 index 0000000..d8480b4 --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/chunked_pipeline_parallel/mlu_hijack/mlu_hijack.py @@ -0,0 +1,14 @@ +from vllm_mlu._mlu_utils import * +from vllm_mlu.mlu_hijack_utils import MluHijackObject +from examples.cambricon_custom_func.vllm.mlu_hijack.common import init_logger + + +logger = init_logger(__name__) + +from . import distributed +from . import engine +from . import executor +from . import model_executor +from . 
def vllm__module_executor__models__custom_model__CustomForCausalLM__get_intermediate_tensor_metadata(
    self,
    batch_size: int,
    dtype: torch.dtype,
    device: torch.device,
) -> List[Tuple[str, Any]]:
    """Describe the intermediate tensors exchanged for a batch.

    Returns a list of ``(name, metadata)`` pairs: a ``hidden_states`` entry
    whose metadata carries the device type, `dtype` and a
    ``[batch_size, hidden_size]`` shape, followed by a ``residual`` entry
    whose metadata is ``None``.
    """
    hidden_shape = torch.Size([batch_size, self.config.hidden_size])
    hidden_meta = TensorMetadata(device.type, dtype, hidden_shape)
    return [
        ("hidden_states", hidden_meta),
        ("residual", None),
    ]
# Registration for the CustomForCausalLM override defined just before this point.
MluHijackObject.apply_hijack(
    CustomForCausalLM,
    "get_intermediate_tensor_metadata",
    vllm__module_executor__models__custom_model__CustomForCausalLM__get_intermediate_tensor_metadata
)


def vllm__module_executor__models__llama__LlamaForCausalLM__get_intermediate_tensor_metadata(
    self,
    batch_size: int,
    dtype: torch.dtype,
    device: torch.device,
) -> List[Tuple[str, Any]]:
    """Describe the intermediate tensors exchanged for a batch.

    Returns a single-element list holding the ``hidden_states`` entry, whose
    metadata carries the device type, `dtype` and a
    ``[batch_size, hidden_size]`` shape.
    """
    hidden_shape = torch.Size([batch_size, self.config.hidden_size])
    return [("hidden_states", TensorMetadata(device.type, dtype, hidden_shape))]


MluHijackObject.apply_hijack(
    LlamaForCausalLM,
    "get_intermediate_tensor_metadata",
    vllm__module_executor__models__llama__LlamaForCausalLM__get_intermediate_tensor_metadata
)
@torch.inference_mode()
def vllm__worker__mlu_model_runner__MLUModelRunnerBase__profile_run(self) -> None:
    """Run one forward pass over dummy inputs sized to the scheduler limits.

    Builds up to ``max_num_seqs`` dummy prompt sequences whose lengths sum to
    ``max_num_batched_tokens`` (plus dummy LoRA requests when LoRA is
    configured), prepares the model input with a prefill location of 0 for
    every sequence, executes the model once with empty KV-cache tensors and
    then synchronizes the MLU device.
    """
    # Enable top-k sampling to reflect the accurate memory usage.
    sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
    max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
    max_num_seqs = self.scheduler_config.max_num_seqs
    # This represents the maximum number of different requests
    # that will have unique loras, and therefore the max amount of memory
    # consumption. Create dummy lora request copies from the lora request
    # passed in, which contains a lora from the lora warmup path.
    dummy_lora_requests: List[LoRARequest] = []
    dummy_lora_requests_per_seq: List[LoRARequest] = []
    if self.lora_config:
        assert self.lora_manager is not None
        with self.lora_manager.dummy_lora_cache():
            for idx in range(self.lora_config.max_loras):
                lora_id = idx + 1
                dummy_lora_request = LoRARequest(
                    lora_name=f"warmup_{lora_id}",
                    lora_int_id=lora_id,
                    lora_path="/not/a/real/path",
                )
                self.lora_manager.add_dummy_lora(dummy_lora_request,
                                                 rank=LORA_WARMUP_RANK)
                dummy_lora_requests.append(dummy_lora_request)
            # Assign the dummy LoRAs to sequences round-robin.
            dummy_lora_requests_per_seq = [
                dummy_lora_requests[idx % len(dummy_lora_requests)]
                for idx in range(max_num_seqs)
            ]

    # Profile memory usage with max_num_sequences sequences and the total
    # number of tokens equal to max_num_batched_tokens.
    seqs: List[SequenceGroupMetadata] = []
    # Additional GPU memory may be needed for multi-modal encoding, which
    # needs to be accounted for when calculating the GPU blocks for
    # vLLM blocker manager.
    # To exercise the worst scenario for GPU memory consumption,
    # the number of seqs (batch_size) is chosen to maximize the number
    # of images processed.

    max_mm_tokens = self.mm_registry.get_max_multimodal_tokens(
        self.model_config)
    if max_mm_tokens > 0:
        max_num_seqs_orig = max_num_seqs
        max_num_seqs = min(max_num_seqs,
                           max_num_batched_tokens // max_mm_tokens)
        if max_num_seqs < 1:
            expr = (f"min({max_num_seqs_orig}, "
                    f"{max_num_batched_tokens} // {max_mm_tokens})")
            logger.warning(
                "Computed max_num_seqs (%s) to be less than 1. "
                "Setting it to the minimum value of 1.", expr)
            max_num_seqs = 1

    # batch_size accumulates the total number of dummy tokens across sequences.
    batch_size = 0
    for group_id in range(max_num_seqs):
        # Spread the token budget evenly; the first
        # (max_num_batched_tokens % max_num_seqs) sequences get one extra token.
        seq_len = (max_num_batched_tokens // max_num_seqs +
                   (group_id < max_num_batched_tokens % max_num_seqs))
        batch_size += seq_len

        dummy_data = self.input_registry \
            .dummy_data_for_profiling(self.model_config,
                                      seq_len,
                                      self.mm_registry)

        seq = SequenceGroupMetadata(
            request_id=str(group_id),
            is_prompt=True,
            seq_data={group_id: dummy_data.seq_data},
            sampling_params=sampling_params,
            block_tables=None,
            lora_request=dummy_lora_requests_per_seq[group_id]
            if dummy_lora_requests_per_seq else None,
            multi_modal_data=dummy_data.multi_modal_data,
            multi_modal_placeholders=dummy_data.multi_modal_placeholders,
        )
        seqs.append(seq)

    # Run the model with the dummy inputs.
    num_layers = self.model_config.get_num_layers(self.parallel_config)
    # use an empty tensor instead of `None`` to force Dynamo to pass
    # it by reference, rather by specializing on the value ``None``.
    # the `dtype` argument does not matter, and we use `float32` as
    # a placeholder (it has wide hardware support).
    # it is important to create tensors inside the loop, rather than
    # multiplying the list, to avoid Dynamo from treating them as
    # tensor aliasing.
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: support kv cache int8
    '''
    kv_caches = []
    for _ in range(num_layers):
        # One [cache, scale] pair of empty placeholder tensors per layer.
        kv_cache_ = torch.tensor([], dtype=torch.float32, device=self.device)
        kv_cache_scale_ = torch.tensor([], dtype=torch.float32, device=self.device)
        kv_caches.append([kv_cache_, kv_cache_scale_])
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    finished_requests_ids = [seq.request_id for seq in seqs]
    """
    ======================================
    Modified by Chunked Parallel Pipeline.
    ======================================
    @brief: Add two parameters: prefill_loc and token_chunk_size.
    """
    token_chunk_sizes = [seq.token_chunk_size for seq in seqs]
    # Every dummy sequence starts its prefill at position 0.
    model_input = self.prepare_model_input(
        seqs,
        finished_requests_ids=finished_requests_ids,
        prefill_locs=[0]*len(seqs),
        token_chunk_sizes=token_chunk_sizes,
    )
    """
    ======================================
    End by Chunked Parallel Pipeline.
    ======================================
    """
    intermediate_tensors = None
    # Ranks after the first pipeline stage are given empty intermediate
    # tensors sized for the whole dummy batch.
    if not get_pp_group().is_first_rank:
        intermediate_tensors = self.model.make_empty_intermediate_tensors(
            batch_size=batch_size,
            dtype=self.model_config.dtype,
            device=self.device)

    graph_batch_size = self.max_batchsize_to_capture
    batch_size_capture_list = [
        bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size
    ]
    # With enforce_eager set, no batch sizes are passed to the compile context.
    if self.model_config.enforce_eager:
        batch_size_capture_list = []
    with set_compile_context(batch_size_capture_list):
        self.execute_model(model_input, kv_caches, intermediate_tensors)
    torch.mlu.synchronize()

    return


MluHijackObject.apply_hijack(
    MLUModelRunnerBase,
    MLUModelRunnerBase.profile_run,
    vllm__worker__mlu_model_runner__MLUModelRunnerBase__profile_run
)
# ======================================
# Modified by Chunked Parallel Pipeline.
# ======================================
# @brief: Add two parameters, prefill_loc and token_chunk_size.
def vllm__worker__model_runner__ModelInputForGPUBuilder___compute_lens(
    self, inter_data: ModelInputForGPUBuilder.InterDataForSeqGroup,
    seq_idx: int, seq_group_metadata: SequenceGroupMetadata,
    prefill_loc: Optional[int] = None,
    token_chunk_size: Optional[int] = None,
):
    """Compute context length, sequence length and tokens
    for the given sequence data.

    Args:
        inter_data: Per-sequence-group scratch data that is filled in place.
        seq_idx: Index of the sequence inside `inter_data`.
        seq_group_metadata: Metadata of the sequence group being added.
        prefill_loc: For prompts, the number of tokens to treat as already
            computed. When None, the sequence's own computed-token count is
            used.
        token_chunk_size: Number of tokens to process in this step. When None,
            `seq_group_metadata.token_chunk_size` is used.
    """
    seq_data = seq_group_metadata.seq_data[inter_data.seq_ids[seq_idx]]
    if token_chunk_size is None:
        token_chunk_size = seq_group_metadata.token_chunk_size

    # Compute context length (the number of tokens that are
    # already computed) and sequence length (total number of tokens).
    seq_len = seq_data.get_len()
    if inter_data.is_prompt:
        # For chunked pipeline parallel, multiple tasks use the same sequence
        # data with different prefill locations, so the location is passed in
        # explicitly instead of being read from the shared sequence state.
        context_len = (
            prefill_loc if prefill_loc is not None
            else seq_data.get_num_computed_tokens()
        )
        seq_len = min(seq_len, context_len + token_chunk_size)
    elif (self.runner.scheduler_config.is_multi_step
          or self.runner.model_config.is_encoder_decoder):
        assert prefill_loc is None, "Chunked Parallel Pipeline does not support multi-step."
        context_len = seq_len - 1
    else:
        context_len = seq_data.get_num_computed_tokens()

    # Compute tokens.
    tokens = seq_data.get_token_ids()[context_len:seq_len]

    inter_data.seq_lens[seq_idx] = seq_len
    inter_data.orig_seq_lens[seq_idx] = seq_len
    inter_data.context_lens[seq_idx] = context_len
    inter_data.input_tokens[seq_idx].extend(tokens)
    inter_data.input_positions[seq_idx].extend(range(context_len, seq_len))
    inter_data.query_lens[seq_idx] = seq_len - context_len

    if seq_data.mrope_position_delta is not None:
        # MRotaryEmbedding is not imported at module level in this file; import
        # it here so that sequences carrying an mrope delta do not hit NameError.
        from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding

        if inter_data.mrope_input_positions is None:
            inter_data.mrope_input_positions = [None] * inter_data.n_seqs

        inter_data.mrope_input_positions[
            seq_idx] = MRotaryEmbedding.get_next_input_positions(
                seq_data.mrope_position_delta,
                context_len,
                seq_len,
            )


# ======================================
# Modified by Chunked Parallel Pipeline.
# ======================================
# @brief: Add two parameters, prefill_loc and token_chunk_size.
def vllm__worker__model_runner__ModelInputForGPUBuilder__add_seq_group(
    self, seq_group_metadata: SequenceGroupMetadata,
    prefill_loc: Optional[int] = None,
    token_chunk_size: Optional[int] = None,
):
    """Add a sequence group to the builder.

    Args:
        seq_group_metadata: Metadata of the sequence group to add.
        prefill_loc: Optional prefill location forwarded to the length
            computation step.
        token_chunk_size: Optional chunk size forwarded to the length
            computation step.
    """
    seq_ids = seq_group_metadata.seq_data.keys()
    n_seqs = len(seq_ids)
    is_prompt = seq_group_metadata.is_prompt

    if is_prompt:
        assert n_seqs == 1
        self.decode_only = False

    encoder_seq_len = 0

    if self.runner.model_config.is_encoder_decoder:
        encoder_seq_len = seq_group_metadata.encoder_seq_data.get_len()

    inter_data = self.init_cached_inter_data(
        request_id=seq_group_metadata.request_id,
        seq_ids=seq_ids,
        is_prompt=is_prompt,
        block_tables=seq_group_metadata.block_tables,
        computed_block_nums=seq_group_metadata.computed_block_nums,
        reinit=True,
        reinit_use_defaults=True,
        encoder_seq_len=encoder_seq_len)

    self.inter_data_list.append(inter_data)

    for seq_idx in range(n_seqs):
        for per_seq_fn in self.per_seq_compute_fns:
            # Only the hijacked length computation accepts the two extra
            # arguments; it is recognised by its qualified name. Every other
            # per-sequence function keeps the original three-argument call.
            if per_seq_fn.__qualname__ == \
                "vllm__worker__model_runner__ModelInputForGPUBuilder___compute_lens":
                per_seq_fn(inter_data, seq_idx, seq_group_metadata, prefill_loc, token_chunk_size)
            else:
                per_seq_fn(inter_data, seq_idx, seq_group_metadata)
    for per_seq_group_fn in self.per_seq_group_compute_fns:
        per_seq_group_fn(inter_data, seq_group_metadata)


def vllm__worker__model_runner__GPUModelRunnerBase___prepare_model_input_tensors(
    self,
    seq_group_metadata_list: List[SequenceGroupMetadata],
    finished_requests_ids: Optional[List[str]] = None,
    prefill_locs: Optional[List[int]] = None,
    token_chunk_sizes: Optional[List[int]] = None,
) -> TModelInputForGPU:
    """Helper method to prepare the model input based on a given sequence
    group. Prepares metadata needed for the base model forward pass but not
    metadata for possible additional steps, e.g., sampling.

    The API assumes seq_group_metadata_list is sorted by prefill -> decode.

    The result tensors and data structure also batches input in prefill
    -> decode order. For example,

    - input_tokens[:num_prefill_tokens] contains prefill tokens.
    - input_tokens[num_prefill_tokens:] contains decode tokens.

    If cuda graph is required, this API automatically pads inputs.

    `prefill_locs` and `token_chunk_sizes`, when given, must each hold one
    entry per sequence group; when omitted, None is forwarded for every
    sequence group.
    """
    builder = self._builder_cls(weakref.proxy(self), finished_requests_ids)
    # Chunked Parallel Pipeline: default the two per-group lists and check
    # that they line up with the sequence groups.
    if prefill_locs is None:
        prefill_locs = [None] * len(seq_group_metadata_list)

    assert len(prefill_locs) == len(seq_group_metadata_list), \
        "the lengths of prefill locs and seq_group_metadata are different."

    if token_chunk_sizes is None:
        token_chunk_sizes = [None] * len(seq_group_metadata_list)

    assert len(token_chunk_sizes) == len(seq_group_metadata_list), \
        "the lengths of token_chunk_sizes and seq_group_metadata are different."

    for seq_group_metadata, prefill_loc, token_chunk_size in zip(
        seq_group_metadata_list, prefill_locs, token_chunk_sizes
    ):
        builder.add_seq_group(seq_group_metadata, prefill_loc, token_chunk_size)

    builder.reset_cached_inter_data()

    return builder.build()  # type: ignore


# ======================================
# Modified by Chunked Parallel Pipeline.
# ======================================
# @brief: Add two parameters, prefill_loc and token_chunk_size.
def vllm__worker__model_runner__ModelRunner__prepare_model_input(
    self,
    seq_group_metadata_list: List[SequenceGroupMetadata],
    virtual_engine: int = 0,
    finished_requests_ids: Optional[List[str]] = None,
    prefill_locs: Optional[List[int]] = None,
    token_chunk_sizes: Optional[List[int]] = None,
) -> ModelInputForGPUWithSamplingMetadata:
    """Prepare the model input based on a given sequence group, including
    metadata for the sampling step.

    The API assumes seq_group_metadata_list is sorted by prefill -> decode.

    The result tensors and data structure also batches input in prefill
    -> decode order. For example,

    - input_tokens[:num_prefill_tokens] contains prefill tokens.
    - input_tokens[num_prefill_tokens:] contains decode tokens.

    If cuda graph is required, this API automatically pads inputs.

    `prefill_locs` and `token_chunk_sizes` are forwarded unchanged to
    `_prepare_model_input_tensors`.
    """
    model_input = self._prepare_model_input_tensors(
        seq_group_metadata_list,
        finished_requests_ids,
        prefill_locs,
        token_chunk_sizes)
    if get_pp_group().is_last_rank:
        # Sampling metadata is only required for the final pp group
        generators = self.get_generators(finished_requests_ids)
        sampling_metadata = SamplingMetadata.prepare(
            seq_group_metadata_list, model_input.seq_lens,
            model_input.query_lens, self.device, self.pin_memory,
            generators, self.sampling_metadata_cache)
    else:
        sampling_metadata = None
    is_prompt = (seq_group_metadata_list[0].is_prompt
                 if seq_group_metadata_list else None)
    return dataclasses.replace(model_input,
                               sampling_metadata=sampling_metadata,
                               is_prompt=is_prompt,
                               virtual_engine=virtual_engine)


MluHijackObject.apply_hijack(
    ModelInputForGPUBuilder,
    ModelInputForGPUBuilder._compute_lens,
    vllm__worker__model_runner__ModelInputForGPUBuilder___compute_lens
)

MluHijackObject.apply_hijack(
    ModelInputForGPUBuilder,
    ModelInputForGPUBuilder.add_seq_group,
    vllm__worker__model_runner__ModelInputForGPUBuilder__add_seq_group
)

MluHijackObject.apply_hijack(
    GPUModelRunnerBase,
    GPUModelRunnerBase._prepare_model_input_tensors,
    vllm__worker__model_runner__GPUModelRunnerBase___prepare_model_input_tensors
)

MluHijackObject.apply_hijack(
    ModelRunner,
    ModelRunner.prepare_model_input,
    vllm__worker__model_runner__ModelRunner__prepare_model_input
)
import dataclasses
import importlib
import os
import time
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union

import torch

from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.platforms import current_platform
from vllm.sequence import (ExecuteModelRequest, IntermediateTensors)
from vllm.utils import (enable_trace_function_call_for_thread,
                        update_environment_variables)
from vllm.worker.model_runner_base import (BroadcastableModelInput,
                                           ModelRunnerBase,
                                           ModelRunnerInputBase)
from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
                                     WorkerInput,
                                     extract_previous_hidden_states)
from vllm_mlu.mlu_hijack_utils import MluHijackObject
# NOTE(review): this second import rebinds `init_logger`, shadowing the one
# imported from vllm.logger above -- confirm that is intended.
from examples.cambricon_custom_func.vllm.mlu_hijack.common import init_logger

logger = init_logger(__name__)


def vllm__worker__worker_base__LocalOrDistributedWorkerBase___get_driver_input_and_broadcast(
    self, execute_model_req: ExecuteModelRequest,
    prefill_locs: Optional[List[int]] = None,
    token_chunk_sizes: Optional[List[int]] = None,
) -> Tuple[BroadcastableModelInput, WorkerInput, Dict[str, torch.Tensor]]:
    """Get the driver input and broadcast it to other workers.

    Builds the worker input and the model input on the driver and, when
    ``self.do_metadata_broadcast`` is set, broadcasts both (plus the kwargs
    extracted from the request) from rank 0.

    Args:
        execute_model_req: The request to prepare inputs for.
        prefill_locs: Forwarded to ``model_runner.prepare_model_input``.
        token_chunk_sizes: Forwarded to ``model_runner.prepare_model_input``.

    Returns:
        ``(model_input, worker_input, kwargs)``.
    """
    assert self.is_driver_worker

    worker_input: WorkerInput = self.prepare_worker_input(
        execute_model_req=execute_model_req)
    """
    ======================================
    Modified by Chunked Parallel Pipeline.
    ======================================
    Pass prefill location and chunk size parameters.
    """
    model_input: ModelRunnerInputBase = (
        self.model_runner.prepare_model_input(
            execute_model_req.seq_group_metadata_list,
            execute_model_req.virtual_engine,
            execute_model_req.finished_requests_ids,
            prefill_locs,
            token_chunk_sizes))
    """
    ======================================
    End by Chunked Parallel Pipeline.
    ======================================
    """

    kwargs = extract_previous_hidden_states(execute_model_req)

    if self.do_metadata_broadcast:
        broadcast_data = worker_input.as_broadcastable_tensor_dict()
        broadcast_data.update(model_input.as_broadcastable_tensor_dict())
        broadcast_data.update(kwargs)
        broadcast_tensor_dict(broadcast_data, src=0)

    # The callback is attached after the broadcast, so it is not part of
    # the broadcast payload.
    if execute_model_req.async_callback:
        model_input = dataclasses.replace(  # type: ignore
            model_input,
            async_callback=execute_model_req.async_callback)

    return model_input, worker_input, kwargs

def vllm__worker__worker_base__LocalOrDistributedWorkerBase__prepare_input(
    self,
    execute_model_req: Optional[ExecuteModelRequest] = None,
    prefill_locs: Optional[List[int]] = None,
    token_chunk_sizes: Optional[List[int]] = None,
) -> Optional[Tuple[BroadcastableModelInput, WorkerInput, Dict[
        str, torch.Tensor]]]:
    """
    Prepare the inputs to ModelRunner and workers.

    On the driver worker the inputs are built (and possibly broadcast) via
    ``_get_driver_input_and_broadcast``; on other workers they are read via
    ``_get_worker_input_from_broadcast``. Returns ``None`` on the driver
    when ``execute_model_req`` is ``None``.
    """
    if self.is_driver_worker:
        if execute_model_req is None:
            if self.do_metadata_broadcast:
                # This signals that there's no more requests to process for
                # now. All workers are running infinite loop with
                # broadcast_tensor_dict, and it stops the loop when the
                # driver broadcasts an empty input. Send an empty input to
                # notify all other workers to stop their execution loop.
                broadcast_tensor_dict({}, src=0)
            return None
        """
        ======================================
        Modified by Chunked Parallel Pipeline.
        ======================================
        Pass prefill location and chunk size parameters.
        """
        return self._get_driver_input_and_broadcast(
            execute_model_req, prefill_locs, token_chunk_sizes)
        """
        ======================================
        End by Chunked Parallel Pipeline.
        ======================================
        """
    else:
        return self._get_worker_input_from_broadcast()

def vllm__worker__worker_base__LocalOrDistributedWorkerBase__execute_model(
    self,
    execute_model_req: Optional[ExecuteModelRequest] = None,
    prefill_locs: Optional[List[int]] = None,
    token_chunk_sizes: Optional[List[int]] = None,
    priority: int = -1,
) -> Optional[List[SamplerOutput]]:
    """Executes at least one model step on the given sequences, unless no
    sequences are provided.

    Args:
        execute_model_req: Request to run; ``None`` is passed through to
            ``prepare_input``.
        prefill_locs: Forwarded to ``prepare_input``.
        token_chunk_sizes: Must be a one-element list once there is work to
            run; its single entry sizes the intermediate-tensor metadata.
        priority: Not referenced in this function body.

    Returns:
        ``None`` when there are no inputs, ``[]`` when the worker input has
        no sequence groups, ``[None]`` on ranks that are not the last
        pipeline rank, otherwise the model runner's output.
    """
    start_time = time.perf_counter()

    """
    ======================================
    Modified by Chunked Parallel Pipeline.
    ======================================
    Pass prefill location and chunk size parameters.
    """
    inputs = self.prepare_input(execute_model_req, prefill_locs, token_chunk_sizes)
    """
    ======================================
    End by Chunked Parallel Pipeline.
    ======================================
    """
    if inputs is None:
        return None

    model_input, worker_input, kwargs = inputs
    num_steps = worker_input.num_steps

    self.execute_worker(worker_input)

    # If there is no input, we don't need to execute the model.
    if worker_input.num_seq_groups == 0:
        return []

    """
    ======================================
    Modified by Chunked Parallel Pipeline.
    ======================================
    @brief: To prevent the execution of mlu pipeline interrupted by host communication,
            cancel the host communication and prepare metadata list directly.
    """
    # NOTE(review): this precondition is an `assert`, so it is skipped under
    # `python -O`; a None value would then fail at the indexing below.
    assert (token_chunk_sizes is not None and len(token_chunk_sizes) == 1)
    batch_size = token_chunk_sizes[0]
    # The metadata is built locally and handed to recv_tensor_dict below.
    metadata_list = self.model_runner.model.get_intermediate_tensor_metadata(
        batch_size,
        dtype=self.model_runner.model_config.dtype,
        device=self.model_runner.device)

    intermediate_tensors = None
    orig_model_execute_time = 0.0
    if not get_pp_group().is_first_rank:
        intermediate_tensors = IntermediateTensors(
            get_pp_group().recv_tensor_dict(
                all_gather_group=get_tp_group(),
                recv_metadata_list=metadata_list))
        if (self.observability_config is not None
                and self.observability_config.collect_model_execute_time):
            orig_model_execute_time = intermediate_tensors.tensors.get(
                "model_execute_time", torch.tensor(0)).item()
    """
    ======================================
    End by Chunked Parallel Pipeline.
    ======================================
    """

    output = self.model_runner.execute_model(
        model_input=model_input,
        kv_caches=self.kv_cache[worker_input.virtual_engine]
        if self.kv_cache is not None else None,
        intermediate_tensors=intermediate_tensors,
        num_steps=num_steps,
        **kwargs,
    )

    # Measured from function entry, so input preparation and the receive
    # above are included.
    model_execute_time = time.perf_counter() - start_time
    if not get_pp_group().is_last_rank:
        # output is IntermediateTensors
        if (self.observability_config is not None
                and self.observability_config.collect_model_execute_time):
            output.tensors["model_execute_time"] = torch.tensor(
                model_execute_time + orig_model_execute_time)
        get_pp_group().send_tensor_dict(output.tensors,
                                        all_gather_group=get_tp_group())
        return [None]
    if (self.observability_config is not None
            and self.observability_config.collect_model_execute_time
            and output is not None):
        for o in output:
            o.model_execute_time = (orig_model_execute_time +
                                    model_execute_time)

    # output is List[SamplerOutput]
    return output

# Install the chunked-pipeline replacements on LocalOrDistributedWorkerBase.
MluHijackObject.apply_hijack(
    LocalOrDistributedWorkerBase,
    LocalOrDistributedWorkerBase.prepare_input,
    vllm__worker__worker_base__LocalOrDistributedWorkerBase__prepare_input
)
MluHijackObject.apply_hijack(
    LocalOrDistributedWorkerBase,
    LocalOrDistributedWorkerBase._get_driver_input_and_broadcast,
    vllm__worker__worker_base__LocalOrDistributedWorkerBase___get_driver_input_and_broadcast
)
MluHijackObject.apply_hijack(
    LocalOrDistributedWorkerBase,
    LocalOrDistributedWorkerBase.execute_model,
    vllm__worker__worker_base__LocalOrDistributedWorkerBase__execute_model
)
from vllm import LLM, SamplingParams
from vllm.transformers_utils.config import get_config
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
import argparse
import numpy as np
import time
import torch
from tqdm import tqdm
from typing import Optional

# Benchmark script: measures the latency of one context (prefill) pass with
# tensor/context parallelism, or runs a single generation when --latency is
# not given.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--model",
                        type=str,
                        help="support /data/AE/llm/models/Llama-2-7b-hf/, \
                              /data/AE/llm/models/Llama-2-13b-hf/, \
                              /data/AE/llm/models/Llama-2-70b-hf/")
    parser.add_argument('--input_len', type=int, default=4096)
    parser.add_argument('--output_len', type=int, default=1)
    # Default to 1 so that omitting the flag does not hand None to LLM().
    parser.add_argument("--tensor_parallel_size", "-tp", type=int, default=1,
                        help="tp")
    parser.add_argument("--context_parallel_size", "-cp", type=int, default=1,
                        help="cp")
    parser.add_argument('--quantization',
                        '-q',
                        choices=[*QUANTIZATION_METHODS, None],
                        default=None)
    parser.add_argument('--num_iters_warmup',
                        type=int,
                        default=3,
                        help='Number of iterations to run for warmup.')
    parser.add_argument('--num_iters',
                        type=int,
                        default=10,
                        help='Number of iterations to run.')
    parser.add_argument('--trust_remote_code',
                        action='store_true',
                        help='trust remote code from huggingface')
    parser.add_argument('--latency',
                        action='store_true',
                        help='get context latency')
    args = parser.parse_args()

    print("model: ", args.model)
    print("seq_len: ", args.input_len)
    print("tensor_parallel_size: ", args.tensor_parallel_size)
    print("context_parallel_size: ", args.context_parallel_size)

    sampling_params = SamplingParams(temperature=0.8, max_tokens=args.output_len)
    # --quantization and --trust_remote_code were parsed but never reached
    # the engine; forward them so the flags take effect.
    llm = LLM(model=args.model, enforce_eager=True, max_model_len=args.input_len,
              max_num_batched_tokens=args.input_len, max_num_seqs=1,
              tensor_parallel_size=args.tensor_parallel_size,
              context_parallel_size=args.context_parallel_size,
              quantization=args.quantization,
              trust_remote_code=args.trust_remote_code)

    # Fixed seed so every run benchmarks the same dummy prompt.
    np.random.seed(0)
    dummy_prompt_token_ids = np.random.randint(10000, size=(1, args.input_len))
    dummy_prompt_token_ids = dummy_prompt_token_ids.tolist()

    if args.latency:
        def run_to_completion():
            """Run one generate() call and return its wall-clock latency."""
            start_time = time.perf_counter()
            llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                         sampling_params=sampling_params,
                         use_tqdm=False)
            end_time = time.perf_counter()
            latency = end_time - start_time
            return latency

        print("Warming up...")
        for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
            run_to_completion()

        # Benchmark.
        latencies = []
        for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
            latencies.append(run_to_completion())
        latencies = np.array(latencies)
        percentages = [10, 25, 50, 75, 90]
        percentiles = np.percentile(latencies, percentages)
        print(f'Avg latency: {np.mean(latencies)} seconds')
        for percentage, percentile in zip(percentages, percentiles):
            print(f'{percentage}% percentile latency: {percentile} seconds')
        llm.get_metrics(args.num_iters_warmup, False, args.input_len,
                        args.output_len, args.tensor_parallel_size,
                        args.quantization)
    else:
        # Single run; the outputs are not inspected.
        llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                     sampling_params=sampling_params)
from typing import Optional, Type
import torch
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm.attention.backends.abstract import AttentionType
from vllm.attention.backends.mlu_attn import MLUFlashAttentionMetadata
from vllm_mlu.attention.backends.mlu_attn import MLUFlashAttentionImpl_V2

from .ring_attn import zigzag_ring_attn
from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import (
    get_context_model_parallel_world_size)


# Keep a handle on the forward that is installed at import time so the
# wrapper below can delegate to it.
vllm__attention__backends__flash_attn__MLUFlashAttentionImpl__forward_org = MLUFlashAttentionImpl_V2.forward

def vllm__attention__backends__flash_attn__MLUFlashAttentionImpl__forward_wraper(
    self,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    kv_cache: torch.Tensor,
    attn_metadata: MLUFlashAttentionMetadata,
    k_scale: float = 1.0,
    v_scale: float = 1.0,
    attn_type: AttentionType = AttentionType.DECODER,
    use_mla: bool = False,
) -> torch.Tensor:
    '''
    ==========================
    Modify by Context Parallel
    ==========================
    @brief: use ring attn when context parallel

    Dispatches to zigzag ring attention when the context-parallel world size
    is greater than 1 and the batch carries prefill metadata; otherwise it
    calls the saved original forward.

    NOTE(review): `use_mla` is accepted but is not forwarded to either
    branch, and `k_scale`/`v_scale`/`attn_type` are not passed to the ring
    attention branch -- confirm that is intended.
    '''
    if get_context_model_parallel_world_size() > 1 and attn_metadata.prefill_metadata:
        # Ring attention takes q/k/v as [tokens, heads, head_size] views.
        return zigzag_ring_attn(self,
                                query=query.view(-1, self.num_heads, self.head_size),
                                key=key.view(-1, self.num_kv_heads, self.head_size),
                                value=value.view(-1, self.num_kv_heads, self.head_size),
                                kv_cache=kv_cache,
                                attn_metadata=attn_metadata)
    '''
    =======================
    End of Context Parallel
    =======================
    '''
    return vllm__attention__backends__flash_attn__MLUFlashAttentionImpl__forward_org(self,
                                                                                   query=query,
                                                                                   key=key,
                                                                                   value=value,
                                                                                   kv_cache=kv_cache,
                                                                                   attn_metadata=attn_metadata,
                                                                                   k_scale=k_scale,
                                                                                   v_scale=v_scale,
                                                                                   attn_type=attn_type)


# Replace MLUFlashAttentionImpl_V2.forward with the wrapper above.
MluHijackObject.apply_hijack(MLUFlashAttentionImpl_V2,
                             MLUFlashAttentionImpl_V2.forward,
                             vllm__attention__backends__flash_attn__MLUFlashAttentionImpl__forward_wraper)
from typing import List, Optional, Tuple

import torch
import torch.nn.functional as F


# code references: https://github.com/zhuzilin/ring-flash-attention
def _update_out_and_lse(
    out: torch.Tensor,
    lse: torch.Tensor,
    block_out: torch.Tensor,
    block_lse: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Merge one attention block into the running output / log-sum-exp.

    ``block_lse`` is transposed on its last two dims and given a trailing
    singleton dim so it lines up with ``lse`` before the update.
    """
    block_out = block_out.to(torch.float32)
    block_lse = block_lse.transpose(-2, -1).unsqueeze(dim=-1)
    # torch.sigmoid replaces the deprecated F.sigmoid alias; same values.
    out = out - torch.sigmoid(block_lse - lse) * (out - block_out)
    lse = lse - F.logsigmoid(lse - block_lse)
    return out, lse


def update_out_and_lse(
    out: Optional[torch.Tensor],
    lse: Optional[torch.Tensor],
    block_out: torch.Tensor,
    block_lse: torch.Tensor,
    slice_=None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Accumulate ``block_out`` / ``block_lse`` into ``out`` / ``lse``.

    Args:
        out: Running output, or ``None`` on the first call.
        lse: Running log-sum-exp, or ``None`` on the first call.
        block_out: Output of the block being merged.
        block_lse: Log-sum-exp of the block being merged.
        slice_: Optional index; when given, only ``out[slice_]`` and
            ``lse[slice_]`` are updated (in place).

    Returns:
        The updated ``(out, lse)`` pair.

    Raises:
        RuntimeError: If ``slice_`` is passed on the first call.
    """
    if out is None:
        if slice_ is not None:
            raise RuntimeError("first update_out_and_lse should not pass slice_ args")
        out = block_out.to(torch.float32)
        lse = block_lse.transpose(-2, -1).unsqueeze(dim=-1)
    elif slice_ is not None:
        slice_out, slice_lse = out[slice_], lse[slice_]
        slice_out, slice_lse = _update_out_and_lse(
            slice_out, slice_lse, block_out, block_lse
        )
        out[slice_], lse[slice_] = slice_out, slice_lse
    else:
        out, lse = _update_out_and_lse(out, lse, block_out, block_lse)
    return out, lse


def _half_bounds(cu_seq_lens, first_half):
    """Return ``(start, end)`` row bounds of one half of every sequence.

    The offsets are converted to Python ints once, instead of indexing with
    0-d tensors on every loop iteration.
    """
    offsets = (cu_seq_lens.tolist()
               if isinstance(cu_seq_lens, torch.Tensor) else list(cu_seq_lens))
    bounds = []
    for start, end in zip(offsets, offsets[1:]):
        mid = (start + end) // 2
        bounds.append((start, mid) if first_half else (mid, end))
    return bounds


def get_half(pack_tensor, cu_seq_lens, first_half):
    """Gather the first or second half of every packed sequence.

    ``cu_seq_lens`` holds cumulative sequence offsets into dim 0 of
    ``pack_tensor``; the selected halves are concatenated along dim 0.
    """
    return torch.cat(
        [pack_tensor[start:end]
         for start, end in _half_bounds(cu_seq_lens, first_half)],
        dim=0)


def update_half(pack_tensor, half_tensor, cu_seq_lens, first_half):
    """Scatter ``half_tensor`` back into one half of every packed sequence.

    This is the in-place inverse of :func:`get_half`. The read position in
    ``half_tensor`` is a running offset, so it always matches the layout
    ``get_half`` produces. The previous ``cu_seq_lens // 2`` indexing only
    matched when every offset was even and could otherwise broadcast a
    wrong-sized slice.
    """
    offset = 0
    for start, end in _half_bounds(cu_seq_lens, first_half):
        length = end - start
        pack_tensor[start:end] = half_tensor[offset:offset + length]
        offset += length


def zigzag_ring_attn(self,
                     query: torch.Tensor,  # [num_tokens, num_heads, head_size]
                     key: torch.Tensor,  # [num_tokens, num_heads, head_size]
                     value: torch.Tensor,  # [num_tokens, num_heads, head_size]
                     kv_cache: List[torch.Tensor],
                     attn_metadata: "MLUFlashAttentionMetadata") -> torch.Tensor:
    """Prefill attention over a context-parallel ring.

    Every step sends the current k / v / slot_mapping to the next rank while
    this rank writes them into its paged KV cache and attends to them; the
    partial results are merged with :func:`update_out_and_lse`.

    Returns:
        The attention output viewed as
        ``[num_tokens, num_heads * head_size]`` in ``query``'s dtype.
    """
    # Imported lazily so the tensor helpers above can be imported without
    # the MLU stack.
    from vllm import _mlu_ops as mlu_ops
    from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import get_context_model_parallel_group
    from ...distributed.ring_comm import RingComm

    num_tokens, _, _ = query.shape
    cu_seq_lens = attn_metadata.prefill_metadata.seq_start_loc
    batch_num = cu_seq_lens.shape[0] - 1
    block_seq_len = query.shape[0] // 2
    process_group = get_context_model_parallel_group().device_group
    comm = RingComm(process_group)  # k
    comm_ = RingComm(process_group)  # v
    comm__ = RingComm(process_group)  # slot_mapping

    q, k, v = query, key, value
    # q1 is the second half of every sequence's queries.
    if batch_num == 1:
        q1 = q[block_seq_len:]
    else:
        q1 = get_half(q, cu_seq_lens, False)
    slot_mapping = attn_metadata.slot_mapping

    out = None
    lse = None
    next_k, next_v = None, None
    next_slot_mapping = None

    def forward(q, k, v, causal):
        """Run one flash-attention block and return ``(out, lse)``."""
        if batch_num == 1:
            seq = q.shape[0]
            seq_k = k.shape[0]
            cu_seq_lens_q = torch.arange(0, seq+1, seq, dtype=torch.int32, device=q.device)
            cu_seq_lens_kv = torch.arange(0, seq_k+1, seq_k, dtype=torch.int32, device=q.device)
            max_seq_len_q = seq
            max_seq_len_kv = seq_k
        else:
            max_seq_len_q = attn_metadata.prefill_metadata.max_seq_len
            max_seq_len_kv = attn_metadata.prefill_metadata.max_seq_len
            cu_seq_lens_q = cu_seq_lens
            cu_seq_lens_kv = cu_seq_lens
            # A tensor shorter than the full packed length is a "half"
            # tensor, so its offsets and max length are halved.
            if q.shape[0] != cu_seq_lens[-1]:
                cu_seq_lens_q = cu_seq_lens // 2
                max_seq_len_q = max_seq_len_q // 2
            if k.shape[0] != cu_seq_lens[-1]:
                cu_seq_lens_kv = cu_seq_lens // 2
                max_seq_len_kv = max_seq_len_kv // 2
        alibi_slopes = None if self.alibi_slopes is None else \
            self.alibi_slopes.repeat(attn_metadata.num_prefills, 1)
        outputs = mlu_ops.flash_attention(q,
                                          k,
                                          v,
                                          None,
                                          cu_seq_lens_q,
                                          cu_seq_lens_kv,
                                          alibi_slopes,
                                          None,
                                          max_seq_len_q,
                                          max_seq_len_kv,
                                          self.scale,
                                          causal, -1, -1, torch.float, True)
        block_out, block_lse = outputs[0], outputs[1]

        if block_lse.shape[0] == 1:
            block_lse = block_lse[0]
        else:
            # block_lse shape is [batch, head_num_q, max_seq_q], the empty part will set 0
            # we need to modify the shape to [batch, head_num_q, total_seq_q]
            block_lse_list = []
            for batch in range(block_lse.shape[0]):
                block_lse_ = block_lse[batch][:, : cu_seq_lens_q[batch + 1] - cu_seq_lens_q[batch]]
                block_lse_list.append(block_lse_)
            block_lse = torch.cat(block_lse_list, dim=-1)

        return block_out, block_lse

    for step in range(comm.world_size):
        # Start the exchange for the next step before computing this one.
        if step + 1 != comm.world_size:
            next_k: torch.Tensor = comm.send_recv(k.contiguous())
            next_v: torch.Tensor = comm_.send_recv(v.contiguous())
            next_slot_mapping: torch.Tensor = comm__.send_recv(slot_mapping)
            comm.commit()
            comm_.commit()
            comm__.commit()

        # call mlu_ops.reshape_paged_cache
        if kv_cache[0].numel() > 0:
            kv_cache_, kv_cache_scale_ = kv_cache
            key_cache, value_cache = kv_cache_[0], kv_cache_[1]
            if isinstance(kv_cache[0], torch.Tensor) and kv_cache[0].dtype == torch.int8:
                key_cache_scale, value_cache_scale = kv_cache_scale_[0], kv_cache_scale_[1]
                mlu_ops.quant_to_paged_cache(k,
                                             v,
                                             key_cache,
                                             value_cache,
                                             key_cache_scale,
                                             value_cache_scale,
                                             slot_mapping.flatten())
            else:
                mlu_ops.reshape_paged_cache(k,
                                            v,
                                            key_cache,
                                            value_cache,
                                            slot_mapping.flatten())

        if step == 0:
            # Local block: causal attention over this rank's own k/v.
            block_out, block_lse = forward(q, k, v, causal = True)
            out, lse = update_out_and_lse(out, lse, block_out, block_lse)
        elif step <= comm.rank:
            # All queries attend to the first half of the received k/v.
            if batch_num == 1:
                k0 = k[:block_seq_len]
                v0 = v[:block_seq_len]
            else:
                k0 = get_half(k, cu_seq_lens, True)
                v0 = get_half(v, cu_seq_lens, True)
            block_out, block_lse = forward(q, k0, v0, causal = False)
            out, lse = update_out_and_lse(out, lse, block_out, block_lse)
        else:
            # Only the second-half queries attend to the received k/v.
            block_out, block_lse = forward(q1, k, v, causal = False)
            if batch_num == 1:
                out, lse = update_out_and_lse(out, lse, block_out, block_lse,
                                              slice_=(slice(block_seq_len, None)),)
            else:
                slice_out = get_half(out, cu_seq_lens, False)
                slice_lse = get_half(lse, cu_seq_lens, False)
                slice_out, slice_lse = update_out_and_lse(
                    slice_out, slice_lse, block_out, block_lse
                )
                update_half(out, slice_out, cu_seq_lens, False)
                update_half(lse, slice_lse, cu_seq_lens, False)

        if step + 1 != comm.world_size:
            comm.wait()
            comm_.wait()
            comm__.wait()
            k = next_k
            v = next_v
            slot_mapping = next_slot_mapping
    out = out.to(q.dtype)
    return out.view(num_tokens, self.num_heads * self.head_size)
from typing import Optional
import torch
import torch.distributed as dist
import torch.nn.functional as F


# code references: https://github.com/zhuzilin/ring-flash-attention
class RingComm:
    """Batched point-to-point exchange with the ring neighbours of a group.

    Usage is ``send_recv`` (queue ops) -> ``commit`` (launch) -> ``wait``
    (block and reset). The instance sends to rank + 1 and receives from
    rank - 1, both modulo the group size.
    """

    def __init__(self, process_group: dist.ProcessGroup):
        group = process_group
        self._process_group = group
        self._ops = []
        self._reqs = None
        self.rank = dist.get_rank(group)
        self.world_size = dist.get_world_size(group)

        next_rank = (self.rank + 1) % self.world_size
        prev_rank = (self.rank - 1) % self.world_size
        # With an explicit group the neighbours are group-local ranks and
        # are translated to global ranks.
        if group is not None:
            next_rank = dist.get_global_rank(group, next_rank)
            prev_rank = dist.get_global_rank(group, prev_rank)
        self.send_rank = next_rank
        self.recv_rank = prev_rank

    def send_recv(
        self, to_send: torch.Tensor, recv_tensor: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """Queue a send of ``to_send`` and a matching receive.

        Returns the receive buffer: ``recv_tensor`` if given, otherwise a
        new tensor shaped like ``to_send``. It is only filled after
        ``commit`` and ``wait``.
        """
        recv_buf = torch.empty_like(to_send) if recv_tensor is None else recv_tensor
        group = self._process_group
        self._ops.extend((
            dist.P2POp(dist.isend, to_send, self.send_rank, group=group),
            dist.P2POp(dist.irecv, recv_buf, self.recv_rank, group=group),
        ))
        return recv_buf

    def commit(self):
        """Launch every queued op; raises if a batch is still pending."""
        if self._reqs is not None:
            raise RuntimeError("commit called twice")
        self._reqs = dist.batch_isend_irecv(self._ops)

    def wait(self):
        """Block until the committed ops finish, then reset the queue."""
        pending = self._reqs
        if pending is None:
            raise RuntimeError("wait called before commit")
        for request in pending:
            request.wait()
        self._reqs = None
        self._ops = []
# ---------------------------------------------------------------------------
# context_parallel/mlu_hijack/executor/gpu_executor.py
# ---------------------------------------------------------------------------
from typing import Any, Dict, Optional

from vllm.executor.gpu_executor import GPUExecutor
from vllm_mlu.mlu_hijack_utils import MluHijackObject

def vllm__executor__gpu_executor__GPUExecutor___get_worker_kwargs(
    self,
    local_rank: int = 0,
    rank: int = 0,
    distributed_init_method: Optional[str] = None,
) -> Dict[str, Any]:
    """Return worker init args for a given rank.

    Args:
        local_rank: Rank of the worker on its node.
        rank: Global rank of the worker.
        distributed_init_method: Init URL; when ``None`` one is built from
            the local IP and a free port.

    Returns:
        Keyword arguments for the worker, including ``is_driver_worker``.
    """
    if distributed_init_method is None:
        # These helpers were used here without ever being imported in this
        # module, so this branch raised NameError.
        from vllm.utils import (get_distributed_init_method, get_ip,
                                get_open_port)
        distributed_init_method = get_distributed_init_method(
            get_ip(), get_open_port())
    # ==========================
    # Modify by Context Parallel
    # ==========================
    # @brief: replace self.parallel_config.tensor_parallel_size with
    #         self.parallel_config.world_size.
    return dict(
        vllm_config=self.vllm_config,
        local_rank=local_rank,
        rank=rank,
        distributed_init_method=distributed_init_method,
        is_driver_worker=(not self.parallel_config)
        or (rank % self.parallel_config.world_size == 0),
    )
    # =======================
    # End of Context Parallel
    # =======================


MluHijackObject.apply_hijack(
    GPUExecutor,
    GPUExecutor._get_worker_kwargs,
    vllm__executor__gpu_executor__GPUExecutor___get_worker_kwargs)


# ---------------------------------------------------------------------------
# context_parallel/mlu_hijack/executor/ray_mlu_executor.py
# ---------------------------------------------------------------------------
from collections import defaultdict
from typing import TYPE_CHECKING, Dict, List, Optional

import vllm.envs as envs
from vllm.executor.ray_utils import RayWorkerWrapper, ray
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                        get_vllm_instance_id)
from vllm_mlu._mlu_utils import VLLM_LATENCY_DEBUG, VLLM_LATENCY_DEBUG_NO_DEVICE
from examples.cambricon_custom_func.vllm.mlu_hijack.common import init_logger
from vllm.executor.ray_mlu_executor import RayMLUExecutor
from vllm_mlu.mlu_hijack_utils import MluHijackObject

if ray is not None:
    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

if TYPE_CHECKING:
    from ray.util.placement_group import PlacementGroup

logger = init_logger(__name__)


def vllm__executor__ray_mlu_executor__RayMLUExecutor___init_workers_ray(
        self, placement_group: "PlacementGroup",
        **ray_remote_kwargs):
    """Create the Ray workers, set up their environment and load the model.

    Compared with the stock implementation, the final driver / non-driver
    split uses ``parallel_config.world_size`` instead of the tensor-parallel
    size.

    Raises:
        ValueError: If no worker lands on the driver node (non-SPMD mode).
        RuntimeError: If the number of node ids and unique IPs differ.
    """
    # NOTE(review): this test only looks at tp and pp size; with context
    # parallel enabled and tp == pp == 1 it still takes the fractional-GPU
    # branch -- confirm that is intended.
    if (self.parallel_config.tensor_parallel_size == 1
            and self.parallel_config.pipeline_parallel_size == 1):
        # For single GPU case, we use a ray worker with constrained memory.
        num_gpus = self.cache_config.gpu_memory_utilization
    else:
        # Otherwise, the ray workers are allocated with a full GPU.
        num_gpus = 1

    # The driver dummy worker does not actually use any resources.
    # It holds the resource for the driver worker.
    self.driver_dummy_worker: Optional[RayWorkerWrapper] = None
    # The remaining workers are the actual ray actors.
    self.workers: List[RayWorkerWrapper] = []

    # Used in ray compiled DAG: indexed first by PP rank,
    # and then TP rank. In other words, the inner list is
    # the TP group of workers for a PP rank.
    self.pp_tp_workers: List[List[RayWorkerWrapper]] = []

    if self.parallel_config.ray_workers_use_nsight:
        ray_remote_kwargs = self._configure_ray_workers_use_nsight(
            ray_remote_kwargs)

    logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker)

    # Create the workers.
    driver_ip = get_ip()
    worker_wrapper_kwargs = self._get_worker_wrapper_args()
    for bundle_id, bundle in enumerate(placement_group.bundle_specs):
        if not bundle.get("GPU", 0):
            continue
        scheduling_strategy = PlacementGroupSchedulingStrategy(
            placement_group=placement_group,
            placement_group_capture_child_tasks=True,
            placement_group_bundle_index=bundle_id,
        )

        worker = ray.remote(
            num_cpus=0,
            num_gpus=num_gpus,
            scheduling_strategy=scheduling_strategy,
            **ray_remote_kwargs,
        )(RayWorkerWrapper).remote(**worker_wrapper_kwargs)

        if self.use_ray_spmd_worker:
            self.workers.append(worker)
        else:
            worker_ip = ray.get(worker.get_node_ip.remote())
            if worker_ip == driver_ip and self.driver_dummy_worker is None:
                # If the worker is on the same node as the driver, we use it
                # as the resource holder for the driver process.
                self.driver_dummy_worker = worker
                self.driver_worker = RayWorkerWrapper(
                    **worker_wrapper_kwargs)
            else:
                # Else, added to the list of workers.
                self.workers.append(worker)

    logger.debug("workers: %s", self.workers)
    logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker)
    if not self.use_ray_spmd_worker and self.driver_dummy_worker is None:
        raise ValueError(
            "Ray does not allocate any GPUs on the driver node. Consider "
            "adjusting the Ray placement group or running the driver on a "
            "GPU node.")

    worker_ips = [
        ray.get(worker.get_node_ip.remote())  # type: ignore[attr-defined]
        for worker in self.workers
    ]
    ip_counts: Dict[str, int] = {}
    for ip in worker_ips:
        ip_counts[ip] = ip_counts.get(ip, 0) + 1

    def sort_by_driver_then_worker_ip(worker):
        """
        Sort the workers based on 3 properties:
        1. If the worker is on the same node as the driver (vllm engine),
            it should be placed first.
        2. Then, if the worker is on a node with fewer workers, it should
            be placed first.
        3. Finally, if the work is on a node with smaller IP address, it
            should be placed first.
        """
        ip = ray.get(worker.get_node_ip.remote())
        return (ip != driver_ip, ip_counts[ip], ip)

    # After sorting, the workers on the same node will be
    # close to each other, and the workers on the driver
    # node will be placed first.
    self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip)

    # Get the set of GPU IDs used on each node.
    worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids",
                                                use_dummy_driver=True)

    node_workers = defaultdict(list)  # node id -> list of worker ranks
    node_gpus = defaultdict(list)  # node id -> list of gpu ids

    for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
        node_workers[node_id].append(i)
        # `gpu_ids` can be a list of strings or integers.
        # convert them to integers for consistency.
        # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs),
        # string sorting is not sufficient.
        # see https://github.com/vllm-project/vllm/issues/5590
        gpu_ids = [int(x) for x in gpu_ids]
        node_gpus[node_id].extend(gpu_ids)
    for node_id, gpu_ids in node_gpus.items():
        node_gpus[node_id] = sorted(gpu_ids)

    all_ips = set(worker_ips + [driver_ip])
    n_ips = len(all_ips)
    n_nodes = len(node_workers)

    if n_nodes != n_ips:
        raise RuntimeError(
            f"Every node should have a unique IP address. Got {n_nodes}"
            f" nodes with node ids {list(node_workers.keys())} and "
            f"{n_ips} unique IP addresses {all_ips}. Please check your"
            " network configuration. If you set `VLLM_HOST_IP` or "
            "`HOST_IP` environment variable, make sure it is unique for"
            " each node.")

    VLLM_INSTANCE_ID = get_vllm_instance_id()

    # Set environment variables for the driver and workers.
    all_args_to_update_environment_variables = [({
        "MLU_VISIBLE_DEVICES":
        ",".join(map(str, node_gpus[node_id])),
        "VLLM_INSTANCE_ID":
        VLLM_INSTANCE_ID,
        "VLLM_TRACE_FUNCTION":
        str(envs.VLLM_TRACE_FUNCTION),
        **({
            "VLLM_ATTENTION_BACKEND": envs.VLLM_ATTENTION_BACKEND
        } if envs.VLLM_ATTENTION_BACKEND is not None else {}),
        "VLLM_LATENCY_DEBUG":
        '1' if VLLM_LATENCY_DEBUG else '0',
        "VLLM_LATENCY_DEBUG_NO_DEVICE":
        '1' if VLLM_LATENCY_DEBUG_NO_DEVICE else '0',
    }, ) for (node_id, _) in worker_node_and_gpu_ids]

    self._env_vars_for_all_workers = (
        all_args_to_update_environment_variables)

    self._run_workers("update_environment_variables",
                      all_args=self._get_env_vars_to_be_updated())

    if len(node_gpus) == 1:
        # in single node case, we don't need to get the IP address.
        # the loopback address is sufficient
        # NOTE: a node may have several IP addresses, one for each
        # network interface. `get_ip()` might return any of them,
        # while they might not work for communication inside the node
        # if the network setup is complicated. Using the loopback address
        # solves this issue, as it always works for communication inside
        # the node.
        driver_ip = "127.0.0.1"
    distributed_init_method = get_distributed_init_method(
        driver_ip, get_open_port())

    # Initialize the actual workers inside worker wrapper.
    init_worker_all_kwargs = [
        self._get_worker_kwargs(
            local_rank=node_workers[node_id].index(rank),
            rank=rank,
            distributed_init_method=distributed_init_method,
        ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids)
    ]
    self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)

    self._run_workers("init_device")
    self._run_workers("load_model",
                      max_concurrent_workers=self.parallel_config.
                      max_parallel_loading_workers)

    if self.use_ray_spmd_worker:
        # NOTE(review): this layout is indexed by pp and tp rank only;
        # verify it is still right when context parallel size > 1.
        for pp_rank in range(self.parallel_config.pipeline_parallel_size):
            self.pp_tp_workers.append([])
            for tp_rank in range(
                    self.parallel_config.tensor_parallel_size):
                # PP=2, TP=4
                # pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]]
                rank = (pp_rank * self.parallel_config.tensor_parallel_size
                        ) + tp_rank
                assert len(self.pp_tp_workers[pp_rank]) == tp_rank
                assert pp_rank < len(self.pp_tp_workers)
                self.pp_tp_workers[pp_rank].append(self.workers[rank])

    # This is the list of workers that are rank 0 of each TP group EXCEPT
    # global rank 0. These are the workers that will broadcast to the
    # rest of the workers.
    self.tp_driver_workers: List[RayWorkerWrapper] = []
    # This is the list of workers that are not drivers and not the first
    # worker in a TP group. These are the workers that will be
    # broadcasted to.
    self.non_driver_workers: List[RayWorkerWrapper] = []

    # Enforce rank order for correct rank to return final output.
    for index, worker in enumerate(self.workers):
        # The driver worker is rank 0 and not in self.workers.
        rank = index + 1
        # ==========================
        # Modify by Context Parallel
        # ==========================
        # @brief: replace tp size with world_size.
        if rank % self.parallel_config.world_size == 0:
            self.tp_driver_workers.append(worker)
        else:
            self.non_driver_workers.append(worker)
        # =======================
        # End of Context Parallel
        # =======================

MluHijackObject.apply_hijack(RayMLUExecutor,
                             RayMLUExecutor._init_workers_ray,
                             vllm__executor__ray_mlu_executor__RayMLUExecutor___init_workers_ray)
import get_world_group +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.attention import AttentionMetadata +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.layers.logits_processor import LogitsProcessor, _prune_hidden_states, _apply_logits_processors +from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import ( + get_context_model_parallel_world_size, get_context_model_parallel_rank, get_tensor_model_parallel_world_size) + + +def vllm__module_executor__layers__logits_processor__LogitsProcessor__forward_wraper( + self, + lm_head: VocabParallelEmbedding, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + embedding_bias: Optional[torch.Tensor] = None, +) -> torch.Tensor: + if self.logits_as_input: + logits = hidden_states + else: + ''' + ========================== + Modify by Context Parallel + ========================== + @brief: context parallel requires special handling of hidden_states and logits + ''' + if self.attn_metadata and get_context_model_parallel_world_size() > 1: + hidden_states = _prune_hidden_states_context_parallel(hidden_states, sampling_metadata, self.attn_metadata) + else: + hidden_states = _prune_hidden_states(hidden_states, sampling_metadata) + ''' + ======================= + End of Context Parallel + ======================= + ''' + # Get the logits for the next tokens. + logits = self._get_logits(hidden_states, lm_head, embedding_bias) + if logits is not None: + if self.soft_cap is not None: + logits = logits / self.soft_cap + logits = torch.tanh(logits) + logits = logits * self.soft_cap + + if self.scale != 1.0: + logits *= self.scale + + # Apply logits processors (if any). 
+ if sampling_metadata is not None: + logits = _apply_logits_processors(logits, sampling_metadata) + + return logits + + +''' +========================== +Modify by Context Parallel +========================== +@brief: token num can be divisible by context_parallel_size * 2 after padding, + and then split to context parallel groups with zigzag method, now we + need to find the last valid tokens, and get the logits for the next tokens. +''' +def _prune_hidden_states_context_parallel( + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + attn_metadata: AttentionMetadata +) -> torch.Tensor: + select_hidden_states_list = [] + seq_start_loc = attn_metadata.prefill_metadata.seq_start_loc + batch_num = seq_start_loc.shape[0] - 1 + for batch in range(batch_num): + start = seq_start_loc[batch] + end = seq_start_loc[batch + 1] + hidden_states_ = hidden_states[start : end] + split_seq_len = hidden_states_.shape[0] // 2 + seq_len = attn_metadata.prefill_metadata.seq_lens[batch] + last_id = seq_len - 1 + idx = last_id // split_seq_len + select_hidden_states = torch.zeros((1, hidden_states.shape[-1]), dtype = hidden_states.dtype, device = hidden_states.device) + if idx < get_context_model_parallel_world_size(): + target_cp_id = idx + src_rank = get_tensor_model_parallel_world_size() * target_cp_id + if get_context_model_parallel_rank() == target_cp_id: + selected_token_indices = last_id - idx * split_seq_len + select_hidden_states = hidden_states_[selected_token_indices].unsqueeze(0) + else: + target_cp_id = get_context_model_parallel_world_size() * 2 - 1 - idx + src_rank = get_tensor_model_parallel_world_size() * target_cp_id + if get_context_model_parallel_rank() == target_cp_id: + selected_token_indices = last_id - idx * split_seq_len + split_seq_len + select_hidden_states = hidden_states_[selected_token_indices].unsqueeze(0) + + select_hidden_states = get_world_group().broadcast(select_hidden_states, src = src_rank) + 
select_hidden_states_list.append(select_hidden_states) + + select_hidden_states = torch.cat(select_hidden_states_list, dim=0) + return select_hidden_states +''' +======================= +End of Context Parallel +======================= +''' + + +MluHijackObject.apply_hijack(LogitsProcessor, + LogitsProcessor.forward, + vllm__module_executor__layers__logits_processor__LogitsProcessor__forward_wraper) diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/model_executor/layers/rotary_embedding.py b/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/model_executor/layers/rotary_embedding.py new file mode 100644 index 0000000..f6dda0a --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/model_executor/layers/rotary_embedding.py @@ -0,0 +1,62 @@ +from typing import Optional, Tuple +import torch +import vllm +from vllm_mlu.mlu_hijack_utils import MluHijackObject +from vllm_mlu.model_executor.layers.rotary_embedding import MLURotaryEmbedding +from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import ( + get_context_model_parallel_world_size) + +def vllm__module_executor__layers__rotary_embedding__MLURotaryEmbedding__forward_mlu_wraper( + self, + positions: torch.Tensor, + x: torch.Tensor, + offsets: Optional[torch.Tensor] = None, +) -> Tuple[torch.Tensor, torch.Tensor]: + from vllm import _mlu_ops as mlu_ops + + # ops.rotary_embedding()/batched_rotary_embedding() + # are in-place operations that update the query and key tensors. 
+ if offsets is not None: + raise ValueError(f"tmo.apply_rotary not support offsets yet.") + else: + if MLURotaryEmbedding.set_cos_sin == False: + MLURotaryEmbedding.cos_, MLURotaryEmbedding.sin_ = self._get_cos_sin() + MLURotaryEmbedding.set_cos_sin = True + interleaved = True + if self.is_neox_style: + interleaved = False + if MLURotaryEmbedding.is_chunked or not MLURotaryEmbedding.is_prompt: + position_ids = positions + discrete = True + else : + position_ids = None + discrete = False + ''' + ========================== + Modify by Context Parallel + ========================== + @brief: context parallel need discrete = True + ''' + position_ids = None if (MLURotaryEmbedding.is_prompt and get_context_model_parallel_world_size == 1) else positions + discrete = False if (MLURotaryEmbedding.is_prompt and get_context_model_parallel_world_size == 1) else True + ''' + ======================= + End of Context Parallel + ======================= + ''' + x = mlu_ops.rotary_embedding(x, + MLURotaryEmbedding.sin_, + MLURotaryEmbedding.cos_, + position_ids, + MLURotaryEmbedding.cu_seq_lens, + interleaved, + discrete, + False, + MLURotaryEmbedding.max_seq_len) + + return x + + +MluHijackObject.apply_hijack(MLURotaryEmbedding, + MLURotaryEmbedding.forward_mlu, + vllm__module_executor__layers__rotary_embedding__MLURotaryEmbedding__forward_mlu_wraper) diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/worker/__init__.py b/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/worker/__init__.py new file mode 100644 index 0000000..61381c7 --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/worker/__init__.py @@ -0,0 +1,5 @@ +from . import mlu_model_runner +from . import model_runner +from . import model_runner_base +from . import worker +from . 
import worker_base diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/worker/mlu_model_runner.py b/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/worker/mlu_model_runner.py new file mode 100644 index 0000000..804c539 --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/worker/mlu_model_runner.py @@ -0,0 +1,256 @@ +import torch +from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Set, + Tuple, Type, TypeVar, Union) +from vllm.forward_context import set_forward_context +from vllm.multimodal.inputs import MultiModalKwargs +from vllm_mlu.mlu_hijack_utils import MluHijackObject +from vllm_mlu._mlu_utils import * +from vllm.worker.model_runner import ( + TModelInputForGPU, ModelInputForGPU, + ModelInputForGPUWithSamplingMetadata, + ModelInputForGPUBuilder, GPUModelRunnerBase, + ModelRunner, CUDAGraphRunner, + LORA_WARMUP_RANK, _get_graph_batch_size, + _BATCH_SIZES_TO_CAPTURE, _NUM_WARMUP_ITERS +) +from vllm.worker.mlu_model_runner import MLUModelRunner +from vllm.sequence import (IntermediateTensors, SequenceGroupMetadata) +from vllm.distributed import get_pp_group +from vllm.model_executor.layers.sampler import SamplerOutput +from ..zigzag_utils import get_context_model_parallel_world_size, zigzag_split +import vllm.envs as envs + +try: + from flashinfer import BatchDecodeWithPagedKVCacheWrapper + from flashinfer.decode import CUDAGraphBatchDecodeWithPagedKVCacheWrapper + from flashinfer.prefill import BatchPrefillWithPagedKVCacheWrapper + FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024 +except ImportError: + BatchDecodeWithPagedKVCacheWrapper = None + CUDAGraphBatchDecodeWithPagedKVCacheWrapper = None + BatchPrefillWithPagedKVCacheWrapper = None + FLASHINFER_WORKSPACE_BUFFER_SIZE = 0 + +_PAD_SLOT_ID = -1 + +@torch.inference_mode() +def vllm__worker__mlu_model_runner__MLUModelRunner__execute_model( + self, + model_input: 
ModelInputForGPUWithSamplingMetadata, + kv_caches: List[torch.Tensor], + intermediate_tensors: Optional[IntermediateTensors] = None, + num_steps: int = 1, +) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: + if num_steps > 1: + raise ValueError("num_steps > 1 is not supported in ModelRunner") + + if self.lora_config: + assert model_input.lora_requests is not None + assert model_input.lora_mapping is not None + self.set_active_loras(model_input.lora_requests, + model_input.lora_mapping) + + if self.prompt_adapter_config: + assert model_input.prompt_adapter_requests is not None + assert model_input.prompt_adapter_mapping is not None + self.set_active_prompt_adapters( + model_input.prompt_adapter_requests, + model_input.prompt_adapter_mapping) + + self.attn_state.begin_forward(model_input) + + # Currently cuda graph is only supported by the decode phase. + assert model_input.attn_metadata is not None + prefill_meta = model_input.attn_metadata.prefill_metadata + decode_meta = model_input.attn_metadata.decode_metadata + # TODO(andoorve): We can remove this once all + # virtual engines share the same kv cache. 
+ virtual_engine = model_input.virtual_engine + if prefill_meta is None and decode_meta.use_cuda_graph: + assert model_input.input_tokens is not None + graph_batch_size = model_input.input_tokens.shape[0] + model_executable = self.graph_runners[virtual_engine][ + graph_batch_size] + else: + model_executable = self.model + + multi_modal_kwargs = model_input.multi_modal_kwargs or {} + seqlen_agnostic_kwargs = { + "finished_requests_ids": model_input.finished_requests_ids, + "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, + } if self.has_inner_state else {} + if (self.observability_config is not None + and self.observability_config.collect_model_forward_time): + model_forward_start = torch.mlu.Event(enable_timing=True) + model_forward_end = torch.mlu.Event(enable_timing=True) + model_forward_start.record() + + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: add mlu metrics + ''' + # Add time markers for model_executable+compute_logits + if VLLM_LATENCY_DEBUG_WITH_DEVICE_EN: + use_cuda_graph = ((prefill_meta is None and decode_meta.use_cuda_graph) + or use_context_mlugraph) + # if use_cuda_graph, the start timestamp will be inserted inside MLUGraphRunner.forward() + if not use_cuda_graph: + start = torch.mlu.Event(enable_timing=True) + start.record() + + ''' + ========================== + Modify by Context Parallel + ========================== + @brief: context parallel split input for model with zigzag method + ''' + if get_context_model_parallel_world_size() > 1 and model_input.attn_metadata.prefill_metadata: + with set_forward_context(model_input.attn_metadata): + zigzag_input_ids, zigzag_positions, zigzag_attn_metadata = zigzag_split(model_input.input_tokens, + model_input.input_positions, + model_input.attn_metadata, _PAD_SLOT_ID) + hidden_or_intermediate_states = model_executable( + input_ids=zigzag_input_ids, + positions=zigzag_positions, + kv_caches=kv_caches, + attn_metadata=zigzag_attn_metadata, 
+ intermediate_tensors=intermediate_tensors, + **multi_modal_kwargs, + **seqlen_agnostic_kwargs) + else: + with set_forward_context(model_input.attn_metadata): + hidden_or_intermediate_states = model_executable( + input_ids=model_input.input_tokens, + positions=model_input.input_positions, + kv_caches=kv_caches, + attn_metadata=model_input.attn_metadata, + intermediate_tensors=intermediate_tensors, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs, + device=self.device), + **seqlen_agnostic_kwargs) + + ################################################################################################# + # DEBUG # + ################################################################################################# + # import os + # from vllm.distributed import get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank + # from from examples.cambricon_custom_funcvllm.mlu_hijack.distributed.parallel_state import ( + # get_context_model_parallel_rank) + # from ..zigzag_utils import context_parallel_tensor_all_gather, diff1 + # if get_context_model_parallel_world_size() > 1 and attn_metadata.prefill_metadata: + # hidden_states = context_parallel_tensor_all_gather(hidden_states, zigzag_attn_metadata, dim=0) + # if attn_metadata.prefill_metadata and (kv_caches[0] is not None): + # file_path = '/workspace/output_base_' + str(hidden_states.shape) + \ + # '_tp_' + str(get_tensor_model_parallel_world_size()) + '.pth' + # if get_context_model_parallel_rank() == 0 and get_tensor_model_parallel_rank() == 0: + # if os.path.exists(file_path): + # print("##################compare################") + # hidden_states_base = torch.load(file_path) + # print("########output_diff1: ", diff1(hidden_states, hidden_states_base)) + # else: + # print("##################save base################") + # torch.save(hidden_states, file_path) + + ''' + @brief: logits_processor in context parallel need attn_metadata param + ''' + if get_context_model_parallel_world_size() > 1 and 
model_input.attn_metadata.prefill_metadata: + setattr(self.model.logits_processor, 'attn_metadata', zigzag_attn_metadata) + else: + setattr(self.model.logits_processor, 'attn_metadata', None) + ''' + ======================= + End of Context Parallel + ======================= + ''' + + if (self.observability_config is not None + and self.observability_config.collect_model_forward_time): + model_forward_end.record() + + # Compute the logits in the last pipeline stage. + if not get_pp_group().is_last_rank: + if (self.is_driver_worker + and hidden_or_intermediate_states is not None + and isinstance(hidden_or_intermediate_states, + IntermediateTensors) + and self.observability_config is not None + and self.observability_config.collect_model_forward_time): + model_forward_end.synchronize() + model_forward_time = model_forward_start.elapsed_time( + model_forward_end) + orig_model_forward_time = 0.0 + if intermediate_tensors is not None: + orig_model_forward_time = intermediate_tensors.tensors.get( + "model_forward_time", torch.tensor(0.0)).item() + hidden_or_intermediate_states.tensors["model_forward_time"] = ( + torch.tensor(model_forward_time + orig_model_forward_time)) + return hidden_or_intermediate_states + + logits = self.model.compute_logits(hidden_or_intermediate_states, + model_input.sampling_metadata) + + # Add time markers for model_executable+compute_logits + if VLLM_LATENCY_DEBUG_WITH_DEVICE_EN: + end_marker = torch.mlu.Event(enable_timing=True) + end_marker.record() + if use_cuda_graph: + self.time_markers = (model_executable.start, end_marker) + else: + self.time_markers = (start, end_marker) + ''' + ================== + End of MLU Hijack + ================== + ''' + if not self.is_driver_worker: + return [] + + if model_input.async_callback is not None: + model_input.async_callback() + + # Sample the next token. 
+ output: SamplerOutput = self.model.sample( + logits=logits, + sampling_metadata=model_input.sampling_metadata, + ) + if (self.observability_config is not None + and self.observability_config.collect_model_forward_time + and output is not None): + model_forward_end.synchronize() + model_forward_time = model_forward_start.elapsed_time( + model_forward_end) + orig_model_forward_time = 0.0 + if intermediate_tensors is not None: + orig_model_forward_time = intermediate_tensors.tensors.get( + "model_forward_time", torch.tensor(0.0)).item() + # If there are multiple workers, we are still tracking the latency + # from the start time of the driver worker to the end time of the + # driver worker. The model forward time will then end up covering + # the communication time as well. + output.model_forward_time = (orig_model_forward_time + + model_forward_time) + + + if self.return_hidden_states: + # we only need to pass hidden states of most recent token + assert model_input.sampling_metadata is not None + indices = model_input.sampling_metadata.selected_token_indices + if model_input.is_prompt: + hidden_states = hidden_or_intermediate_states.index_select( + 0, indices) + elif decode_meta.use_cuda_graph: + hidden_states = hidden_or_intermediate_states[:len(indices)] + else: + hidden_states = hidden_or_intermediate_states + + output.hidden_states = hidden_states + + return [output] + + +MluHijackObject.apply_hijack(MLUModelRunner, + MLUModelRunner.execute_model, + vllm__worker__mlu_model_runner__MLUModelRunner__execute_model) diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/worker/model_runner.py b/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/worker/model_runner.py new file mode 100644 index 0000000..cff8beb --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/worker/model_runner.py @@ -0,0 +1,35 @@ +from typing import (Any, Dict, Optional) + +from vllm_mlu.mlu_hijack_utils 
import MluHijackObject +from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata +from examples.cambricon_custom_func.context_parallel.mlu_hijack.worker.model_runner_base import vllm__worker__model_runner_base___init_sampling_metadata_from_tensor_dict +from vllm.worker.model_runner_base import _init_attn_metadata_from_tensor_dict + +@classmethod +def vllm__worker__model_runner__ModelInputForGPUWithSamplingMetadata__from_broadcasted_tensor_dict( + cls, + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, +) -> "ModelInputForGPUWithSamplingMetadata": + ''' + ========================== + Modify by Context Parallel + ========================== + @brief: force apply hijacked function. + ''' + tensor_dict = vllm__worker__model_runner_base___init_sampling_metadata_from_tensor_dict(tensor_dict) + ''' + ======================= + End of Context Parallel + ======================= + ''' + if attn_backend is not None: + tensor_dict = _init_attn_metadata_from_tensor_dict( + attn_backend, tensor_dict) + return cls(**tensor_dict) + +MluHijackObject.apply_hijack( + ModelInputForGPUWithSamplingMetadata, + ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict, + vllm__worker__model_runner__ModelInputForGPUWithSamplingMetadata__from_broadcasted_tensor_dict +) diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/worker/model_runner_base.py b/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/worker/model_runner_base.py new file mode 100644 index 0000000..b43d804 --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/worker/model_runner_base.py @@ -0,0 +1,74 @@ +from typing import (Any, Dict) + +from vllm.model_executor.sampling_metadata import SequenceGroupToSample +from vllm.worker import model_runner_base +from vllm_mlu.mlu_hijack_utils import MluHijackObject + +def vllm__worker__model_runner_base___init_sampling_metadata_from_tensor_dict( 
# type: ignore + tensor_dict: Dict[str, Any]) -> Dict[str, Any]: + """ + Helper method to initialize SamplingMetadata based on broadcastable + SamplingMetadata fields. + """ + from vllm.model_executor import SamplingMetadata + + selected_token_indices = tensor_dict.pop("selected_token_indices", None) + if selected_token_indices is not None: + if 'seq_group_metadata' in tensor_dict.keys() and len(tensor_dict['seq_group_metadata']) > 0: + ''' + ========================== + Modify by Context Parallel + ========================== + @brief: construct sampling metadata. + ''' + sequence_group_to_sample_list = [] + for seq_group_metadata in tensor_dict['seq_group_metadata']: + seq_ids = list(seq_group_metadata.seq_data.keys()) + sampling_params = seq_group_metadata.sampling_params + seq_data = seq_group_metadata.seq_data + is_prompt = seq_group_metadata.is_prompt + if is_prompt: + seq_len = query_len = list(seq_data.values())[0].get_prompt_len() + else: + seq_len = None + query_len = 1 + prompt_logprob_indices = [] + sample_indices = seq_ids + sequence_group_to_sample = SequenceGroupToSample(seq_ids, + sampling_params, + seq_data, + seq_len, + query_len, + None, # Generator + is_prompt, + prompt_logprob_indices, + sample_indices) + sequence_group_to_sample_list.append(sequence_group_to_sample) + tensor_dict["sampling_metadata"] = SamplingMetadata( + seq_groups=sequence_group_to_sample_list, + selected_token_indices=selected_token_indices, + categorized_sample_indices=None, + num_prompts=len(sequence_group_to_sample_list), + ) + del tensor_dict['seq_group_metadata'] + ''' + ======================= + End of Context Parallel + ======================= + ''' + else: + # An empty SamplingMetadata to signal that the worker should skip + # sampling. 
+ tensor_dict["sampling_metadata"] = SamplingMetadata( + seq_groups=None, + selected_token_indices=selected_token_indices, + categorized_sample_indices=None, + num_prompts=0, + ) + return tensor_dict + +MluHijackObject.apply_hijack( + model_runner_base, + model_runner_base._init_sampling_metadata_from_tensor_dict, + vllm__worker__model_runner_base___init_sampling_metadata_from_tensor_dict +) diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/worker/worker.py b/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/worker/worker.py new file mode 100644 index 0000000..4be2330 --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/worker/worker.py @@ -0,0 +1,23 @@ +from vllm.worker.worker import Worker +from vllm_mlu.mlu_hijack_utils import MluHijackObject + +@property +def vllm__worker__worker__Worker__do_metadata_broadcast(self) -> bool: + ''' + ============================= + Modify by Context Parallel + ============================= + @brief: do metadata broadcast if cp or tp > 1. 
+ ''' + return self.parallel_config.world_size > 1 + ''' + ========================== + End of Context Parallel + ========================== + ''' + + +MluHijackObject.apply_hijack( + Worker, + Worker.do_metadata_broadcast, + vllm__worker__worker__Worker__do_metadata_broadcast) diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/worker/worker_base.py b/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/worker/worker_base.py new file mode 100644 index 0000000..f2de41f --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/worker/worker_base.py @@ -0,0 +1,121 @@ +import dataclasses +from typing import Any, Dict, Optional, Tuple, Union + +import torch + +from vllm.config import ObservabilityConfig, VllmConfig +from vllm.distributed.parallel_state import get_world_group +from vllm.sequence import ExecuteModelRequest +from vllm.worker.model_runner_base import (BroadcastableModelInput, + ModelRunnerInputBase) +from vllm.worker.worker_base import (extract_previous_hidden_states, + LocalOrDistributedWorkerBase, + WorkerInput) +from vllm_mlu.mlu_hijack_utils import MluHijackObject + + +def broadcast_tensor_dict( + tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, + src: int = 0 +): + if not torch.distributed.is_initialized(): + return tensor_dict + return get_world_group().broadcast_tensor_dict(tensor_dict, src) + +def vllm__worker__worker_base__LocalOrDistributedWorkerBase___get_driver_input_and_broadcast( + self, execute_model_req: ExecuteModelRequest +) -> Tuple[BroadcastableModelInput, WorkerInput, Dict[str, torch.Tensor]]: + """ Get the driver input and broadcast it to other workers. 
""" + assert self.is_driver_worker + + worker_input: WorkerInput = self.prepare_worker_input( + execute_model_req=execute_model_req) + model_input: ModelRunnerInputBase = ( + self.model_runner.prepare_model_input( + execute_model_req.seq_group_metadata_list, + execute_model_req.virtual_engine, + execute_model_req.finished_requests_ids)) + + kwargs = extract_previous_hidden_states(execute_model_req) + + if self.do_metadata_broadcast: + broadcast_data = worker_input.as_broadcastable_tensor_dict() + broadcast_data.update(model_input.as_broadcastable_tensor_dict()) + broadcast_data.update(kwargs) + ''' + ========================== + Modify by Context Parallel + ========================== + @brief: add seq_group metadata to broadcast. + ''' + broadcast_data['seq_group_metadata'] = execute_model_req.seq_group_metadata_list + ''' + ======================= + End of Context Parallel + ======================= + ''' + broadcast_tensor_dict(broadcast_data, src=0) + + if execute_model_req.async_callback: + model_input = dataclasses.replace( # type: ignore + model_input, + async_callback=execute_model_req.async_callback) + + return model_input, worker_input, kwargs + +def vllm__worker__worker_base__LocalOrDistributedWorkerBase___get_worker_input_from_broadcast( + self +) -> Optional[Tuple[BroadcastableModelInput, WorkerInput, Dict[ + str, torch.Tensor]]]: + """ Get the worker input from the broadcasted tensor dict. 
""" + assert self.do_metadata_broadcast + assert not self.is_driver_worker + broadcast_data = broadcast_tensor_dict(src=0) + if not broadcast_data: + return None + + worker_input = WorkerInput.from_broadcasted_tensor_dict(broadcast_data) + model_input = ( + self.model_runner.make_model_input_from_broadcasted_tensor_dict( + broadcast_data)) + + kwargs = extract_previous_hidden_states(broadcast_data) + + return model_input, worker_input, kwargs + + +def vllm__worker__worker_base__LocalOrDistributedWorkerBase__prepare_input( + self, + execute_model_req: Optional[ExecuteModelRequest] = None +) -> Optional[Tuple[BroadcastableModelInput, WorkerInput, Dict[str, torch.Tensor]]]: + """ + Prepare the inputs to ModelRunner and workers. + """ + if self.is_driver_worker: + if execute_model_req is None: + if self.do_metadata_broadcast: + # This signals that there's no more requests to process for + # now. All workers are running infinite loop with + # broadcast_tensor_dict, and it stops the loop when the + # driver broadcasts an empty input. Send an empty input to + # notify all other workers to stop their execution loop. 
+ broadcast_tensor_dict({}, src=0) + return None + return self._get_driver_input_and_broadcast(execute_model_req) + else: + return self._get_worker_input_from_broadcast() + +MluHijackObject.apply_hijack( + LocalOrDistributedWorkerBase, + LocalOrDistributedWorkerBase._get_driver_input_and_broadcast, + vllm__worker__worker_base__LocalOrDistributedWorkerBase___get_driver_input_and_broadcast) + +MluHijackObject.apply_hijack( + LocalOrDistributedWorkerBase, + LocalOrDistributedWorkerBase._get_worker_input_from_broadcast, + vllm__worker__worker_base__LocalOrDistributedWorkerBase___get_worker_input_from_broadcast) + +MluHijackObject.apply_hijack( + LocalOrDistributedWorkerBase, + LocalOrDistributedWorkerBase.prepare_input, + vllm__worker__worker_base__LocalOrDistributedWorkerBase__prepare_input) diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/zigzag_utils.py b/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/zigzag_utils.py new file mode 100644 index 0000000..68f51a0 --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/context_parallel/mlu_hijack/zigzag_utils.py @@ -0,0 +1,149 @@ +from typing import Dict, Optional, Sequence, List +import torch +import torch.distributed as dist +from torch import nn +from torch.nn import functional as F +from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import ( + get_context_model_parallel_rank, get_context_model_parallel_world_size, get_context_model_parallel_group) +from vllm.distributed.utils import divide +from vllm.attention import AttentionMetadata +import copy + + +def diff1(result: torch.Tensor, baseline: torch.Tensor): + result = result.flatten().float().to('cpu') + baseline = baseline.flatten().float().to('cpu') + assert result.shape == baseline.shape + error = torch.abs(baseline - result) + denominator = torch.sum(torch.abs(baseline)).item() + eps = 0.0 if denominator > 0 else 1e-9 + diff1 = torch.sum(error) / (denominator + eps) + 
def get_pad_seq(seq_len: int, pad: int):
    """Round ``seq_len`` up to the nearest multiple of ``pad``."""
    full_chunks, remainder = divmod(seq_len, pad)
    if remainder > 0:
        full_chunks += 1
    return full_chunks * pad


# Gather the partial results of a batch on context parallel groups
# together and place them in the order before zigzag splitting
def context_parallel_tensor_all_gather_(input_, dim=-1):
    """All-gather ``input_`` over the context parallel group and undo zigzag order.

    Every rank's tensor is cut into two halves along ``dim``. The output is
    the first halves in rank order followed by the second halves in reverse
    rank order, concatenated along ``dim``.

    Args:
        input_: local tensor; its size along ``dim`` must be even.
        dim: axis to gather along; negative values count from the end.

    Returns:
        ``input_`` itself when the context parallel world size is 1,
        otherwise the contiguous gathered tensor.
    """
    world_size = get_context_model_parallel_world_size()
    if world_size == 1:
        # Bypass the function if we are using only 1 GPU.
        return input_

    ndim = input_.dim()
    assert -ndim <= dim < ndim, (
        f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
    # Convert negative dim to positive.
    axis = dim + ndim if dim < 0 else dim

    assert input_.size()[axis] % 2 == 0, ("input tensor split dim % 2 != 0")

    gathered = [
        torch.empty(input_.shape, dtype=input_.dtype, device=input_.device)
        for _ in range(world_size)
    ]
    torch.distributed.all_gather(
        gathered, input_, group=get_context_model_parallel_group())

    head_halves = []
    tail_halves = []
    for shard in gathered:
        halves = torch.split(shard, shard.shape[axis] // 2, dim=axis)
        head_halves.append(halves[0])
        tail_halves.append(halves[1])
    # Second halves were handed out in reverse rank order.
    tail_halves.reverse()

    return torch.cat(head_halves + tail_halves, dim=axis).contiguous()
# This function is used for debugging
def context_parallel_tensor_all_gather(input, attn_metadata, dim=-1):
    """Gather each sequence of a zigzag-split batch and drop its padding.

    For every sequence delimited by
    ``attn_metadata.prefill_metadata.seq_start_loc`` the local slice of
    ``input`` along ``dim`` is gathered with
    ``context_parallel_tensor_all_gather_`` and then truncated to
    ``attn_metadata.prefill_metadata.seq_lens[batch]`` entries along ``dim``.
    The per-sequence results are concatenated along ``dim``.

    Args:
        input: local tensor holding all sequences back to back along ``dim``.
        attn_metadata: object exposing ``prefill_metadata`` with
            ``seq_start_loc`` and ``seq_lens``.
        dim: axis that indexes tokens; negative values count from the end.

    Returns:
        The concatenated, unpadded tensor.
    """
    if dim < 0:
        dim += input.dim()
    # One full slice per axis in front of `dim`, so the per-sequence slice
    # appended below addresses axis `dim`. The previous loop evaluated
    # `slice_ + (slice(None))` without assigning it, and `(slice(None))` is a
    # bare slice rather than a tuple, so every dim > 0 raised TypeError.
    leading_axes = (slice(None), ) * dim

    prefill_metadata = attn_metadata.prefill_metadata
    seq_start_loc = prefill_metadata.seq_start_loc
    batch_num = seq_start_loc.shape[0] - 1

    select_list = []
    for batch in range(batch_num):
        start = seq_start_loc[batch].item()
        end = seq_start_loc[batch + 1].item()
        local_seq = input[leading_axes + (slice(start, end), )]
        gathered = context_parallel_tensor_all_gather_(local_seq, dim=dim)
        # Keep only the first seq_lens[batch] entries of the gathered result.
        keep = slice(None, prefill_metadata.seq_lens[batch])
        select_list.append(gathered[leading_axes + (keep, )])
    return torch.cat(select_list, dim=dim)
# Pad one dimension of a tensor so that it is divisible by context_parallel_size * 2,
# and then use zigzag method to split it into different context parallel groups
def zigzag_split_(tensor: torch.Tensor, dim = -1, pad_value=0):
    """Pad ``tensor`` along ``dim`` and return this rank's two zigzag chunks.

    The axis is padded at its end with ``pad_value`` up to a multiple of
    ``2 * context_parallel_world_size`` and cut into that many equal chunks.
    Rank ``r`` receives chunk ``r`` and chunk ``2 * world_size - 1 - r``,
    concatenated along ``dim``.
    """
    ndim = tensor.dim()
    axis = dim if dim >= 0 else ndim + dim

    num_chunks = get_context_model_parallel_world_size() * 2
    cur_len = tensor.shape[axis]
    pad_len = get_pad_seq(cur_len, num_chunks) - cur_len

    # F.pad takes (before, after) pairs starting from the LAST dimension.
    after_axis = (0, 0) * (ndim - axis - 1)
    before_axis = (0, 0) * axis
    padded = F.pad(tensor,
                   after_axis + (0, pad_len) + before_axis,
                   value = pad_value)

    chunk_len = divide(padded.size()[axis], num_chunks)
    chunks = torch.split(padded, chunk_len, dim = axis)

    cp_rank = get_context_model_parallel_rank()
    front = chunks[cp_rank]
    back = chunks[num_chunks - cp_rank - 1]
    return torch.cat((front, back), dim=axis).contiguous()


# Split each batch of input_ids, positions, attn_metadata.slot_mapping with zigzag method,
# and update prefill_metadata.seq_start_loc and prefill_metadata.max_seq_len
def zigzag_split(input_ids: torch.Tensor,
                 positions: torch.Tensor,
                 attn_metadata: AttentionMetadata,
                 pad_slot_id: int):
    """Zigzag-split every sequence of a prefill batch for this rank.

    ``input_ids``, ``positions`` and ``attn_metadata.slot_mapping`` are cut
    per sequence using ``attn_metadata.prefill_metadata.seq_start_loc`` and
    each piece goes through ``zigzag_split_``. Slot-mapping padding uses
    ``pad_slot_id``.

    Returns:
        ``(zigzag_input_ids, zigzag_positions, zigzag_attn_metadata)``.
        The metadata is a deep copy of ``attn_metadata`` whose
        ``prefill_metadata.seq_start_loc``, ``query_start_loc``,
        ``max_seq_len`` and ``slot_mapping`` describe the split batch.
    """
    # Copy first: all updates below are applied to the copy only.
    zigzag_attn_metadata = copy.deepcopy(attn_metadata)
    orig_start_loc = attn_metadata.prefill_metadata.seq_start_loc
    num_seqs = orig_start_loc.shape[0] - 1

    ids_chunks = []
    pos_chunks = []
    slot_chunks = []
    local_lens = []
    for seq_idx in range(num_seqs):
        span = slice(orig_start_loc[seq_idx], orig_start_loc[seq_idx + 1])
        local_ids = zigzag_split_(input_ids[span])
        local_pos = zigzag_split_(positions[span])
        ids_chunks.append(local_ids)
        pos_chunks.append(local_pos)
        local_lens.append(local_ids.shape[0])
        slot_chunks.append(
            zigzag_split_(attn_metadata.slot_mapping[span],
                          pad_value=pad_slot_id))

    zigzag_input_ids = torch.cat(ids_chunks, dim=0)
    zigzag_positions = torch.cat(pos_chunks, dim=0)
    zigzag_slot_mapping = torch.cat(slot_chunks, dim=0)

    max_seq_len = max(local_lens)
    lens_tensor = torch.tensor(local_lens,
                               dtype=torch.int,
                               device=input_ids.device)
    # new_start_loc[0] stays 0; the remaining entries are the running sum.
    new_start_loc = torch.zeros(lens_tensor.shape[0] + 1,
                                dtype=torch.int32,
                                device=input_ids.device)
    torch.cumsum(lens_tensor,
                 dim=0,
                 dtype=new_start_loc.dtype,
                 out=new_start_loc[1:])

    prefill_meta = zigzag_attn_metadata.prefill_metadata
    prefill_meta.seq_start_loc = new_start_loc
    prefill_meta.query_start_loc = new_start_loc
    prefill_meta.max_seq_len = max_seq_len
    zigzag_attn_metadata.slot_mapping = zigzag_slot_mapping

    return zigzag_input_ids, zigzag_positions, zigzag_attn_metadata
import os
# Must be set before vllm is imported.
os.environ['CONTEXT_PARALLEL_EN'] = "True"

from vllm import LLM, SamplingParams

if __name__ == '__main__':
    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Sampling configuration shared by all prompts.
    sampling_params = SamplingParams(temperature=0.8, max_tokens=16)
    # Engine configuration: eager mode, TP=2 and CP=2 on the ray backend.
    engine_kwargs = dict(
        model="/data/AE/llm/models/Llama-2-7b-hf/",
        enforce_eager=True,
        tensor_parallel_size=2,
        context_parallel_size=2,
        distributed_executor_backend='ray',
    )
    llm = LLM(**engine_kwargs)
    # generate() returns one result object per prompt; print the prompt and
    # the text of its first completion.
    for request_output in llm.generate(prompts, sampling_params):
        print(f"Prompt: {request_output.prompt!r}, Generated text: {request_output.outputs[0].text!r}")
+ for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/README.md b/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/README.md new file mode 100644 index 0000000..0cc5d02 --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/README.md @@ -0,0 +1,26 @@ +### 简介 + +该example是vLLM中进行Expert Parallel的实验,mlu_hijack是对仓库代码的劫持,避免修改主仓库代码 + +### 支持模型 + +- qwen2_moe +- mixtral +- custom model +- deepseek_v2 + +### 支持板卡 + +300系列设备只能用于功能测试,性能测试需要其他系列设备。 + +### 运行demo +```bash +python examples/cambricon_custom_func/expert_parallel/offline_inference.py +``` + +### 使用Expert Parallel特性 + +- 设置环境变量export EXPERT_PARALLEL_EN=1|True|true|TRUE, LLM主接口传入tensor_parallel_size的同时,传入moe_tp_size或moe_ep_size,或两者都传; +- 若只传moe_tp_size和moe_ep_size中的一个,另一个等于tensor_parallel_size除以传入值所得的商,所以必须保证tensor_parallel_size可以被传入值整除; +- 若moe_tp_size和moe_ep_size都传入,则必须保证moe_tp_size * moe_ep_size == tensor_parallel_size; +- 若moe_tp_size和moe_ep_size都不传,则它们默认值等于-1,即不开启专家并行; diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/benchmark_latency_mlu.sh b/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/benchmark_latency_mlu.sh new file mode 100644 index 0000000..a60b8e9 --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/benchmark_latency_mlu.sh @@ -0,0 +1,133 @@ +#!/bin/bash + +rm output -rf +mkdir output + +DATA_DIR=/data +MODELS_DEEPSEEK_V2=( + "${DATA_DIR}/vllm/models/LLM-Research/deepseek-v2" +) + +MODELS=(${MODELS_DEEPSEEK_V2[@]}) + +# 定义变量 +use_ray=0 +use_eager=0 +use_pp=0 +# context parameter +input_sizes=(1024) +output_sizes=(1) +# batch_sizes=(1 2 4 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40) +batch_sizes=(1 4 8 16 32) + +# decoder parameter +# input_sizes=(1) +# output_sizes=(128) +# 
#!/bin/bash
# Sweep benchmark for expert parallel: runs the vLLM latency benchmark (or the
# throughput benchmark when use_pp > 0) for every combination of model,
# tp / moe-ep / pp size, input length, output length and batch size, and
# writes one log file per run under ./output.

# Start from an empty output directory.
rm output -rf
mkdir output

DATA_DIR=/data
MODELS_DEEPSEEK_V2=(
    "${DATA_DIR}/vllm/models/LLM-Research/deepseek-v2"
)

MODELS=(${MODELS_DEEPSEEK_V2[@]})

# Define variables (feature switches: 0 = off, > 0 = on)
use_ray=0
use_eager=0
use_pp=0
# context parameter
input_sizes=(1024)
output_sizes=(1)
# batch_sizes=(1 2 4 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40)
batch_sizes=(1 4 8 16 32)

# decoder parameter
# input_sizes=(1)
# output_sizes=(128)
# batch_sizes=(1 2 4 8 16 32 64 128 256 512 1024 1280 1536 1600 1616 1632 1648 1652 1656 1660 1661 1662 1663 1664 1728 1792 2048)
# batch_sizes=(1 4 8 16 32 64 128 256 512 1024 2048)

tp_sizes=(8)
moe_ep_sizes=(8 -1)
pp_sizes=(1)

# Pipeline-parallel mode swaps the parallel layout and uses the throughput
# benchmark; otherwise the latency benchmark is used.
if [ $use_pp -gt 0 ]; then
    tp_sizes=(1)
    moe_ep_sizes=(-1)
    pp_sizes=(8)
    BENCHMARK_CMD=benchmarks/benchmark_throughput.py
    benchmark_option="--backend vllm --num-prompts 1000 --output-json output_throughput.csv --async-engine"
else
    BENCHMARK_CMD=benchmarks/benchmark_latency.py
    benchmark_option="--num-iters-warmup 1 --num-iters 3 --only_average"
fi

# Runs whose max_model_len exceeds this value are skipped.
max_position_embeddings=163840

#export MLU_VISIBLE_DEVICES=4,5,6,7
export EXPERT_PARALLEL_EN=true
export VLLM_LATENCY_DEBUG=true
export VLLM_GRAPH_DEBUG=false
# export VLLM_DUMP_MLU_INFO=true
export OUTPUT_CSV_PATH=/data/solution-sdk/kangpengtao/tmp/deepseek/output.csv

ray_option=""
if [ $use_ray -gt 0 ]; then
    ray_option="--distributed-executor-backend ray --ray-workers-use-nsight"
fi
eager_option=""
if [ $use_eager -gt 0 ]; then
    eager_option="--enforce-eager"
fi

# Iterate over all combinations
for HF_MODEL in "${MODELS[@]}"; do
    quantization_option=""
    # Model paths containing this marker are run with smoothquant.
    if [[ "${HF_MODEL}" == *"sq_per_token_per_channel"* ]]; then
        quantization_option="--quantization=smoothquant"
    fi
    for tp_size in "${tp_sizes[@]}"; do
        for moe_ep_size in "${moe_ep_sizes[@]}"; do
            for pp_size in "${pp_sizes[@]}"; do
                for input_size in "${input_sizes[@]}"; do
                    for output_size in "${output_sizes[@]}"; do
                        for batch_size in "${batch_sizes[@]}"; do
                            max_seq_len_to_capture=$(expr $input_size \+ $output_size)
                            max_num_batched_tokens=$(expr $batch_size \* $input_size)
                            max_model_len=$max_seq_len_to_capture
                            if [ $max_model_len -gt $max_position_embeddings ]; then
                                continue
                            fi
                            # max_num_seqs=256
                            # if [ $max_num_seqs -lt $batch_size ]; then
                            #     max_num_seqs=$batch_size
                            # fi
                            max_num_seqs=$batch_size
                            # Raise max_num_batched_tokens so it is at least
                            # max_model_len and at least max_num_seqs.
                            if [ $max_model_len -gt $max_num_batched_tokens ]; then
                                max_num_batched_tokens=$max_model_len
                            fi
                            if [ $max_num_seqs -gt $max_num_batched_tokens ]; then
                                max_num_batched_tokens=$max_num_seqs
                            fi

                            pp_option="--pipeline-parallel-size ${pp_size}"
                            tp_option="-tp ${tp_size}"
                            ep_option="--moe-ep-size ${moe_ep_size}"
                            # --batch-size is only passed in latency mode.
                            batch_size_option=""
                            if [ $use_pp -le 0 ]; then
                                batch_size_option="--batch-size ${batch_size}"
                            fi

                            hf_model_name=$(basename "${HF_MODEL}")
                            LOG_FILE=output/${hf_model_name}_${input_size}_${output_size}_tp_${tp_size}_moe_ep_${moe_ep_size}_pp_${pp_size}_bs_${batch_size}.log
                            echo "Executing ${hf_model_name} with tp_size=${tp_size}, moe_ep_size=${moe_ep_size}, pp_size=${pp_size}, input_size=${input_size}, output_size=${output_size}, batch_size=${batch_size}, max_model_len=${max_model_len}, max_num_batched_tokens=${max_num_batched_tokens}"
                            # The *_option variables are left unquoted on
                            # purpose: each one expands to several arguments.
                            python3 ${BENCHMARK_CMD} \
                                ${benchmark_option} \
                                --trust-remote-code \
                                --max-num-batched-tokens ${max_num_batched_tokens} \
                                --max-model-len ${max_model_len} \
                                --block-size 16 \
                                --model ${HF_MODEL} \
                                --tokenizer ${HF_MODEL} \
                                --dtype bfloat16 \
                                --input-len ${input_size} \
                                --output-len ${output_size} \
                                ${pp_option} ${tp_option} ${ep_option} \
                                --max-seq-len-to-capture ${max_seq_len_to_capture} \
                                --max-num-seqs ${max_num_seqs} \
                                ${batch_size_option} \
                                ${eager_option} ${ray_option} ${quantization_option} \
                                2>&1 | tee ${LOG_FILE}
                            # Check whether the log contains torch.OutOfMemoryError,
                            # "Ceil of batch" or "is larger than mlu blocks".
                            # `break` leaves only the batch_size loop, so the
                            # remaining batch sizes of this combination are skipped.
                            if grep -E -q "torch\.OutOfMemoryError|Ceil of batch|is larger than mlu blocks" "$LOG_FILE"; then
                                echo "Found one or more specified errors in the log file."
                                break
                            else
                                echo "No specified errors found."
                            fi
                        done
                    done
                done
            done
        done
    done
done
#!/bin/bash
# Profiling variant of the expert parallel sweep: every benchmark run is
# wrapped in `cnperf-cli record`, and the trace directories produced by a run
# are moved into a per-configuration directory afterwards.

# Start from an empty output directory.
rm output -rf
mkdir output

DATA_DIR=/data
MODELS_DEEPSEEK_V2=(
    "${DATA_DIR}/vllm/models/LLM-Research/deepseek-v2"
)

MODELS=(${MODELS_DEEPSEEK_V2[@]})

# Define variables (feature switches: 0 = off, > 0 = on)
use_ray=0
use_eager=0
use_pp=0
use_kernel_analysis=0
# context parameter
input_sizes=(1024)
output_sizes=(1)
# batch_sizes=(1 2 4 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40)
batch_sizes=(1 4 8 16 32)

# decoder parameter
# input_sizes=(1)
# output_sizes=(128)
# batch_sizes=(1 2 4 8 16 32 64 128 256 512 1024 1280 1536 1600 1616 1632 1648 1652 1656 1660 1661 1662 1663 1664 1728 1792 2048)
# batch_sizes=(1 4 8 16 32 64 128 256 512 1024 2048)

tp_sizes=(8)
moe_ep_sizes=(8 -1)
pp_sizes=(1)

# NOTE(review): benchmark_option is assigned here but never passed to the
# command below (the non-profiling script does pass it) -- confirm whether
# that omission is intended.
if [ $use_pp -gt 0 ]; then
    tp_sizes=(1)
    moe_ep_sizes=(-1)
    pp_sizes=(8)
    BENCHMARK_CMD=benchmarks/benchmark_throughput.py
    benchmark_option="--backend vllm --num-prompts 1000 --output-json output_throughput.csv --async-engine"
else
    BENCHMARK_CMD=benchmarks/benchmark_latency.py
    benchmark_option="--num-iters-warmup 1 --num-iters 3 --only_average"
fi

# Runs whose max_model_len exceeds this value are skipped.
max_position_embeddings=163840

#export MLU_VISIBLE_DEVICES=4,5,6,7
export EXPERT_PARALLEL_EN=true
export VLLM_LATENCY_DEBUG=true
export VLLM_GRAPH_DEBUG=false
# export VLLM_DUMP_MLU_INFO=true
export OUTPUT_CSV_PATH=/data/solution-sdk/kangpengtao/tmp/deepseek/output.csv

ray_option=""
if [ $use_ray -gt 0 ]; then
    ray_option="--distributed-executor-backend ray --ray-workers-use-nsight"
fi

# Kernel analysis adds PMU recording options and forces eager mode.
record_option=""
if [ $use_kernel_analysis -gt 0 ]; then
    # ref: https://wiki.cambricon.com/pages/viewpage.action?pageId=434445235
    export CNPERF_KERNEL_ANALYSIS=1
    record_option="--pmu --capture_range=cnpx --cnpx_include kangpengtao --cnpx_exclude kangpengtao_exec --events tp_core__write_bytes,tp_core__read_bytes,tp_memcore__write_bytes,tp_memcore__read_bytes,tp_core__lt_cycles,tp_core__csimd_pre_cycles,tp_core__csimd_post_cycles"
    use_eager=1
fi

eager_option=""
if [ $use_eager -gt 0 ]; then
    eager_option="--enforce-eager"
fi

# Iterate over all combinations
for HF_MODEL in "${MODELS[@]}"; do
    quantization_option=""
    # Model paths containing this marker are run with smoothquant.
    if [[ "${HF_MODEL}" == *"sq_per_token_per_channel"* ]]; then
        quantization_option="--quantization=smoothquant"
    fi
    for tp_size in "${tp_sizes[@]}"; do
        for moe_ep_size in "${moe_ep_sizes[@]}"; do
            for pp_size in "${pp_sizes[@]}"; do
                for input_size in "${input_sizes[@]}"; do
                    for output_size in "${output_sizes[@]}"; do
                        for batch_size in "${batch_sizes[@]}"; do
                            max_seq_len_to_capture=$(expr $input_size \+ $output_size)
                            max_num_batched_tokens=$(expr $batch_size \* $input_size)
                            max_model_len=$max_seq_len_to_capture
                            if [ $max_model_len -gt $max_position_embeddings ]; then
                                continue
                            fi
                            # max_num_seqs=256
                            # if [ $max_num_seqs -lt $batch_size ]; then
                            #     max_num_seqs=$batch_size
                            # fi
                            max_num_seqs=$batch_size
                            # Raise max_num_batched_tokens so it is at least
                            # max_model_len and at least max_num_seqs.
                            if [ $max_model_len -gt $max_num_batched_tokens ]; then
                                max_num_batched_tokens=$max_model_len
                            fi
                            if [ $max_num_seqs -gt $max_num_batched_tokens ]; then
                                max_num_batched_tokens=$max_num_seqs
                            fi

                            pp_option="--pipeline-parallel-size ${pp_size}"
                            tp_option="-tp ${tp_size}"
                            ep_option="--moe-ep-size ${moe_ep_size}"
                            # --batch-size is only passed in latency mode.
                            batch_size_option=""
                            if [ $use_pp -le 0 ]; then
                                batch_size_option="--batch-size ${batch_size}"
                            fi

                            hf_model_name=$(basename "${HF_MODEL}")
                            LOG_FILE=output/${hf_model_name}_${input_size}_${output_size}_tp_${tp_size}_moe_ep_${moe_ep_size}_pp_${pp_size}_bs_${batch_size}.log
                            echo "Executing ${hf_model_name} with tp_size=${tp_size}, moe_ep_size=${moe_ep_size}, pp_size=${pp_size}, input_size=${input_size}, output_size=${output_size}, batch_size=${batch_size}, max_model_len=${max_model_len}, max_num_batched_tokens=${max_num_batched_tokens}"
                            dltrace_data_name="dltrace_data_${hf_model_name}_${tp_size}_${moe_ep_size}_${pp_size}_${input_size}_${output_size}_${batch_size}_${max_model_len}_${max_num_batched_tokens}"
                            # Remove traces left over from a previous run.
                            rm dltrace_data -rf
                            rm cnperf_data_* -rf
                            # The *_option variables are left unquoted on
                            # purpose: each one expands to several arguments.
                            CNPERF_VLOG_LEVEL=0-40 cnperf-cli record ${record_option} python3 ${BENCHMARK_CMD} \
                                --trust-remote-code \
                                --max-num-batched-tokens ${max_num_batched_tokens} \
                                --max-model-len ${max_model_len} \
                                --block-size 16 \
                                --model ${HF_MODEL} \
                                --tokenizer ${HF_MODEL} \
                                --dtype bfloat16 \
                                --input-len ${input_size} \
                                --output-len ${output_size} \
                                ${pp_option} ${tp_option} ${ep_option} \
                                --max-seq-len-to-capture ${max_seq_len_to_capture} \
                                --max-num-seqs ${max_num_seqs} \
                                ${batch_size_option} \
                                ${eager_option} ${ray_option} ${quantization_option} \
                                2>&1 | tee ${LOG_FILE}
                            # Check whether the log contains torch.OutOfMemoryError,
                            # "Ceil of batch" or "is larger than mlu blocks".
                            # `break` leaves only the batch_size loop; the trace
                            # directories of the failing run are not moved.
                            if grep -E -q "torch\.OutOfMemoryError|Ceil of batch|is larger than mlu blocks" "$LOG_FILE"; then
                                echo "Found one or more specified errors in the log file."
                                break
                            else
                                echo "No specified errors found."
                            fi
                            # Keep this run's traces under a per-configuration name.
                            mv dltrace_data ${dltrace_data_name}
                            mv cnperf_data_* ${dltrace_data_name}/
                        done
                    done
                done
            done
        done
    done
done
#!/bin/bash
# Serving benchmark client: sends random-dataset requests to a vLLM server
# listening on ${PORT} for every input length / output length / concurrency
# combination and stores one log per run under output/client.

# export EXPERT_PARALLEL_EN=True
# export VLLM_LATENCY_DEBUG=True

# Start from an empty log directory.
rm -rf output/client
mkdir -p output/client

PORT=32345
MODEL_PATH="/data/vllm/sq_per_token_per_channel/deepseek_v2_temp"
input_sizes=(1024)
output_sizes=(1)
# batch_sizes=(1 2 4 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40)
batch_sizes=(32)

# The model name is loop-invariant. It is derived from MODEL_PATH; the
# previous code read the undefined HF_MODEL, which produced log names with
# an empty model prefix.
hf_model_name=$(basename "${MODEL_PATH}")

for input_size in "${input_sizes[@]}"; do
    for output_size in "${output_sizes[@]}"; do
        for batch_size in "${batch_sizes[@]}"; do
            LOG_FILE="output/client/${hf_model_name}_${input_size}_${output_size}_bs_${batch_size}.log"
            python benchmarks/benchmark_serving.py \
                --backend vllm \
                --model "${MODEL_PATH}" \
                --trust-remote-code \
                --dataset-name random \
                --num-prompts 1000 \
                --port "${PORT}" \
                --request-rate inf \
                --random-input-len "${input_size}" \
                --random-output-len "${output_size}" \
                --max-concurrency "${batch_size}" \
                2>&1 | tee "${LOG_FILE}"
        done
    done
done
import model_executor \ No newline at end of file diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/mlu_hijack/model_executor/__init__.py b/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/mlu_hijack/model_executor/__init__.py new file mode 100644 index 0000000..199a5fb --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/mlu_hijack/model_executor/__init__.py @@ -0,0 +1,5 @@ +from .layers import sparse_moe_mlp +from .models import custom +from .models import mixtral +from .models import qwen2_moe +from .models import deepseek_v2 diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/mlu_hijack/model_executor/layers/__init__.py b/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/mlu_hijack/model_executor/layers/__init__.py new file mode 100755 index 0000000..8b13789 --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/mlu_hijack/model_executor/layers/__init__.py @@ -0,0 +1 @@ + diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/mlu_hijack/model_executor/layers/sparse_moe_mlp.py b/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/mlu_hijack/model_executor/layers/sparse_moe_mlp.py new file mode 100644 index 0000000..52b4158 --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/mlu_hijack/model_executor/layers/sparse_moe_mlp.py @@ -0,0 +1,142 @@ +""" +Inference-only MOE model. + +Tensor Parallel evenly splits each expert's weight and distributes them to different ranks, +which means each rank holds partial weight of all experts. +While Expert Parallel evenly distributes some of the experts' full weight to different ranks, +which means each rank holds part of the experts' full weight. 
def vllm__mlu_hijack__model_executor__layers__feed_forward__SparseMoeMlp____init__(
        self,
        num_experts: int,
        top_k: int,
        hidden_size: int,
        intermediate_size: int,
        up_proj_name: str,
        is_gated: bool,
        down_proj_name: str,
        has_bias: bool,
        skip_bias_add: bool = False,
        renormalize:bool = False,
        hidden_act: str = "silu",
        params_dtype: Optional[torch.dtype] = None,
        quant_config: Optional[QuantizationConfig] = None,
        is_use_fused_moe: bool = False,
        expert_group: int = 1,
        topk_group: int = 1,
    ):
    """Replacement ``SparseMoeMlp.__init__`` with MoE tensor/expert parallel state.

    Besides storing the constructor arguments, this records the MoE tensor
    parallel and MoE expert parallel rank / world size / group, works out
    which contiguous range of experts ``[start_expert_id, end_expert_id)``
    this expert-parallel rank owns, and builds one ``FeedForward`` per owned
    expert on the MoE tensor parallel group.

    Notes:
        * ``skip_bias_add`` is accepted for signature compatibility but is
          not used: ``self.skip_bias_add`` is derived from the MoE tensor
          parallel rank.
        * ``is_use_fused_moe`` is forced to ``False`` when
          ``get_device_major_capability()`` returns 3.

    Raises:
        AssertionError: if there are fewer experts than expert-parallel
            ranks, or ``intermediate_size`` is not divisible by the MoE
            tensor parallel size.
    """
    super(SparseMoeMlp, self).__init__()
    self.tp_rank = get_tensor_model_parallel_rank()
    self.tp_size = get_tensor_model_parallel_world_size()
    self.tp_group = get_tensor_model_parallel_group()
    self.num_total_experts = num_experts
    self.top_k = top_k
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.up_proj_name = up_proj_name
    self.is_gated = is_gated
    self.down_proj_name = down_proj_name
    self.has_bias = has_bias
    self.renormalize = renormalize
    self.hidden_act = hidden_act
    self.quant_config = quant_config
    self.is_use_fused_moe = is_use_fused_moe
    self.expert_group = expert_group
    self.topk_group = topk_group
    if get_device_major_capability() == 3:
        self.is_use_fused_moe = False

    if params_dtype is None:
        params_dtype = torch.get_default_dtype()
    self.params_dtype = params_dtype

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: add moe relative distribution
    '''
    self.moe_tp_size = get_moe_tensor_parallel_world_size()
    self.moe_tp_rank = get_moe_tensor_parallel_rank()
    self.moe_tp_group = get_moe_tensor_parallel_group()
    self.moe_ep_size = get_moe_expert_parallel_world_size()
    self.moe_ep_rank = get_moe_expert_parallel_rank()
    self.moe_ep_group = get_moe_expert_parallel_group()

    # NOTE: The bias for fc2 is only applied on tp_rank 0. If we added it on all nodes the allreduce() would
    # contain multiple copies of the bias. The bias on other node will be ignored, and may be set to nullptr
    self.skip_bias_add = self.moe_tp_rank > 0

    assert self.num_total_experts >= self.moe_ep_size, (
        f"need num_total_experts:{self.num_total_experts} >= moe_ep_size:{self.moe_ep_size}")

    assert self.intermediate_size % self.moe_tp_size == 0, (
        f"need intermediate_size:{self.intermediate_size} % moe_tp_size:{self.moe_tp_size} == 0")

    # Experts are handed out in contiguous chunks of ceil(total / ep_size).
    # A rank owns whatever part of its chunk lies below num_total_experts.
    # The previous code gave the last rank `total % ep_size` experts, which
    # only matches this layout in special cases (7 experts over 2 ranks gave
    # rank 1 one expert instead of three), and let other ranks run past the
    # end of the expert list. Divisible configurations are unaffected.
    experts_per_chunk = (self.num_total_experts + self.moe_ep_size - 1) // self.moe_ep_size
    self.start_expert_id = min(self.moe_ep_rank * experts_per_chunk,
                               self.num_total_experts)
    self.num_experts_per_rank = min(experts_per_chunk,
                                    self.num_total_experts - self.start_expert_id)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    self.end_expert_id = self.start_expert_id + self.num_experts_per_rank

    # Gate always runs at half / full precision for now.
    self.gate = ReplicatedLinear(self.hidden_size,
                                 self.num_total_experts,
                                 bias=False,
                                 params_dtype=self.params_dtype,
                                 quant_config=None)
    # One FeedForward per locally owned expert; reduction is left to the caller.
    self.experts = nn.ModuleList([
        FeedForward(hidden_size=self.hidden_size,
                    intermediate_size=self.intermediate_size,
                    hidden_act=self.hidden_act,
                    up_proj_name=self.up_proj_name,
                    is_gated=self.is_gated,
                    down_proj_name=self.down_proj_name,
                    bias=self.has_bias,
                    quant_config=self.quant_config,
                    skip_bias_add=self.skip_bias_add,
                    reduce_results=False,
                    tp_group=self.moe_tp_group) for _ in range(self.num_experts_per_rank)
    ])

    self.init_pack_param()


MluHijackObject.apply_hijack(SparseMoeMlp,
                             SparseMoeMlp.__init__,
                             vllm__mlu_hijack__model_executor__layers__feed_forward__SparseMoeMlp____init__)
class CustomMoeBlock(SparseMoeMlp):
    """MoE block for the custom model, built on ``SparseMoeMlp``.

    Routed experts come from the base class. When
    ``config.shared_expert_intermediate_size > 0`` a shared-expert
    ``FeedForward`` and a single-output gate are created as well.
    """

    def __init__(
        self,
        config: CustomConfig,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        """Create the routed experts and, if configured, the shared expert."""
        super().__init__(num_experts=config.num_experts,
                         top_k=config.num_experts_per_tok,
                         hidden_size=config.hidden_size,
                         intermediate_size=config.moe_intermediate_size,
                         up_proj_name="gate_up_proj",
                         is_gated=config.is_gated,
                         down_proj_name="down_proj",
                         has_bias=config.mlp_bias,
                         skip_bias_add=False,
                         renormalize=config.norm_topk_prob,
                         hidden_act=config.hidden_act,
                         params_dtype=None,
                         quant_config=quant_config,
                         is_use_fused_moe=True)

        self.config = config
        self.rank = self.tp_rank
        # Both stay None unless a shared expert is configured below.
        self.shared_expert = None
        self.shared_expert_gate = None
        if config.shared_expert_intermediate_size > 0:
            self.shared_expert = FeedForward(hidden_size=config.hidden_size,
                                             intermediate_size=config.shared_expert_intermediate_size,
                                             hidden_act=config.hidden_act,
                                             up_proj_name='gate_up_proj',
                                             is_gated=config.is_gated,
                                             down_proj_name='down_proj',
                                             bias=config.mlp_bias,
                                             quant_config=quant_config,
                                             reduce_results=False)
            self.shared_expert_gate = ReplicatedLinear(config.hidden_size,
                                                       1,
                                                       bias=False,
                                                       params_dtype=self.params_dtype,
                                                       quant_config=None)


    def forward(self, hidden_states: torch.Tensor, residual: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run shared and routed experts and return a ``(num_tokens, hidden_dim)`` tensor.

        ``hidden_states`` must be 2-D (its shape is unpacked into
        ``num_tokens, hidden_dim``). The result is all-reduced over the
        tensor parallel group only when ``config.use_parallel_residual`` is
        false and ``tp_size > 1``.
        """
        num_tokens, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)
        shared_output = None
        if self.shared_expert is not None:
            shared_output = self.shared_expert(hidden_states)
            if self.shared_expert_gate is not None:
                gate_output = self.shared_expert_gate(hidden_states)
                # Element 0 of the gate's return value is used as the gate activation.
                shared_output = F.sigmoid(gate_output[0]) * shared_output

        # router_logits: (num_tokens, n_experts)
        router_logits, _ = self.gate(hidden_states)
        # NOTE(review): residual_ is computed but never used; the call below
        # passes the unmasked `residual`. Confirm that forward_experts masks
        # the residual per rank itself, otherwise it may be added on every rank.
        residual_ = None if self.rank > 0 else residual
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: modify bt_ops.fused_moe to forward_experts
        '''
        final_hidden_states = self.forward_experts(hidden_states, router_logits, residual)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        if shared_output is not None:
            final_hidden_states = final_hidden_states + shared_output
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: add comment to explain use_parallel_residual usage
        '''
        # use_parallel_residual = True: x = x + attn(ln1(x)) + mlp(ln2(x))
        # use_parallel_residual = False:
        #   if apply_residual_connection_post_layernorm:
        #       x_attn = ln1(x) + attn(ln1(x))
        #       x_mlp = ln2(x_attn) + mlp(ln2(x_attn))
        #   else:
        #       x_attn = x + attn(ln1(x))
        #       x_mlp = x_attn + mlp(ln2(x_attn))
        # When use_parallel_residual = True, x is shared between attn and mlp, so we only need to
        # reduce after x + attn(ln1(x)) + mlp(ln2(x)) and don't need reduce here
        # But when use_parallel_residual = False, mlp layer uses attn layer's output, so need reduce
        # when mlp is finished.
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        reduce_results = (self.config.use_parallel_residual == False)
        if reduce_results and self.tp_size > 1:
            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)

        return final_hidden_states.view(num_tokens, hidden_dim)
def vllm__mlu_hijack__model_executor__custom_model__custom__CustomDecoderLayer____init__(
        self,
        config: CustomConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
    """Replacement ``CustomDecoderLayer.__init__`` that uses this file's ``CustomMoeBlock``.

    Builds the attention module, then either a ``CustomMoeBlock`` (when
    ``config.num_experts`` is not None) or a dense ``FeedForward``, the two
    layer norms selected by ``config.norm_type``, and the smoothquant
    fusion flags.
    """
    super(CustomDecoderLayer, self).__init__()
    self.config = config
    self.self_attn = CustomAttention(
        config=config,
        cache_config=cache_config,
        quant_config=quant_config,
    )

    # Either config attribute enables the MLP bias; both default to False.
    mlp_bias = getattr(config, "mlp_bias", False) or getattr(config, "bias", False)
    is_gated = getattr(config, "is_gated", False)

    if config.num_experts is not None:
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: nothing changed, only use the CustomMoeBlock class in this file
        '''
        self.mlp = CustomMoeBlock(config=config,
                                  quant_config=quant_config)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
    else:
        self.mlp = FeedForward(hidden_size=config.hidden_size,
                               intermediate_size=config.intermediate_size,
                               hidden_act=self.config.hidden_act,
                               up_proj_name='up_proj',
                               is_gated=is_gated,
                               down_proj_name='down_proj',
                               bias=mlp_bias,
                               quant_config=quant_config,
                               skip_bias_add=(self.config.use_parallel_residual and mlp_bias),
                               reduce_results = (self.config.use_parallel_residual == False))

    self.input_layernorm = _NORM_DICT[self.config.norm_type](config.hidden_size, eps=config.norm_eps)
    self.post_attention_layernorm = _NORM_DICT[self.config.norm_type](config.hidden_size, eps=config.norm_eps)

    # perf per-tensor sq cases by fusing quantization in layernorm
    # (the attribute name keeps its existing spelling "tesnor").
    self.is_per_tesnor_sq_perf_cases = (is_per_tensor_smoothquant(quant_config) and
                                        not self.config.apply_residual_connection_post_layernorm)
    self.is_per_token_sq_perf_cases = (is_per_token_smoothquant(quant_config) and
                                       not self.config.apply_residual_connection_post_layernorm)
    # NOTE(review): the nesting of the statements below was reconstructed
    # from whitespace-collapsed text; confirm against the upstream layout.
    if self.is_per_tesnor_sq_perf_cases or self.is_per_token_sq_perf_cases:
        self.self_attn.qkv_proj.quant_method.skip_quant_input = True
        self.quant_fusion_attn_layernorm = None
        self.is_moe = config.num_experts is not None
        self.use_rmsnorm = self.config.norm_type == "rmsnorm"
        if not self.is_moe:
            self.mlp.up_proj.quant_method.skip_quant_input = True
            self.quant_fusion_mlp_layernorm = None


MluHijackObject.apply_hijack(CustomDecoderLayer,
                             CustomDecoderLayer.__init__,
                             vllm__mlu_hijack__model_executor__custom_model__custom__CustomDecoderLayer____init__)
+from vllm_mlu.model_executor.models.deepseek_v2 import DeepseekV2MoE +from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import ( + get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size) + + +def vllm_mlu__model_executor__models__deepseek_v2__DeepseekV2MoE____init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super(DeepseekV2MoE, self).__init__(num_experts=config.n_routed_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + up_proj_name="gate_up_proj", + is_gated=True, + down_proj_name="down_proj", + has_bias=False, + skip_bias_add=False, + renormalize=config.norm_topk_prob, + hidden_act=config.hidden_act, + params_dtype=None, + quant_config=quant_config, + is_use_fused_moe=True, + expert_group=config.n_group, + topk_group=config.topk_group) + self.config = config + self.routed_scaling_factor = config.routed_scaling_factor + self.n_shared_experts = config.n_shared_experts + self.routed_scaling_factor = config.routed_scaling_factor + if self.moe_tp_size > config.n_routed_experts: + raise ValueError( + f"Moe Tensor parallel size {self.moe_tp_size} is greater than " + f"the number of experts {config.n_routed_experts}.") + + if config.hidden_act != "silu": + raise ValueError(f"Unsupported activation: {config.hidden_act}. " + "Only silu is supported for now.") + + self.gate = ReplicatedLinear(config.hidden_size, + config.n_routed_experts, + bias=False, + quant_config=None, + prefix=f"{prefix}.gate") + if config.n_shared_experts is not None: + intermediate_size = (config.moe_intermediate_size * + config.n_shared_experts) + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: replace MLP with FeedForward. 
+ ''' + self.shared_experts = FeedForward(hidden_size=config.hidden_size, + intermediate_size=intermediate_size, + hidden_act=config.hidden_act, + up_proj_name='gate_up_proj', + is_gated=True, + down_proj_name='down_proj', + bias=False, + quant_config=quant_config, + reduce_results=False) + ''' + ================== + End of MLU Hijack + ================== + ''' + + +def vllm__module_executor__models__deepseek_v2__DeepseekV2ForCausalLM__load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: pack params and cal start expert id + ''' + for name, m in self.model.named_modules(): + if isinstance(m, SparseMoeMlp): + m.pack_params() + + # expert parallel modification start + moe_ep_rank = get_moe_expert_parallel_rank() + moe_ep_size = get_moe_expert_parallel_world_size() + num_total_experts = self.config.n_routed_experts + start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size) + # expert parallel modification end + ''' + ================== + End of MLU Hijack + ================== + ''' + + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: delete expert_params_mapping for no useless + ''' + ''' + ================== + End of MLU Hijack + ================== + ''' + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: replace expert_id in weight to named_expert_id in params_dict + ''' + if start_expert_id > 0 and "mlp.experts." 
in name: + expert_str = re.search(r'experts\.\d+', name).group(0) + expert_id=int(expert_str.split(".")[1]) + named_expert_id = expert_id - start_expert_id + old_expert_name = f"experts.{expert_id}" + new_expert_name = f"experts.{named_expert_id}" + name = name.replace(old_expert_name, new_expert_name) + ''' + ================== + End of MLU Hijack + ================== + ''' + for (param_name, weight_name, shard_id) in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: add expert skiped condition and delete useless if name not in params_dict: continue condition + ''' + name = name.replace(weight_name, param_name) + if (("mlp.experts." in name or "mlp.shared_experts." in name or "mlp.shared_expert_gate." in name) + and name not in params_dict): + continue + ''' + ================== + End of MLU Hijack + ================== + ''' + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: add expert skiped condition + ''' + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if (("mlp.experts." in name or "mlp.shared_experts." 
in name or "mlp.shared_expert_gate." in name) + and name not in params_dict): + continue + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + ''' + ================== + End of MLU Hijack + ================== + ''' + + +MluHijackObject.apply_hijack(DeepseekV2MoE, + DeepseekV2MoE.__init__, + vllm_mlu__model_executor__models__deepseek_v2__DeepseekV2MoE____init__) +MluHijackObject.apply_hijack(DeepseekV2ForCausalLM, + DeepseekV2ForCausalLM.load_weights, + vllm__module_executor__models__deepseek_v2__DeepseekV2ForCausalLM__load_weights) diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/mlu_hijack/model_executor/models/mixtral.py b/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/mlu_hijack/model_executor/models/mixtral.py new file mode 100644 index 0000000..4a984ad --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/mlu_hijack/model_executor/models/mixtral.py @@ -0,0 +1,143 @@ +import torch +import re +import vllm +from torch import nn +from typing import List, Optional, Tuple, Iterable +from vllm_mlu._mlu_utils import * +from vllm_mlu.mlu_hijack_utils import MluHijackObject +from vllm.model_executor.models.mixtral import MixtralForCausalLM +from vllm.model_executor.model_loader.weight_utils import default_weight_loader, maybe_remap_kv_scale_name +from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import ( + get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size) +from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp +from vllm.model_executor.models.utils import is_pp_missing_parameter + + +def vllm__module_executor__models__mixtral__MixtralForCausalLM__load_weights( + self, + weights: Iterable[Tuple[str, torch.Tensor]]): + ''' + ============================= + Modify by vllm_mlu + ============================= + 
@brief: pack params and cal start expert id + ''' + for name, m in self.model.named_modules(): + if isinstance(m, SparseMoeMlp): + m.pack_params() + # expert parallel modification start + moe_ep_rank = get_moe_expert_parallel_rank() + moe_ep_size = get_moe_expert_parallel_world_size() + num_total_experts = self.config.num_local_experts + start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size) + # expert parallel modification end + ''' + ================== + End of MLU Hijack + ================== + ''' + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("w13", "w1", 0), + ("w13", "w3", 1), + ] + + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: delete expert_params_mapping for no useless + ''' + ''' + ================== + End of MLU Hijack + ================== + ''' + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: replace expert_id in weight to named_expert_id in params_dict + ''' + if start_expert_id > 0 and "block_sparse_moe.experts." in name: + expert_str = re.search(r'experts\.\d+', name).group(0) + expert_id=int(expert_str.split(".")[1]) + named_expert_id = expert_id - start_expert_id + old_expert_name = f"experts.{expert_id}" + new_expert_name = f"experts.{named_expert_id}" + name = name.replace(old_expert_name, new_expert_name) + ''' + ================== + End of MLU Hijack + ================== + ''' + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. 
+ if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + if is_pp_missing_parameter(name, self): + continue + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: add expert skiped condition + ''' + # Skip experts that are not assigned to this worker. + if (("block_sparse_moe.experts." in name) and (name not in params_dict)): + continue + ''' + ================== + End of MLU Hijack + ================== + ''' + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + if is_pp_missing_parameter(name, self): + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: add expert skiped condition + ''' + # Skip experts that are not assigned to this worker. + if (("block_sparse_moe.experts." 
in name) and (name not in params_dict)): + continue + ''' + ================== + End of MLU Hijack + ================== + ''' + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + + +MluHijackObject.apply_hijack(MixtralForCausalLM, + MixtralForCausalLM.load_weights, + vllm__module_executor__models__mixtral__MixtralForCausalLM__load_weights) diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/mlu_hijack/model_executor/models/qwen2_moe.py b/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/mlu_hijack/model_executor/models/qwen2_moe.py new file mode 100644 index 0000000..943c9ff --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/mlu_hijack/model_executor/models/qwen2_moe.py @@ -0,0 +1,179 @@ +import torch +import re +from typing import Optional, Iterable, Tuple +from vllm_mlu._mlu_utils import * +from vllm_mlu.mlu_hijack_utils import MluHijackObject +from vllm.model_executor.models.qwen2_moe import Qwen2MoeForCausalLM +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import ( + get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size) +from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp +from vllm.utils import print_warning_once +from vllm.model_executor.models.utils import is_pp_missing_parameter + + +def vllm__module_executor__models__qwen2moe__Qwen2MoeForCausalLM__load_weights( + self, + weights: Iterable[Tuple[str, torch.Tensor]]): + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: pack params and cal start expert id + ''' + for name, m in self.model.named_modules(): + if isinstance(m, SparseMoeMlp): + m.pack_params() + + # expert parallel modification start + moe_ep_rank = get_moe_expert_parallel_rank() + moe_ep_size = 
get_moe_expert_parallel_world_size() + num_total_experts = self.config.num_experts + start_expert_id = moe_ep_rank * ((num_total_experts + moe_ep_size - 1) // moe_ep_size) + # expert parallel modification end + ''' + ================== + End of MLU Hijack + ================== + ''' + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: delete expert_params_mapping for no useless + ''' + ''' + ================== + End of MLU Hijack + ================== + ''' + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: replace expert_id in weight to named_expert_id in params_dict + ''' + if start_expert_id > 0 and "mlp.experts." in name: + expert_str = re.search(r'experts\.\d+', name).group(0) + expert_id=int(expert_str.split(".")[1]) + named_expert_id = expert_id - start_expert_id + old_expert_name = f"experts.{expert_id}" + new_expert_name = f"experts.{named_expert_id}" + name = name.replace(old_expert_name, new_expert_name) + ''' + ================== + End of MLU Hijack + ================== + ''' + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: delete if "mlp.experts" in name: continue condition + ''' + ''' + ================== + End of MLU Hijack + ================== + ''' + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. 
+ if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: add expert skiped condition and delete useless if name not in params_dict: continue condition + ''' + # Skip experts that are not assigned to this worker. + if (("mlp.experts." in name or "mlp.shared_expert." in name or "mlp.shared_expert_gate." in name) + and name not in params_dict): + continue + ''' + ================== + End of MLU Hijack + ================== + ''' + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: delete for mapping in expert_params_mapping condition + ''' + ''' + ================== + End of MLU Hijack + ================== + ''' + # Skip loading extra bias for GPTQ models. + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + # Remapping the name of FP8 kv-scale. + if name.endswith("kv_scale"): + remapped_kv_scale_name = name.replace( + ".kv_scale", ".attn.kv_scale") + if remapped_kv_scale_name not in params_dict: + print_warning_once( + "Found kv scale in the checkpoint " + f"(e.g. {name}), but not found the expected " + f"name in the model " + f"(e.g. {remapped_kv_scale_name}). " + "kv-scale is not loaded.") + continue + else: + name = remapped_kv_scale_name + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: add expert skiped condition + ''' + # Skip experts that are not assigned to this worker. + if (("mlp.experts." in name or "mlp.shared_expert." in name or "mlp.shared_expert_gate." 
in name) + and name not in params_dict): + continue + ''' + ================== + End of MLU Hijack + ================== + ''' + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + + +MluHijackObject.apply_hijack(Qwen2MoeForCausalLM, + Qwen2MoeForCausalLM.load_weights, + vllm__module_executor__models__qwen2moe__Qwen2MoeForCausalLM__load_weights) diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/offline_inference.py b/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/offline_inference.py new file mode 100644 index 0000000..5105d5a --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/offline_inference.py @@ -0,0 +1,61 @@ +import os +os.environ['EXPERT_PARALLEL_EN'] = "True" + +from vllm import LLM, SamplingParams + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + + +model_dir="/data/AE/llm/models/Qwen1.5-MoE-A2.7B" +tp_size = 2 +moe_ep_size=2 +is_check_act_range = True +input_seq_len=64 +output_seq_len=1 +batch=1 +# max_position_embedding=1024 +max_model_len=input_seq_len + output_seq_len +# if max_model_len < max_position_embedding: +# max_model_len = max_position_embedding +max_num_batched_tokens=input_seq_len * batch +if max_model_len > max_num_batched_tokens: + max_num_batched_tokens=max_model_len +max_num_seqs = batch + +if __name__ == '__main__': + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.8) + + # Create an LLM. 
+ llm = LLM(model=model_dir, + trust_remote_code=True, + enforce_eager=True, + dtype='bfloat16', + max_model_len=max_model_len, + max_num_batched_tokens=max_num_batched_tokens, + max_num_seqs=max_num_seqs, + tensor_parallel_size=tp_size, + moe_ep_size=moe_ep_size, + ) + + if is_check_act_range: + llm.llm_engine.model_executor._run_workers("setup_smooth_hook", is_save_moe_info=True) + + llm.llm_engine.model_executor._run_workers("remove_hooks") + act_range = llm.llm_engine.model_executor._run_workers("get_act_range") + print(f"len(act_range)={len(act_range)}") + + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/server.sh b/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/server.sh new file mode 100644 index 0000000..d4dbebf --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/expert_parallel/server.sh @@ -0,0 +1,48 @@ +#/bin/bash + +rm output/server -rf +mkdir -p output/server + +PORT=32345 +use_ray=0 +use_pp=1 +use_eager=0 + +eager_option="" +if [ $use_eager -gt 0 ]; then + eager_option="--enforce-eager" +fi + +ray_option="" +if [ $use_ray -gt 0 ]; then + ray_option="--worker-use-ray" + ray stop --force +fi + +export VLLM_ENGINE_ITERATION_TIMEOUT_S=180 +MODEL_PATH="/data/vllm/sq_per_token_per_channel/deepseek_v2_temp" + +if [ $use_pp -gt 0 ]; then + parallel_option="--pipeline-parallel-size=8" +else + parallel_option="--tensor-parallel-size=8" +fi + +# TP8 +python -m vllm.entrypoints.openai.api_server \ + --disable-log-requests \ + --port ${PORT} \ + --model ${MODEL_PATH} \ + --trust-remote-code \ + --swap-space 16 \ + ${parallel_option} \ + 
--max-num-batched-tokens=40960 \ + --max-model-len=1034 \ + --block-size=16 \ + --dtype=bfloat16 \ + --max-seq-len-to-capture=1034 \ + --max-num-seqs=40 \ + --quantization=smoothquant \ + ${eager_option} \ + ${ray_option} \ + 2>&1 | tee output/server/server.log diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/tests/context_parallel/test_context_parallel.py b/vllm-v0.6.2/examples/cambricon_custom_func/tests/context_parallel/test_context_parallel.py new file mode 100644 index 0000000..ac1180c --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/tests/context_parallel/test_context_parallel.py @@ -0,0 +1,52 @@ +import torch +import sys +import ray +import gc +import contextlib +import os +os.environ['CONTEXT_PARALLEL_EN'] = "True" + +from vllm import LLM, SamplingParams +from vllm.platforms import current_platform + +def cleanup(): + """Release occupied resources and reset parallel_state""" + from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import destroy_model_parallel + destroy_model_parallel() + from vllm.distributed import destroy_distributed_environment + destroy_distributed_environment() + with contextlib.suppress(AssertionError): + torch.distributed.destroy_process_group() + gc.collect() + if not current_platform.is_cpu(): + torch.cuda.empty_cache() + + if ray.is_initialized(): + ray.shutdown() + +def run_vllm(prompts, sampling_params, tp, cp): + """Run LLM""" + llm = LLM(model="/data/AE/llm/models/Llama-2-7b-hf/", + enforce_eager=True, + tensor_parallel_size = tp, + context_parallel_size = cp, + distributed_executor_backend='ray') + outputs = llm.generate(prompts, sampling_params) + return outputs + +def test_context_parallel(): + """Compare the output results of cp1 and cp2""" + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, max_tokens=16) + outputs_1 = run_vllm(prompts, 
sampling_params, tp=1, cp=2) + cleanup() + outputs_2 = run_vllm(prompts, sampling_params, tp=1, cp=1) + cleanup() + generated_text_1 = [output.outputs[0].text for output in outputs_1] + generated_text_2 = [output.outputs[0].text for output in outputs_2] + assert generated_text_1 == generated_text_2 diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/tests/context_parallel/test_context_parallel_kv8.py b/vllm-v0.6.2/examples/cambricon_custom_func/tests/context_parallel/test_context_parallel_kv8.py new file mode 100644 index 0000000..7cb2bfb --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/tests/context_parallel/test_context_parallel_kv8.py @@ -0,0 +1,51 @@ +import torch +import sys +import ray +import gc +import contextlib +import os +os.environ['CONTEXT_PARALLEL_EN'] = "True" + +from vllm import LLM, SamplingParams +from vllm.platforms import current_platform + +def cleanup(): + """Release occupied resources and reset parallel_state""" + from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import destroy_model_parallel + destroy_model_parallel() + from vllm.distributed import destroy_distributed_environment + destroy_distributed_environment() + with contextlib.suppress(AssertionError): + torch.distributed.destroy_process_group() + gc.collect() + if not current_platform.is_cpu(): + torch.cuda.empty_cache() + + if ray.is_initialized(): + ray.shutdown() + +def run_vllm(prompts, sampling_params, tp, cp, use_kv8=False): + """Run LLM""" + kwargs = dict() + kwargs['model']="/data/AE/llm/models/Llama-2-7b-hf/" + kwargs['enforce_eager']=True, + kwargs['tensor_parallel_size'] = tp + kwargs['context_parallel_size'] = cp + kwargs['distributed_executor_backend']='ray' + kwargs['kv_cache_dtype'] = 'int8' + + llm = LLM(**kwargs) + outputs = llm.generate(prompts, sampling_params) + return outputs + +def test_context_parallel_with_kv8(): + """Compare the output results of cp1 and cp2 with kv cache int8.""" + prompts = [ + "Hello, my name 
is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, max_tokens=16) + outputs_1 = run_vllm(prompts, sampling_params, tp=1, cp=2) + cleanup() diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/tests/expert_parallel/test_expert_parallel.py b/vllm-v0.6.2/examples/cambricon_custom_func/tests/expert_parallel/test_expert_parallel.py new file mode 100644 index 0000000..99ad5bd --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/tests/expert_parallel/test_expert_parallel.py @@ -0,0 +1,76 @@ +import torch +import sys +import ray +import gc +import contextlib +import numpy as np +import os +os.environ['EXPERT_PARALLEL_EN'] = "True" + +from vllm import LLM, SamplingParams +from vllm.platforms import current_platform + +def string_list_to_float(text_list: list): + ''' + convert string list to float list + ''' + txt = np.array(text_list) + max_len = max(len(s) for s in txt) + string_to_float = lambda s: np.array([ord(char) for char in s.ljust(max_len)]) + txt_char = np.array([string_to_float(s) for s in txt]) + txt_float = txt_char.astype('float32') + return txt_float + +def compute_diff_text(baseline_text: list, compare_text: list): + ''' + compute the outputs diff1 and diff2 + ''' + baseline = string_list_to_float(baseline_text) + compare = string_list_to_float(compare_text) + error = np.abs(baseline - compare) + diff1 = np.sum(error) / np.sum(np.abs(baseline)) + diff2 = np.sqrt(np.sum(error**2)/np.sum(baseline**2)) + return diff1, diff2 + +def cleanup(): + '''Release occupied resources and reset parallel_state''' + from examples.cambricon_custom_func.vllm.mlu_hijack.distributed.parallel_state import destroy_model_parallel + destroy_model_parallel() + from vllm.distributed import destroy_distributed_environment + destroy_distributed_environment() + with contextlib.suppress(AssertionError): + torch.distributed.destroy_process_group() + gc.collect() + if not 
current_platform.is_cpu(): + torch.cuda.empty_cache() + + if ray.is_initialized(): + ray.shutdown() + +def run_vllm(prompts, sampling_params, tp, mtp=-1, mep=-1, model_dir="/data/AE/llm/models/Qwen1.5-MoE-A2.7B/"): + '''Run LLM''' + llm = LLM(model=model_dir, + enforce_eager=True, + tensor_parallel_size=tp, + moe_tp_size=mtp, + moe_ep_size=mep) + outputs = llm.generate(prompts, sampling_params) + return outputs + +def test_expert_parallel(): + """Compare the output results of tp4 and mtp=1, 2""" + qwen2_moe_model_dir = "/data/AE/llm/models/Qwen1.5-MoE-A2.7B" + eps = 1e-6 + prompts = [ + "Hello, my name is", + ] + sampling_params = SamplingParams(temperature=0.8, max_tokens=1) + outputs_1 = run_vllm(prompts, sampling_params, tp=2, mtp=1, model_dir=qwen2_moe_model_dir) + cleanup() + outputs_2 = run_vllm(prompts, sampling_params, tp=2, mtp=2, model_dir=qwen2_moe_model_dir) + cleanup() + generated_text_1 = [output.outputs[0].text for output in outputs_1] + generated_text_2 = [output.outputs[0].text for output in outputs_2] + diff1, diff2 = compute_diff_text(generated_text_1, generated_text_2) + assert diff1 <= eps and diff2 <= eps, ( + f"qwen2_moe generated_1({generated_text_1}) and generated_2{generated_text_2} diff error") diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/common.py b/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/common.py new file mode 100644 index 0000000..d47ad40 --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/common.py @@ -0,0 +1,17 @@ +import logging +from logging import Logger + +def init_logger(name: str) -> Logger: + """Initialize loggers for benchmarks module, + and keep the configuration consistent with the vllm module""" + + logger = logging.getLogger(name) + + vllm_logger = logging.Logger.manager.loggerDict.get('vllm', None) + if vllm_logger: + logger.setLevel(vllm_logger.level) + logger.propagate = vllm_logger.propagate + logger.handlers = vllm_logger.handlers + + return 
logger + diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/config.py b/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/config.py new file mode 100644 index 0000000..c416afd --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/config.py @@ -0,0 +1,110 @@ +import torch +from vllm.config import ParallelConfig, TokenizerPoolConfig +from typing import TYPE_CHECKING, ClassVar, List, Optional, Tuple, Type, Union +from vllm.logger import init_logger +from vllm.utils import cuda_device_count_stateless +from vllm.platforms import current_platform +from vllm_mlu.mlu_hijack_utils import MluHijackObject +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + + from vllm.executor.executor_base import ExecutorBase + +logger = init_logger(__name__) + + +def vllm__config__ParallelConfig___init__( + self, + pipeline_parallel_size: int, + tensor_parallel_size: int, + worker_use_ray: Optional[bool] = None, + max_parallel_loading_workers: Optional[int] = None, + disable_custom_all_reduce: bool = False, + tokenizer_pool_config: Optional[TokenizerPoolConfig] = None, + ray_workers_use_nsight: bool = False, + placement_group: Optional["PlacementGroup"] = None, + distributed_executor_backend: Optional[Union[ + str, Type["ExecutorBase"]]] = None, +) -> None: + self.pipeline_parallel_size = pipeline_parallel_size + self.tensor_parallel_size = tensor_parallel_size + self.distributed_executor_backend = distributed_executor_backend + self.max_parallel_loading_workers = max_parallel_loading_workers + self.disable_custom_all_reduce = disable_custom_all_reduce + self.tokenizer_pool_config = tokenizer_pool_config + self.ray_workers_use_nsight = ray_workers_use_nsight + self.placement_group = placement_group + + ''' + ========================== + Modify by vllm_mlu + ========================== + @brief: modify world_size + ''' + self.context_parallel_size = self.context_parallel_size + self.moe_tp_size = self.moe_tp_size + 
self.moe_ep_size = self.moe_ep_size + + self.world_size = pipeline_parallel_size * tensor_parallel_size * self.context_parallel_size + ''' + ======================= + End of MLU Hijack + ======================= + ''' + if worker_use_ray: + if self.distributed_executor_backend is None: + self.distributed_executor_backend = "ray" + elif not self.use_ray: + raise ValueError(f"worker-use-ray can't be used with " + f"distributed executor backend " + f"'{self.distributed_executor_backend}'.") + + if current_platform.is_tpu() and self.world_size > 1: + if self.distributed_executor_backend is None: + self.distributed_executor_backend = "ray" + if self.distributed_executor_backend != "ray": + raise ValueError( + "TPU backend only supports Ray for distributed inference.") + + if current_platform.is_hpu() and self.world_size > 1: + if self.distributed_executor_backend is None: + self.distributed_executor_backend = "ray" + if self.distributed_executor_backend != "ray": + raise ValueError( + "HPU backend only supports Ray for distributed inference.") + + if self.distributed_executor_backend is None and self.world_size > 1: + # We use multiprocessing by default if world_size fits on the + # current node and we aren't in a ray placement group. 
+ + from vllm.executor import ray_utils + backend = "mp" + ray_found = ray_utils.ray_is_available() + if (current_platform.is_cuda() + and cuda_device_count_stateless() < self.world_size): + if not ray_found: + raise ValueError("Unable to load Ray which is " + "required for multi-node inference, " + "please install Ray with `pip install " + "ray`.") from ray_utils.ray_import_err + backend = "ray" + elif ray_found: + if self.placement_group: + backend = "ray" + else: + from ray import is_initialized as ray_is_initialized + if ray_is_initialized(): + from ray.util import get_current_placement_group + if get_current_placement_group(): + backend = "ray" + self.distributed_executor_backend = backend + logger.info("Defaulting to use %s for distributed inference", + backend) + + self._verify_args() + self.rank: int = 0 + + +MluHijackObject.apply_hijack(ParallelConfig, + ParallelConfig.__init__, + vllm__config__ParallelConfig___init__) diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/distributed/__init__.py b/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/distributed/__init__.py new file mode 100644 index 0000000..0f2b0e3 --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/distributed/__init__.py @@ -0,0 +1,2 @@ +from . import communication_op +from . 
def tensor_model_parallel_all_reduce(input_: torch.Tensor, tp_group: Any = None) -> torch.Tensor:
    """All-reduce ``input_`` across a parallel group.

    ``tp_group`` is forwarded to ``get_tp_group``; the result of that call is
    the group whose ``all_reduce`` is used.
    """
    group = get_tp_group(tp_group)
    return group.all_reduce(input_)


def tensor_model_parallel_all_gather(input_: torch.Tensor,
                                     dim: int = -1,
                                     tp_group: Any = None) -> torch.Tensor:
    """All-gather ``input_`` along ``dim`` across a parallel group.

    ``tp_group`` is forwarded to ``get_tp_group``; pass it by keyword, since
    the second positional parameter is ``dim``.
    """
    group = get_tp_group(tp_group)
    return group.all_gather(input_, dim)


def tensor_model_parallel_gather(input_: torch.Tensor,
                                 dst: int = 0,
                                 dim: int = -1,
                                 tp_group: Any = None) -> Optional[torch.Tensor]:
    """Gather ``input_`` along ``dim`` onto rank ``dst`` of a parallel group.

    ``tp_group`` is forwarded to ``get_tp_group``; the result of that call is
    the group whose ``gather`` is used.
    """
    group = get_tp_group(tp_group)
    return group.gather(input_, dst, dim)
def get_tp_group(tp_group: Any = None) -> GroupCoordinator:
    """Return ``tp_group`` when given, else vLLM's tensor-parallel group.

    Raises:
        AssertionError: if ``tp_group`` is None and the upstream ``_TP`` group
            has not been initialized.
    """
    if tp_group is not None:
        return tp_group
    assert parallel_state_org._TP is not None, ("tensor model parallel group is not initialized")
    return parallel_state_org._TP


# Context parallel group, created by initialize_model_parallel().
_CP: Optional[GroupCoordinator] = None


def get_cp_group() -> GroupCoordinator:
    """Return the context parallel group; asserts that it is initialized."""
    assert _CP is not None, ("context parallel group is not initialized")
    return _CP


# kept for backward compatibility
get_context_model_parallel_group = get_cp_group

# MoE tensor parallel group, created by initialize_model_parallel().
_MOE_TP: Optional[GroupCoordinator] = None


def get_moe_tp_group() -> GroupCoordinator:
    """Return the MoE tensor parallel group; asserts that it is initialized."""
    assert _MOE_TP is not None, ("moe tensor parallel group is not initialized")
    return _MOE_TP


# kept for backward compatibility
get_moe_tensor_parallel_group = get_moe_tp_group

# MoE expert parallel group, created by initialize_model_parallel().
_MOE_EP: Optional[GroupCoordinator] = None


def get_moe_ep_group() -> GroupCoordinator:
    """Return the MoE expert parallel group; asserts that it is initialized."""
    assert _MOE_EP is not None, ("moe expert parallel group is not initialized")
    return _MOE_EP


# kept for backward compatibility
get_moe_expert_parallel_group = get_moe_ep_group


def initialize_model_parallel(
    parallel_config: ParallelConfig,
    backend: Optional[str] = None,
) -> None:
    """Initialize the TP, PP, CP, MoE-TP and MoE-EP process groups.

    All sizes are read from ``parallel_config`` (``tensor_parallel_size``,
    ``pipeline_parallel_size``, ``context_parallel_size``, ``moe_tp_size``
    and ``moe_ep_size``).

    Rank layout used below: tensor-parallel groups are runs of consecutive
    ranks, and pipeline-parallel groups stride by ``world_size // pp``. For
    example, with 8 ranks, tp=2 and pp=4 (cp=1):
        4 tensor model-parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
        2 pipeline model-parallel groups:
            [g0, g2, g4, g6], [g1, g3, g5, g7]

    Args:
        parallel_config: source of every parallel size.
        backend: torch.distributed backend; defaults to the backend of the
            world group's device group.

    Raises:
        RuntimeError: if ``world_size != tp * pp * cp``, or if
            ``tp != moe_tp * moe_ep`` (both MoE sizes must be >= 1).
        AssertionError: if torch.distributed is not initialized or any of
            the groups already exists.
    """
    global _CP, _MOE_TP, _MOE_EP

    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()
    world_size: int = torch.distributed.get_world_size()
    backend = backend or torch.distributed.get_backend(
        get_world_group().device_group)

    # MLU hijack: sizes come from parallel_config and are validated here.
    tensor_model_parallel_size = parallel_config.tensor_parallel_size
    pipeline_model_parallel_size = parallel_config.pipeline_parallel_size
    context_model_parallel_size = parallel_config.context_parallel_size
    moe_tensor_parallel_size = parallel_config.moe_tp_size
    moe_expert_parallel_size = parallel_config.moe_ep_size

    expected_world_size = (tensor_model_parallel_size *
                           pipeline_model_parallel_size *
                           context_model_parallel_size)
    if world_size != expected_world_size:
        raise RuntimeError(
            f"world_size ({world_size}) is not equal to "
            f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
            f"pipeline_model_parallel_size ({pipeline_model_parallel_size}) x "
            f"context_model_parallel_size ({context_model_parallel_size})")

    if (moe_tensor_parallel_size < 1 or moe_expert_parallel_size < 1
            or tensor_model_parallel_size !=
            moe_tensor_parallel_size * moe_expert_parallel_size):
        # Report the tensor-parallel size itself (the original message
        # printed world_size here by mistake).
        raise RuntimeError(
            f"tensor_model_parallel_size ({tensor_model_parallel_size}) is not equal to "
            f"moe_tensor_parallel_size ({moe_tensor_parallel_size}) x "
            f"moe_expert_parallel_size ({moe_expert_parallel_size})")

    local_rank = get_world_group().local_rank

    # Build the tensor model-parallel groups.
    num_tensor_model_parallel_groups: int = (world_size //
                                             tensor_model_parallel_size)
    assert parallel_state_org._TP is None, ("tensor model parallel group is already initialized")
    group_ranks = [
        list(range(i * tensor_model_parallel_size,
                   (i + 1) * tensor_model_parallel_size))
        for i in range(num_tensor_model_parallel_groups)
    ]
    # message queue broadcaster is only used in tensor model parallel group
    parallel_state_org._TP = init_model_parallel_group(group_ranks,
                                                       local_rank,
                                                       backend,
                                                       use_message_queue_broadcaster=True,
                                                       group_name="tp")

    # Build the pipeline model-parallel groups.
    num_pipeline_model_parallel_groups: int = (world_size //
                                               pipeline_model_parallel_size)
    assert parallel_state_org._PP is None, (
        "pipeline model parallel group is already initialized")
    group_ranks = [
        list(range(i, world_size, num_pipeline_model_parallel_groups))
        for i in range(num_pipeline_model_parallel_groups)
    ]
    # pipeline parallel does not need custom allreduce
    parallel_state_org._PP = init_model_parallel_group(group_ranks,
                                                       local_rank,
                                                       backend,
                                                       use_custom_allreduce=False,
                                                       group_name="pp")

    # MLU hijack: add _CP, _MOE_TP and _MOE_EP.

    # Build the context parallel groups.
    num_context_model_parallel_groups: int = (world_size //
                                              context_model_parallel_size)
    assert _CP is None, (
        "context parallel group is already initialized")
    # Each pipeline stage owns tp * cp consecutive ranks; inside a stage the
    # context-parallel peers of a rank are tp apart. Offsetting by the stage
    # keeps groups disjoint when pp > 1 (identical to the previous formula
    # when pp == 1).
    stage_size = tensor_model_parallel_size * context_model_parallel_size
    group_ranks = []
    for i in range(num_context_model_parallel_groups):
        stage, tp_offset = divmod(i, tensor_model_parallel_size)
        first_rank = stage * stage_size + tp_offset
        group_ranks.append(
            list(range(first_rank, first_rank + stage_size,
                       tensor_model_parallel_size)))
    # message queue broadcaster is set to be used in context parallel group
    _CP = init_model_parallel_group(group_ranks,
                                    local_rank,
                                    backend,
                                    use_message_queue_broadcaster=True,
                                    group_name="cp")

    # Build the moe tensor parallel groups: within each TP group, ranks that
    # share the same offset modulo moe_ep_size.
    assert _MOE_TP is None, ("moe tensor parallel group is already initialized")
    group_ranks = []
    for i in range(num_tensor_model_parallel_groups):
        tp_first = i * tensor_model_parallel_size
        tp_end = tp_first + tensor_model_parallel_size
        for j in range(moe_expert_parallel_size):
            group_ranks.append(
                list(range(tp_first + j, tp_end, moe_expert_parallel_size)))
    # message queue broadcaster is set to be used in moe tensor parallel group
    _MOE_TP = init_model_parallel_group(group_ranks,
                                        local_rank,
                                        backend,
                                        use_message_queue_broadcaster=True,
                                        group_name="moe_tp")

    # Build the moe expert parallel groups: within each TP group, runs of
    # moe_ep_size consecutive ranks.
    assert _MOE_EP is None, ("moe expert parallel group is already initialized")
    group_ranks = []
    for i in range(num_tensor_model_parallel_groups):
        tp_first = i * tensor_model_parallel_size
        for j in range(moe_tensor_parallel_size):
            ep_first = tp_first + j * moe_expert_parallel_size
            # Materialize as a list, like every other group above.
            group_ranks.append(
                list(range(ep_first, ep_first + moe_expert_parallel_size)))
    # message queue broadcaster is set to be used in moe expert parallel group
    _MOE_EP = init_model_parallel_group(group_ranks,
                                        local_rank,
                                        backend,
                                        use_message_queue_broadcaster=True,
                                        group_name="moe_ep")


def ensure_model_parallel_initialized(
    parallel_config: ParallelConfig,
    backend: Optional[str] = None,
) -> None:
    """Initialize the parallel groups, or verify already-built ones.

    If the groups are not initialized they are created from
    ``parallel_config``. Otherwise every existing group's world size is
    asserted to equal the matching ``parallel_config`` value.

    Raises:
        AssertionError: if an existing group has an unexpected size.
    """
    backend = backend or torch.distributed.get_backend(
        get_world_group().device_group)
    if not model_parallel_is_initialized():
        # MLU hijack: all parallel sizes are carried by parallel_config.
        initialize_model_parallel(parallel_config, backend)
        return

    # MLU hijack: compare against parallel_config. The fields are named
    # tensor_parallel_size / pipeline_parallel_size, as read in
    # initialize_model_parallel().
    assert (
        get_tensor_model_parallel_world_size() == parallel_config.tensor_parallel_size
    ), ("tensor parallel group already initialized, but of unexpected size: "
        f"{get_tensor_model_parallel_world_size()=} vs. "
        f"{parallel_config.tensor_parallel_size=}")
    pp_world_size = get_pp_group().world_size
    assert (pp_world_size == parallel_config.pipeline_parallel_size), (
        "pipeline parallel group already initialized, but of unexpected size: "
        f"{pp_world_size=} vs. "
        f"{parallel_config.pipeline_parallel_size=}")
    cp_world_size = get_cp_group().world_size
    assert (cp_world_size == parallel_config.context_parallel_size), (
        "context parallel group already initialized, but of unexpected size: "
        f"{cp_world_size=} vs. "
        f"{parallel_config.context_parallel_size=}")
    moe_tp_world_size = get_moe_tp_group().world_size
    assert (moe_tp_world_size == parallel_config.moe_tp_size), (
        "moe tensor parallel group already initialized, but of unexpected size: "
        f"{moe_tp_world_size=} vs. "
        f"{parallel_config.moe_tp_size=}")
    moe_ep_world_size = get_moe_ep_group().world_size
    assert (moe_ep_world_size == parallel_config.moe_ep_size), (
        "moe expert parallel group already initialized, but of unexpected size: "
        f"{moe_ep_world_size=} vs. "
        f"{parallel_config.moe_ep_size=}")


def model_parallel_is_initialized():
    """Check if tensor, pipeline, context, moe parallel groups are initialized."""
    # The upstream check must be *called*; the bare function object is
    # always truthy and would hide missing TP/PP groups.
    return (model_parallel_is_initialized_org() and _CP is not None
            and _MOE_TP is not None and _MOE_EP is not None)


def destroy_model_parallel():
    """Set the groups to none and destroy them."""
    global _CP, _MOE_TP, _MOE_EP

    destroy_model_parallel_org()

    if _CP:
        _CP.destroy()
    _CP = None

    if _MOE_TP:
        _MOE_TP.destroy()
    _MOE_TP = None

    if _MOE_EP:
        _MOE_EP.destroy()
    _MOE_EP = None


def get_context_model_parallel_world_size():
    """Return world size for the context parallel group."""
    return get_cp_group().world_size


def get_context_model_parallel_rank():
    """Return my rank for the context parallel group."""
    return get_cp_group().rank_in_group


def get_moe_tensor_parallel_world_size():
    """Return world size for the moe tensor parallel group."""
    return get_moe_tp_group().world_size


def get_moe_tensor_parallel_rank():
    """Return my rank for the moe tensor parallel group."""
    return get_moe_tp_group().rank_in_group


def get_moe_expert_parallel_world_size():
    """Return world size for the moe expert parallel group."""
    return get_moe_ep_group().world_size


def get_moe_expert_parallel_rank():
    """Return my rank for the moe expert parallel group."""
    return get_moe_ep_group().rank_in_group


def get_parallel_world_size_with_group(group):
    """Return ``group.world_size``, or the tensor-parallel world size if None."""
    if group is not None:
        return group.world_size
    return get_tensor_model_parallel_world_size()


def get_parallel_rank_with_group(group):
    """Return ``group.rank_in_group``, or the tensor-parallel rank if None."""
    if group is not None:
        return group.rank_in_group
    return get_tensor_model_parallel_rank()
logger = init_logger(__name__)


# Originals captured before the hijacks below replace them.
vllm__engine__arg_utils__EngineArgs__create_engine_config_org = EngineArgs.create_engine_config
vllm__engine__arg_utils__EngineArgs__add_cli_args_org = EngineArgs.add_cli_args
vllm__engine__arg_utils__EngineArgs__from_cli_args_org = EngineArgs.from_cli_args
vllm__engine__arg_utils__AsyncEngineArgs__from_cli_args_org = AsyncEngineArgs.from_cli_args


def vllm__engine__arg_utils__EngineArgs__create_engine_config(self, ) -> VllmConfig:
    """Build the engine config with MLU-specific adjustments.

    Before delegating to the original ``create_engine_config`` this:
      * forces ``max_num_seqs = 1`` when CHUNKED_PIPELINE_PARALLEL_EN is set;
      * disables custom all-reduce and records the block size;
      * defaults ``context_parallel_size`` / ``moe_tp_size`` /
        ``moe_ep_size`` (1 / -1 / -1) when they are absent on ``self``;
      * resolves the MoE sizes and publishes them, together with the context
        parallel size, as class attributes of ``ParallelConfig``.

    Raises:
        ValueError: for chunked prefill in unpaged mode, or for context /
            expert parallel settings that are not enabled or not supported.
        AssertionError: if ``moe_tp_size * moe_ep_size`` does not equal
            ``tensor_parallel_size`` after resolution.
    """
    # chunked parallel pipeline only support batch size = 1 yet.
    if CHUNKED_PIPELINE_PARALLEL_EN:
        self.max_num_seqs = 1
        logger.info("Reset max_num_seqs to 1 as the chunked parallel pipeline mode "
                    "only supports batch size to 1.")

    # MLU not support custom all reduce; re-set block_size to support paged
    # and unpaged mode.
    self.disable_custom_all_reduce = True
    BlockSizeInfo.set_block_size(self.block_size)
    if not USE_PAGED and self.enable_chunked_prefill:
        raise ValueError("Not support chunked_prefill in unpaged mode.")

    # set parallel_config context_parallel_size, moe_tp_size, moe_ep_size
    self.context_parallel_size = getattr(self, "context_parallel_size", 1)
    self.moe_tp_size = getattr(self, "moe_tp_size", -1)
    self.moe_ep_size = getattr(self, "moe_ep_size", -1)

    # check context parallel whether supported or not
    if CONTEXT_PARALLEL_EN:
        if self.context_parallel_size > 1 and get_device_major_capability() == 3:
            raise ValueError('Context parallel does not support MLU370.')
    elif self.context_parallel_size > 1:
        raise ValueError('Context parallel does not support when CONTEXT_PARALLEL_EN=False')
    # check expert parallel whether supported or not
    if not EXPERT_PARALLEL_EN and (self.moe_tp_size > 1 or self.moe_ep_size > 1):
        raise ValueError('Expert parallel does not support when EXPERT_PARALLEL_EN=False')

    ParallelConfig.context_parallel_size = self.context_parallel_size

    # Resolve moe_tp_size / moe_ep_size: a value < 1 means "not given" and is
    # derived from tensor_parallel_size and the other value.
    tp_size = self.tensor_parallel_size
    tp_given = self.moe_tp_size >= 1
    ep_given = self.moe_ep_size >= 1
    if tp_given and ep_given:
        moe_tp_size = self.moe_tp_size
        moe_ep_size = self.moe_ep_size
    elif tp_given:
        moe_tp_size = self.moe_tp_size
        moe_ep_size = tp_size // moe_tp_size
    elif ep_given:
        moe_ep_size = self.moe_ep_size
        moe_tp_size = tp_size // moe_ep_size
    else:
        moe_tp_size = tp_size
        moe_ep_size = 1
    # Report the resolved sizes (the raw inputs may still be -1 here).
    assert moe_tp_size * moe_ep_size == tp_size, (
        f"tensor_parallel_size ({tp_size}) is not equal to "
        f"moe_tp_size ({moe_tp_size}) x moe_ep_size ({moe_ep_size}); "
        "leave both at -1, or set only one of them so the other is derived")

    ParallelConfig.moe_tp_size = moe_tp_size
    ParallelConfig.moe_ep_size = moe_ep_size

    engine_config = vllm__engine__arg_utils__EngineArgs__create_engine_config_org(self)
    engine_config.cache_config.block_size = BlockSizeInfo.BLOCK_SIZE
    return engine_config


@staticmethod
def vllm__engine__arg_utils__EngineArgs__add_cli_args(
        parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
    """Add the original CLI arguments plus the MLU parallelism options.

    Adds ``--context-parallel-size`` (``-cp``), ``--moe-tp-size`` and
    ``--moe-ep-size`` and returns the parser.
    """
    parser = vllm__engine__arg_utils__EngineArgs__add_cli_args_org(parser)
    # MLU hijack: add --context-parallel-size, --moe-tp-size and --moe-ep-size
    parser.add_argument('--context-parallel-size',
                        '-cp',
                        type=int,
                        default=1,
                        help='number of context parallel replicas')
    parser.add_argument('--moe-tp-size',
                        type=int,
                        default=-1,
                        help='Number of moe tensor parallel replicas')
    parser.add_argument('--moe-ep-size',
                        type=int,
                        default=-1,
                        help='Number of moe expert parallel replicas')
    return parser


@classmethod
def vllm__engine__arg_utils__EngineArgs__from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
    """Build engine args from ``args`` and copy over the MLU parallel sizes.

    Dispatches to the captured original ``from_cli_args`` of
    ``AsyncEngineArgs`` or ``EngineArgs`` depending on ``cls``.
    """
    if cls == AsyncEngineArgs:
        engine_args = vllm__engine__arg_utils__AsyncEngineArgs__from_cli_args_org(args)
    else:
        engine_args = vllm__engine__arg_utils__EngineArgs__from_cli_args_org(args)
    engine_args.context_parallel_size = args.context_parallel_size
    engine_args.moe_tp_size = args.moe_tp_size
    engine_args.moe_ep_size = args.moe_ep_size
    return engine_args


MluHijackObject.apply_hijack(EngineArgs,
                             EngineArgs.create_engine_config,
                             vllm__engine__arg_utils__EngineArgs__create_engine_config)
MluHijackObject.apply_hijack(EngineArgs,
                             EngineArgs.add_cli_args,
                             vllm__engine__arg_utils__EngineArgs__add_cli_args)
# Remaining arg_utils registrations: both classes share the same
# from_cli_args replacement.
MluHijackObject.apply_hijack(EngineArgs,
                             EngineArgs.from_cli_args,
                             vllm__engine__arg_utils__EngineArgs__from_cli_args)
MluHijackObject.apply_hijack(AsyncEngineArgs,
                             AsyncEngineArgs.from_cli_args,
                             vllm__engine__arg_utils__EngineArgs__from_cli_args)


logger = init_logger(__name__)


# Original constructor, captured before the hijack below replaces it.
vllm__entrypoints__llm__LLM____init__org = LLM.__init__


def vllm__entrypoints__llm__LLM____init__(
    self,
    model: str,
    tokenizer: Optional[str] = None,
    tokenizer_mode: str = "auto",
    skip_tokenizer_init: bool = False,
    trust_remote_code: bool = False,
    allowed_local_media_path: str = "",
    tensor_parallel_size: int = 1,
    dtype: str = "auto",
    quantization: Optional[str] = None,
    revision: Optional[str] = None,
    tokenizer_revision: Optional[str] = None,
    seed: int = 0,
    gpu_memory_utilization: float = 0.9,
    swap_space: float = 4,
    cpu_offload_gb: float = 0,
    enforce_eager: Optional[bool] = None,
    max_seq_len_to_capture: int = 8192,
    disable_custom_all_reduce: bool = False,
    disable_async_output_proc: bool = False,
    hf_overrides: Optional[HfOverrides] = None,
    mm_processor_kwargs: Optional[Dict[str, Any]] = None,
    # After positional args are removed, move this right below `model`
    task: TaskOption = "auto",
    override_pooler_config: Optional[PoolerConfig] = None,
    **kwargs,
) -> None:
    """LLM constructor with context-parallel and MoE-parallel options.

    ``context_parallel_size``, ``moe_tp_size`` and ``moe_ep_size`` are popped
    from ``kwargs`` (defaults 1, -1, -1) and stored as class attributes on
    ``EngineArgs``; everything else is forwarded unchanged to the original
    constructor.

    Note: if enforce_eager is unset (enforce_eager is None)
    it defaults to False.
    """
    # MLU hijack: pull the cp / moe options out of kwargs so the original
    # constructor never sees them.
    mlu_option_defaults = (
        ("context_parallel_size", 1),
        ("moe_tp_size", -1),
        ("moe_ep_size", -1),
    )
    for option, default in mlu_option_defaults:
        setattr(EngineArgs, option, kwargs.pop(option, default))

    vllm__entrypoints__llm__LLM____init__org(
        self=self,
        model=model,
        tokenizer=tokenizer,
        tokenizer_mode=tokenizer_mode,
        skip_tokenizer_init=skip_tokenizer_init,
        trust_remote_code=trust_remote_code,
        allowed_local_media_path=allowed_local_media_path,
        tensor_parallel_size=tensor_parallel_size,
        dtype=dtype,
        quantization=quantization,
        revision=revision,
        tokenizer_revision=tokenizer_revision,
        seed=seed,
        gpu_memory_utilization=gpu_memory_utilization,
        swap_space=swap_space,
        cpu_offload_gb=cpu_offload_gb,
        enforce_eager=enforce_eager,
        max_seq_len_to_capture=max_seq_len_to_capture,
        disable_custom_all_reduce=disable_custom_all_reduce,
        disable_async_output_proc=disable_async_output_proc,
        hf_overrides=hf_overrides,
        mm_processor_kwargs=mm_processor_kwargs,
        # After positional args are removed, move this right below `model`
        task=task,
        override_pooler_config=override_pooler_config,
        **kwargs
    )


MluHijackObject.apply_hijack(LLM,
                             LLM.__init__,
                             vllm__entrypoints__llm__LLM____init__)
a/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/mlu_hijack.py b/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/mlu_hijack.py new file mode 100644 index 0000000..f1f0a15 --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/mlu_hijack.py @@ -0,0 +1,7 @@ +print("Apply Custom VLLM Demo!") +from . import distributed +from . import engine +from . import entrypoints +from . import worker +from . import config +from . import model_executor diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/model_executor/__init__.py b/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/model_executor/__init__.py new file mode 100644 index 0000000..9bc259d --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/model_executor/__init__.py @@ -0,0 +1,2 @@ +from . import layers +from . import parameter diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/model_executor/layers/__init__.py b/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/model_executor/layers/__init__.py new file mode 100644 index 0000000..a02d236 --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/model_executor/layers/__init__.py @@ -0,0 +1,2 @@ +from . import linear +from . 
logger = init_logger(__name__)


def vllm__mlu_hijack__model_executor__layers__feed_forward__FeedForward____init__(
    self,
    hidden_size: int,
    intermediate_size: int,
    hidden_act: str,
    up_proj_name: str,
    is_gated: bool,
    down_proj_name: str,
    bias: bool,
    quant_config: Optional[QuantizationConfig] = None,
    skip_bias_add: bool = False,
    reduce_results: bool = True,
    prefix: str = "",
    tp_group: Any = None,
):
    """Replacement ``FeedForward.__init__`` that threads ``tp_group`` through.

    Builds the up projection (merged gate+up when ``is_gated``) and the down
    projection, registering them under ``up_proj_name`` / ``down_proj_name``.
    ``tp_group`` is passed to every linear layer; ``tp_size`` / ``tp_rank``
    are taken from it via the ``*_with_group`` helpers.
    """
    super(FeedForward, self).__init__()

    self.hidden_size = hidden_size
    self.hidden_act = hidden_act
    self.is_gated = is_gated
    self.bias = bias
    self.up_proj_name = up_proj_name
    self.down_proj_name = down_proj_name
    self.quant_config = quant_config
    self.is_initialized = False
    self.skip_bias_add = skip_bias_add
    self.reduce_results = reduce_results
    self.use_bt_ffn = quant_config is None
    set_is_gated(self.is_gated)
    self.tp_size = get_parallel_world_size_with_group(tp_group)
    self.tp_rank = get_parallel_rank_with_group(tp_group)

    # MLU hijack: add tp_group parameter at the end of each linear class
    self.tp_group = tp_group

    # up_proj with gate or not
    up_prefix = f"{prefix}.{up_proj_name}"
    if self.is_gated:
        up_proj = MergedColumnParallelLinear(hidden_size,
                                             [intermediate_size] * 2,
                                             bias=bias,
                                             quant_config=quant_config,
                                             prefix=up_prefix,
                                             tp_group=tp_group)
    else:
        up_proj = ColumnParallelLinear(hidden_size,
                                       intermediate_size,
                                       bias=bias,
                                       skip_bias_add=skip_bias_add,
                                       quant_config=quant_config,
                                       prefix=up_prefix,
                                       tp_group=tp_group)
    self.register_module(up_proj_name, up_proj)

    # down_proj
    down_prefix = f"{prefix}.{down_proj_name}"
    down_proj = RowParallelLinear(intermediate_size,
                                  hidden_size,
                                  bias=bias,
                                  skip_bias_add=skip_bias_add,
                                  reduce_results=reduce_results,
                                  quant_config=quant_config,
                                  prefix=down_prefix,
                                  tp_group=tp_group)
    self.register_module(down_proj_name, down_proj)


MluHijackObject.apply_hijack(FeedForward,
                             FeedForward.__init__,
                             vllm__mlu_hijack__model_executor__layers__feed_forward__FeedForward____init__)
# Original constructor, captured so the replacement below can delegate to it.
vllm__model_executor__layers__linear__LinearBase____init__org = LinearBase.__init__

logger = init_logger(__name__)


def vllm__model_executor__layers__linear__LinearBase____init__(
    self,
    input_size: int,
    output_size: int,
    skip_bias_add: bool = False,
    params_dtype: Optional[torch.dtype] = None,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
    tp_group: Any = None,
):
    """Run the original ``LinearBase.__init__`` and record the parallel group.

    Sets ``self.tp_group``, ``self.tp_world_size`` and ``self.tp_rank``; the
    latter two come from ``tp_group`` via the ``*_with_group`` helpers.
    """
    vllm__model_executor__layers__linear__LinearBase____init__org(self=self,
                                                                  input_size=input_size,
                                                                  output_size=output_size,
                                                                  skip_bias_add=skip_bias_add,
                                                                  params_dtype=params_dtype,
                                                                  quant_config=quant_config,
                                                                  prefix=prefix)
    # MLU hijack: add self.tp_group, world_size and tp_rank to support moe
    # expert parallel.
    self.tp_group = tp_group
    self.tp_world_size = get_parallel_world_size_with_group(self.tp_group)
    self.tp_rank = get_parallel_rank_with_group(self.tp_group)


def vllm__model_executor__layers__linear__ColumnParallelLinear____init__(
    self,
    input_size: int,
    output_size: int,
    bias: bool = True,
    gather_output: bool = False,
    skip_bias_add: bool = False,
    params_dtype: Optional[torch.dtype] = None,
    quant_config: Optional[QuantizationConfig] = None,
    output_sizes: Optional[List[int]] = None,
    prefix: str = "",
    tp_group: Any = None,
):
    """Replacement ``ColumnParallelLinear.__init__`` using ``self.tp_world_size``.

    The output dimension is divided by ``self.tp_world_size`` (set by the
    base constructor from ``tp_group``). When ``output_sizes`` is given, each
    entry is asserted to be divisible by that size.
    """
    super(ColumnParallelLinear, self).__init__(input_size, output_size, skip_bias_add, params_dtype,
                                               quant_config, prefix, tp_group)

    self.gather_output = gather_output

    # Divide the weight matrix along the last dimension.
    # MLU hijack: use self.tp_world_size instead of
    # get_tensor_model_parallel_world_size(); the output_sizes check was
    # moved here from MergedColumnParallelLinear.
    tp_size = self.tp_world_size
    if output_sizes is not None:
        assert all(output_size_var % tp_size == 0 for output_size_var in output_sizes)

    assert self.quant_method is not None
    self.output_size_per_partition = divide(self.output_size, tp_size)
    self.output_partition_sizes = [self.output_size_per_partition]
    # If QKV or MergedColumn, use output size of each partition.
    if hasattr(self, "output_sizes"):
        self.output_partition_sizes = [
            divide(output_size, tp_size)
            for output_size in self.output_sizes
        ]

    use_loader_v2 = (self.quant_method.__class__.__name__
                     in WEIGHT_LOADER_V2_SUPPORTED)
    self.quant_method.create_weights(
        layer=self,
        input_size_per_partition=self.input_size,
        output_partition_sizes=self.output_partition_sizes,
        input_size=self.input_size,
        output_size=self.output_size,
        params_dtype=self.params_dtype,
        weight_loader=(self.weight_loader_v2
                       if use_loader_v2 else self.weight_loader))
    if bias:
        self.bias = Parameter(
            torch.empty(self.output_size_per_partition,
                        dtype=params_dtype))
        set_weight_attrs(self.bias, {
            "output_dim": 0,
            "weight_loader": self.weight_loader,
        })
    else:
        self.register_parameter("bias", None)


def vllm__model_executor__layers__linear__ColumnParallelLinear__weight_loader(
        self, param: Parameter, loaded_weight: torch.Tensor):
    """Copy this rank's shard of ``loaded_weight`` into ``param``.

    The shard is selected with ``self.tp_rank`` along ``param.output_dim``
    (when that attribute is set and the parameter is not bitsandbytes 4-bit).

    Raises:
        AssertionError: if the (narrowed) weight shape differs from the
            parameter's shape.
    """
    # MLU hijack: use self.tp_rank instead of get_tensor_model_parallel_rank()
    tp_rank = self.tp_rank
    output_dim = getattr(param, "output_dim", None)

    # Special case for GGUF
    is_gguf_weight = getattr(param, "is_gguf_weight", False)
    is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
    if is_gguf_weight_type:
        param.weight_type = loaded_weight.item()

    # Materialize GGUF UninitializedParameter
    if is_gguf_weight and isinstance(param, UninitializedParameter):
        param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype)

    use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)

    param_data = param.data
    # bitsandbytes loads the weights of the specific portion
    # no need to narrow here
    if output_dim is not None and not use_bitsandbytes_4bit:
        shard_size = param_data.shape[output_dim]
        start_idx = tp_rank * shard_size
        loaded_weight = loaded_weight.narrow(output_dim, start_idx,
                                             shard_size)

    # Special case for loading scales off disk, which often do not
    # have a shape (such as in the case of AutoFP8).
    if len(loaded_weight.shape) == 0:
        loaded_weight = loaded_weight.reshape(1)

    assert param_data.shape == loaded_weight.shape
    param_data.copy_(loaded_weight)


def vllm__model_executor__layers__linear__ColumnParallelLinear__forward(
        self, input_, smooth_quant_scale: Optional[torch.Tensor] = None):
    """Apply the layer and optionally all-gather the result.

    Args:
        input_: input passed to ``self.quant_method.apply``.
        smooth_quant_scale: when not None, forwarded as ``input_scale``.

    Returns:
        ``(output, output_bias)``; ``output_bias`` is ``self.bias`` when
        ``skip_bias_add`` is set (the bias is then not applied here), else
        None.
    """
    bias = self.bias if not self.skip_bias_add else None

    # Matrix multiply.
    assert self.quant_method is not None
    # MLU hijack: add input_scale parameter.
    if smooth_quant_scale is not None:
        output_parallel = self.quant_method.apply(self, input_, bias,
                                                  input_scale=smooth_quant_scale)
    else:
        output_parallel = self.quant_method.apply(self, input_, bias)

    if self.gather_output:
        # All-gather across the partitions.
        # The group must be passed by keyword: the second positional
        # parameter of tensor_model_parallel_all_gather is `dim`, so passing
        # it positionally would gather along a bogus dimension and ignore
        # the group.
        output = tensor_model_parallel_all_gather(output_parallel,
                                                  tp_group=self.tp_group)
    else:
        output = output_parallel
    output_bias = self.bias if self.skip_bias_add else None
    return output, output_bias


def vllm__model_executor__layers__linear__ColumnParallelLinear__extra_repr(self) -> str:
    """Return the layer summary shown in the module repr."""
    # MLU hijack: tp_size is self.tp_world_size rather than
    # get_tensor_model_parallel_world_size().
    fields = (
        f"in_features={self.input_size}",
        f"output_features={self.output_size_per_partition}",
        f"bias={self.bias is not None}",
        f"tp_size={self.tp_world_size}",
        f"gather_output={self.gather_output}",
    )
    return ", ".join(fields)
skip_bias_add=skip_bias_add, + params_dtype=params_dtype, + quant_config=quant_config, + output_sizes=self.output_sizes, + prefix=prefix, + tp_group=tp_group) + + +def vllm__model_executor__layers__linear__MergedColumnParallelLinear__weight_loader(self, + param: Parameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[int] = None): + # Special case for GGUF + # initialize GGUF param after we know the quantize type + is_gguf_weight = getattr(param, "is_gguf_weight", False) + is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) + if is_gguf_weight_type: + param.data[loaded_shard_id].copy_(loaded_weight) + param.shard_weight_type[loaded_shard_id] = loaded_weight.item() + return + + if is_gguf_weight: + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: modify get_tensor_model_parallel_rank() to self.tp_rank + @brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size + ''' + tp_rank = self.tp_rank + tp_size = self.tp_world_size + ''' + ================= + End of MLU Hijack + ================= + ''' + output_dim = getattr(param, "output_dim", None) + shard_size = loaded_weight.size(output_dim) // tp_size + start_idx = tp_rank * shard_size + + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) + + param.shard_id.append(loaded_shard_id) + param.shard_id_map[loaded_shard_id] = len(param.data_container) + param.data_container.append(loaded_weight) + if len(param.data_container) == 2: + self.qweight = param.materialize_nested() + return + + param_data = param.data + output_dim = getattr(param, "output_dim", None) + # Special case for AQLM codebooks. + is_metadata = getattr(param, "is_metadata", False) + # Special case for per-tensor scale to load scalar into fused array. + needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False) + + if loaded_shard_id is None: + # Loaded weight is already fused on disk (qkv/mlp). 
+ if output_dim is None: + if needs_scalar_to_array: + param_data, loaded_weight = adjust_scalar_to_fused_array( + param_data, loaded_weight, 0) + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + return + current_shard_offset = 0 + shard_offsets: List[Tuple[int, int, int]] = [] + for i, output_size in enumerate(self.output_sizes): + shard_offsets.append((i, current_shard_offset, output_size)) + current_shard_offset += output_size + packed_dim = getattr(param, "packed_dim", None) + for shard_id, shard_offset, shard_size in shard_offsets: + # Special case for Quantization. + # If quantized, we need to adjust the offset and size to account + # for the packing. + if packed_dim == output_dim: + shard_size = shard_size // param.pack_factor + shard_offset = shard_offset // param.pack_factor + # Special case for Marlin. + shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset) + + loaded_weight_shard = loaded_weight.narrow( + output_dim, shard_offset, shard_size) + self.weight_loader(param, loaded_weight_shard, shard_id) + return + + assert loaded_shard_id < len(self.output_sizes) + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: modify get_tensor_model_parallel_rank() to self.tp_rank + @brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size + ''' + tp_rank = self.tp_rank + tp_size = self.tp_world_size + ''' + ================= + End of MLU Hijack + ================= + ''' + if output_dim is not None: + shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size + shard_size = self.output_sizes[loaded_shard_id] // tp_size + # Special case for quantization. + # If quantized, we need to adjust the offset and size to account + # for the packing. 
+ packed_dim = getattr(param, "packed_dim", None) + if packed_dim == output_dim: + shard_size = shard_size // param.pack_factor + shard_offset = shard_offset // param.pack_factor + # Special case for Marlin. + shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset) + + use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", + False) + if use_bitsandbytes_4bit: + shard_size = loaded_weight.shape[output_dim] + shard_offset = loaded_weight.shape[output_dim] * \ + loaded_shard_id + + param_data = param_data.narrow(output_dim, shard_offset, + shard_size) + start_idx = tp_rank * shard_size + # bitsandbytes loads the weights of the specific portion + # no need to narrow here + if not use_bitsandbytes_4bit: + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) + # Special case for AQLM codebooks. + elif is_metadata: + # metadata indicates fixed size concatenated along dim 0 + shard_size = loaded_weight.shape[0] + shard_offset = loaded_shard_id * shard_size + param_data = param_data.narrow(0, shard_offset, shard_size) + + # Special case for per-tensor scales in fused case. 
+ elif needs_scalar_to_array: + param_data, loaded_weight = adjust_scalar_to_fused_array( + param_data, loaded_weight, loaded_shard_id) + + else: + ignore_warning = getattr(param, "ignore_warning", False) + if not ignore_warning: + logger.warning( + "Loading a weight without `output_dim` attribute in " + "MergedColumnParallelLinear, assume the weight is " + "the same for all partitions.") + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + +def vllm__model_executor__layers__linear__MergedColumnParallelLinear__weight_loader_v2(self, + param: BasevLLMParameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[int] = None): + if loaded_shard_id is None: + if isinstance(param, PerTensorScaleParameter): + param.load_merged_column_weight(loaded_weight=loaded_weight, + shard_id=0) + return + elif type(param) in (RowvLLMParameter, BasevLLMParameter): + param.load_merged_column_weight(loaded_weight=loaded_weight) + return + # TODO: @dsikka - move to parameter.py + self._load_fused_module_from_checkpoint(param, loaded_weight) + return + + assert loaded_shard_id < len(self.output_sizes) + + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size + ''' + tp_size = self.tp_world_size + ''' + ================= + End of MLU Hijack + ================= + ''' + shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size + shard_size = self.output_sizes[loaded_shard_id] // tp_size + + param.load_merged_column_weight(loaded_weight=loaded_weight, + shard_id=loaded_shard_id, + shard_offset=shard_offset, + shard_size=shard_size) + +def vllm__model_executor__layers__linear__RowParallelLinear____init__( + self, + input_size: int, + output_size: int, + bias: bool = True, + input_is_parallel: bool = True, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + reduce_results: bool = True, + quant_config: 
Optional[QuantizationConfig] = None, + prefix: str = "", + tp_group: Any = None, + ): + super(RowParallelLinear, self).__init__(input_size, output_size, skip_bias_add, params_dtype, + quant_config, prefix, tp_group) + + self.input_is_parallel = input_is_parallel + self.reduce_results = reduce_results + + # Divide the weight matrix along the last dimension. + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: modify get_tensor_model_parallel_rank() to self.tp_rank + @brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size + ''' + self.tp_size = self.tp_world_size + ''' + ================= + End of MLU Hijack + ================= + ''' + self.input_size_per_partition = divide(input_size, self.tp_size) + assert self.quant_method is not None + + self.quant_method.create_weights( + layer=self, + input_size_per_partition=self.input_size_per_partition, + output_partition_sizes=[self.output_size], + input_size=self.input_size, + output_size=self.output_size, + params_dtype=self.params_dtype, + weight_loader=( + self.weight_loader_v2 if self.quant_method.__class__.__name__ + in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader)) + if not reduce_results and (bias and not skip_bias_add): + raise ValueError("When not reduce the results, adding bias to the " + "results can lead to incorrect results") + + if bias: + self.bias = Parameter( + torch.empty(self.output_size, dtype=params_dtype)) + set_weight_attrs(self.bias, { + "output_dim": 0, + "weight_loader": self.weight_loader, + }) + else: + self.register_parameter("bias", None) + + +def vllm__model_executor__layers__linear__RowParallelLinear__weight_loader( + self, param: Parameter, loaded_weight: torch.Tensor): + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: modify get_tensor_model_parallel_rank() to self.tp_rank + @brief: modify get_tensor_model_parallel_world_size() to self.tp_world_size + ''' + tp_rank = 
self.tp_rank + tp_size = self.tp_world_size + ''' + ================= + End of MLU Hijack + ================= + ''' + input_dim = getattr(param, "input_dim", None) + use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + + # Special case for GGUF + is_gguf_weight = getattr(param, "is_gguf_weight", False) + is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) + if is_gguf_weight_type: + param.weight_type = loaded_weight.item() + + # Materialize GGUF UninitializedParameter + if is_gguf_weight and isinstance(param, UninitializedParameter): + weight_shape = list(loaded_weight.shape) + if input_dim: + weight_shape[input_dim] = weight_shape[input_dim] // tp_size + param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype) + + param_data = param.data + # bitsandbytes loads the weights of the specific portion + # no need to narrow here + if input_dim is not None and not use_bitsandbytes_4bit: + shard_size = param_data.shape[input_dim] + start_idx = tp_rank * shard_size + loaded_weight = loaded_weight.narrow(input_dim, start_idx, + shard_size) + + # Special case for loading scales off disk, which often do not + # have a shape (such as in the case of AutoFP8). + if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + +def vllm__model_executor__layers__linear__RowParallelLinear__forward( + self, + input_, + residual: Optional[torch.Tensor] = None +): + if self.input_is_parallel: + input_parallel = input_ + else: + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: modify get_tensor_model_parallel_rank() to self.tp_rank + ''' + tp_rank = self.tp_rank + ''' + ================= + End of MLU Hijack + ================= + ''' + splitted_input = split_tensor_along_last_dim( + input_, num_partitions=self.tp_size) + input_parallel = splitted_input[tp_rank].contiguous() + + # Matrix multiply. 
+ assert self.quant_method is not None + # Only fuse bias add into GEMM for rank 0 (this ensures that + # bias will not get added more than once in TP>1 case) + bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias + residual_ = None if self.tp_rank > 0 else residual + ''' + ===================================================== + Modify by custom vllm_mlu + ===================================================== + @brief: abandon original reduce if parallel_num is set + ''' + is_parallel_enable = hasattr(self.quant_method, 'parallel_num') and get_is_prompt() + ''' + ===================================================== + End of custom MLU Hijack + ===================================================== + ''' + output_parallel = self.quant_method.apply(self, + input_parallel, + bias=bias_, + residual=residual_) + ''' + ============================= + Modify by custom vllm_mlu + ============================= + @brief: when preload_size is set, call GroupCoordinator.all_reduce() directly and + use async_op to set all_reduce paralleled with preload + ''' + if self.reduce_results and self.tp_size > 1 and not is_parallel_enable: + if hasattr(self, 'preload_size') and self.preload_size > 0 and not self.is_prompt: + handle = get_tp_group(self.tp_group).all_reduce(output_parallel, async_op=True) + _MB = 1 << 20 + mlu_ops.preload(self.preloaded_weights[0].data, self.preload_size * _MB) + preloaded_weights_size = self.preloaded_weights[0].numel() * self.preloaded_weights[0].element_size() + if preloaded_weights_size < (self.preload_size * _MB) and len(self.preloaded_weights) > 1: + mlu_ops.preload(self.preloaded_weights[1].data, (self.preload_size * _MB) - preloaded_weights_size) + handle.wait() + output = output_parallel + else: + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: add tensor_model_parallel_all_reduce() with self.tp_group + ''' + output = tensor_model_parallel_all_reduce(output_parallel, 
tp_group=self.tp_group) + ''' + ================= + End of MLU Hijack + ================= + ''' + else: + output = output_parallel + ''' + ========================= + End of custom MLU Hijack + ========================= + ''' + output_bias = self.bias if self.skip_bias_add else None + + return output, output_bias + + +MluHijackObject.apply_hijack(LinearBase, + LinearBase.__init__, + vllm__model_executor__layers__linear__LinearBase____init__) +MluHijackObject.apply_hijack(ColumnParallelLinear, + ColumnParallelLinear.__init__, + vllm__model_executor__layers__linear__ColumnParallelLinear____init__) +MluHijackObject.apply_hijack(ColumnParallelLinear, + ColumnParallelLinear.weight_loader, + vllm__model_executor__layers__linear__ColumnParallelLinear__weight_loader) +MluHijackObject.apply_hijack(ColumnParallelLinear, + ColumnParallelLinear.forward, + vllm__model_executor__layers__linear__ColumnParallelLinear__forward) +MluHijackObject.apply_hijack(ColumnParallelLinear, + ColumnParallelLinear.extra_repr, + vllm__model_executor__layers__linear__ColumnParallelLinear__extra_repr) +MluHijackObject.apply_hijack(MergedColumnParallelLinear, + MergedColumnParallelLinear.__init__, + vllm__model_executor__layers__linear__MergedColumnParallelLinear____init__) +MluHijackObject.apply_hijack(MergedColumnParallelLinear, + MergedColumnParallelLinear.weight_loader, + vllm__model_executor__layers__linear__MergedColumnParallelLinear__weight_loader) +MluHijackObject.apply_hijack(MergedColumnParallelLinear, + MergedColumnParallelLinear.weight_loader_v2, + vllm__model_executor__layers__linear__MergedColumnParallelLinear__weight_loader_v2) +MluHijackObject.apply_hijack(RowParallelLinear, + RowParallelLinear.__init__, + vllm__model_executor__layers__linear__RowParallelLinear____init__) +MluHijackObject.apply_hijack(RowParallelLinear, + RowParallelLinear.weight_loader, + vllm__model_executor__layers__linear__RowParallelLinear__weight_loader) +MluHijackObject.apply_hijack(RowParallelLinear, + 
RowParallelLinear.forward, + vllm__model_executor__layers__linear__RowParallelLinear__forward) diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/model_executor/parameter.py b/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/model_executor/parameter.py new file mode 100644 index 0000000..669b479 --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/model_executor/parameter.py @@ -0,0 +1,173 @@ +from fractions import Fraction +from typing import Callable, Optional, Union, Any + +import torch +from torch.nn import Parameter +from vllm.model_executor.parameter import (BasevLLMParameter, + PackedColumnParameter, + PackedvLLMParameter, + PerTensorScaleParameter, + RowvLLMParameter, + _ColumnvLLMParameter) + +from vllm.distributed import get_tensor_model_parallel_rank +from vllm.logger import init_logger +from vllm_mlu.mlu_hijack_utils import MluHijackObject +from ..distributed.parallel_state import (get_parallel_rank_with_group, get_parallel_world_size_with_group) + +logger = init_logger(__name__) + + +def vllm__model_executor__parameter__BasevLLMParameter____init__(self, data: torch.Tensor, weight_loader: Callable, tp_group: Any = None): + """ + Initialize the BasevLLMParameter + + :param data: torch tensor with the parameter data + :param weight_loader: weight loader callable + + :returns: a torch.nn.parameter + """ + + self._weight_loader = weight_loader + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: add self.tp_group, world_size and tp_rank to support moe expert parallel + ''' + self.tp_group = tp_group + self.tp_world_size = get_parallel_world_size_with_group(self.tp_group) + self.tp_rank = get_parallel_rank_with_group(self.tp_group) + ''' + ================= + End of MLU Hijack + ================= + ''' + + +def vllm__model_executor__parameter___ColumnvLLMParameter__load_column_parallel_weight(self, loaded_weight: torch.Tensor): + ''' + 
============================= + Modify by vllm_mlu + ============================= + @brief: modify get_tensor_model_parallel_rank() to self.tp_rank + ''' + tp_rank = self.tp_rank + ''' + ================= + End of MLU Hijack + ================= + ''' + shard_size = self.data.shape[self.output_dim] + loaded_weight = loaded_weight.narrow(self.output_dim, + tp_rank * shard_size, shard_size) + assert self.data.shape == loaded_weight.shape + self.data.copy_(loaded_weight) + +def vllm__model_executor__parameter___ColumnvLLMParameter__load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs): + + shard_offset = kwargs.get("shard_offset") + shard_size = kwargs.get("shard_size") + if isinstance( + self, + (PackedColumnParameter, + PackedvLLMParameter)) and self.packed_dim == self.output_dim: + shard_size, shard_offset = self.adjust_shard_indexes_for_packing( + shard_offset=shard_offset, shard_size=shard_size) + + param_data = self.data + + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: modify get_tensor_model_parallel_rank() to self.tp_rank + ''' + tp_rank = self.tp_rank + ''' + ================= + End of MLU Hijack + ================= + ''' + param_data = param_data.narrow(self.output_dim, shard_offset, + shard_size) + loaded_weight = loaded_weight.narrow(self.output_dim, + tp_rank * shard_size, shard_size) + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + +def vllm__model_executor__parameter___ColumnvLLMParameter__load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): + shard_offset = kwargs.get("shard_offset") + shard_size = kwargs.get("shard_size") + shard_id = kwargs.get("shard_id") + num_heads = kwargs.get("num_heads") + + if isinstance( + self, + (PackedColumnParameter, + PackedvLLMParameter)) and self.output_dim == self.packed_dim: + shard_size, shard_offset = self.adjust_shard_indexes_for_packing( + shard_offset=shard_offset, shard_size=shard_size) + + 
param_data = self.data + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: modify get_tensor_model_parallel_rank() to self.tp_rank + ''' + tp_rank = self.tp_rank + ''' + ================= + End of MLU Hijack + ================= + ''' + shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads + param_data = param_data.narrow(self.output_dim, shard_offset, + shard_size) + loaded_weight = loaded_weight.narrow(self.output_dim, + shard_id * shard_size, shard_size) + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + +def vllm__model_executor__parameter__RowvLLMParameter__load_row_parallel_weight(self, loaded_weight: torch.Tensor): + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: modify get_tensor_model_parallel_rank() to self.tp_rank + ''' + tp_rank = self.tp_rank + ''' + ================= + End of MLU Hijack + ================= + ''' + shard_size = self.data.shape[self.input_dim] + loaded_weight = loaded_weight.narrow(self.input_dim, + tp_rank * shard_size, shard_size) + + if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + + assert self.data.shape == loaded_weight.shape + self.data.copy_(loaded_weight) + + +MluHijackObject.apply_hijack(BasevLLMParameter, + BasevLLMParameter.__init__, + vllm__model_executor__parameter__BasevLLMParameter____init__) +MluHijackObject.apply_hijack(_ColumnvLLMParameter, + _ColumnvLLMParameter.load_column_parallel_weight, + vllm__model_executor__parameter___ColumnvLLMParameter__load_column_parallel_weight) +MluHijackObject.apply_hijack(_ColumnvLLMParameter, + _ColumnvLLMParameter.load_merged_column_weight, + vllm__model_executor__parameter___ColumnvLLMParameter__load_merged_column_weight) +MluHijackObject.apply_hijack(_ColumnvLLMParameter, + _ColumnvLLMParameter.load_qkv_weight, + vllm__model_executor__parameter___ColumnvLLMParameter__load_qkv_weight) 
+MluHijackObject.apply_hijack(RowvLLMParameter, + RowvLLMParameter.load_row_parallel_weight, + vllm__model_executor__parameter__RowvLLMParameter__load_row_parallel_weight) diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/worker/__init__.py b/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/worker/__init__.py new file mode 100644 index 0000000..4907d3c --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/worker/__init__.py @@ -0,0 +1 @@ +from . import mlu_worker diff --git a/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/worker/mlu_worker.py b/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/worker/mlu_worker.py new file mode 100644 index 0000000..714754f --- /dev/null +++ b/vllm-v0.6.2/examples/cambricon_custom_func/vllm/mlu_hijack/worker/mlu_worker.py @@ -0,0 +1,192 @@ +import gc +import os +import torch +from typing import List, Optional, Set, Tuple, Type +from vllm.config import ParallelConfig +from vllm.distributed import init_distributed_environment, set_custom_all_reduce +from vllm.model_executor import set_random_seed +from vllm.worker.mlu_worker import MLUWorker, _check_if_gpu_supports_dtype +from vllm_mlu.worker.mlu_worker import MLUWorker_V2 +from vllm_mlu.mlu_hijack_utils import MluHijackObject +from ..distributed.parallel_state import ensure_model_parallel_initialized + +import functools +from collections import defaultdict +from vllm.logger import init_logger +from vllm_mlu.model_executor.layers.feed_forward import FeedForward +from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm_mlu.model_executor.layers.sparse_moe_mlp import SparseMoeMlp +from vllm.distributed import get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size +from ..distributed.parallel_state 
import (get_moe_tensor_parallel_rank, get_moe_tensor_parallel_world_size, + get_moe_expert_parallel_rank, get_moe_expert_parallel_world_size) + + +logger = init_logger(__name__) + + +def vllm__worker__mlu_worker__init_worker_distributed_environment( + parallel_config: ParallelConfig, + rank: int, + distributed_init_method: Optional[str] = None, + local_rank: int = -1, +) -> None: + set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) + + init_distributed_environment(parallel_config.world_size, rank, + distributed_init_method, local_rank, + backend='cncl') + + ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: add context_parallel_size, moe_tp_size, moe_ep_size + ''' + ensure_model_parallel_initialized(parallel_config=parallel_config) + ''' + ================== + End of MLU Hijack + ================== + ''' + +def vllm__worker__mlu_worker__MLUWorker__init_device(self) -> None: + if self.device_config.device.type == "mlu": + # torch.distributed.all_reduce does not free the input tensor until + # the synchronization point. This causes the memory usage to grow + # as the number of all_reduce calls increases. This env var disables + # this behavior. + # Related issue: + # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573 + os.environ["TORCH_CNCL_AVOID_RECORD_STREAMS"] = "1" + + # This env var set by Ray causes exceptions with graph building. + os.environ.pop("CNCL_ASYNC_ERROR_HANDLING", None) + self.device = torch.device(f"mlu:{self.local_rank}") + torch.mlu.set_device(self.device) + + _check_if_gpu_supports_dtype(self.model_config.dtype) + gc.collect() + torch.mlu.empty_cache() + self.init_gpu_memory = torch.mlu.mem_get_info()[0] + else: + raise RuntimeError( + f"Not support device type: {self.device_config.device}") + # Initialize the distributed environment. 
+ ''' + ============================= + Modify by vllm_mlu + ============================= + @brief: @brief: modify to vllm__worker__mlu_worker__init_worker_distributed_environment + ''' + vllm__worker__mlu_worker__init_worker_distributed_environment(self.parallel_config, self.rank, + self.distributed_init_method, self.local_rank) + ''' + ================== + End of MLU Hijack + ================== + ''' + # Set random seed. + set_random_seed(self.model_config.seed) + + +def default_act_range_value(): + return { + "x": None, + "split": None, + "is_linear": False, + "is_qkv": False, + "q_proj_size": 0, + "num_kv_head_replicas": 1, + "is_merge": False, + "input_id": [], + "self_rank": 0, + "rank": None, + "tensor_rank": None, + "tp_world_size": None, + "moe_tp_rank": None, + "moe_tp_world_size": None, + "moe_ep_rank": None, + "moe_ep_world_size": None, + "weight": None, + } + +def vllm_mlu__worker__mlu_worker__MLUWorker_V2__setup_smooth_hook(self, + is_save_input_id: bool = False, + is_save_moe_info: bool = False): + model = self.model_runner.model + self.act_range = defaultdict(default_act_range_value) + self.hooks = [] + linear_class_list = (ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) + other_class_list = (VocabParallelEmbedding, ParallelLMHead) + class_list = linear_class_list + other_class_list + row_class_list = (RowParallelLinear) + + for name, m in model.named_modules(): + if isinstance(m, FeedForward): + m.use_bt_ffn = False + if isinstance(m, SparseMoeMlp): + m.is_use_fused_moe = False + + if isinstance(m, class_list): + is_linear = True if isinstance(m, linear_class_list) else False + split_type = "row" if isinstance(m, row_class_list) else "col" + self.act_range[name]["split"] = split_type + self.act_range[name]["is_linear"] = is_linear + if isinstance(m, QKVParallelLinear): + self.act_range[name]["is_qkv"] = True + self.act_range[name]["q_proj_size"] = m.num_heads * m.head_size + 
self.act_range[name]["num_kv_head_replicas"] = m.num_kv_head_replicas + self.act_range[name]["is_merge"] = isinstance(m, MergedColumnParallelLinear) + if is_save_moe_info: + self.act_range[name]["rank"] = torch.distributed.get_rank() + self.act_range[name]["tensor_rank"] = get_tensor_model_parallel_rank() + self.act_range[name]["tp_world_size"] = get_tensor_model_parallel_world_size() + self.act_range[name]["moe_tp_rank"] = get_moe_tensor_parallel_rank() + self.act_range[name]["moe_tp_world_size"] = get_moe_tensor_parallel_world_size() + self.act_range[name]["moe_ep_rank"] = get_moe_expert_parallel_rank() + self.act_range[name]["moe_ep_world_size"] = get_moe_expert_parallel_world_size() + if ".expert." in name: + self.act_range[name]["weight"] = m.weight + logger.info(f"rank:{self.rank}, add hook to {name}, is_linear:{is_linear}, split_type:{split_type}") + self.hooks.append( + m.register_forward_hook( + functools.partial(self.stat_input_hook, + name=name, + act_range=self.act_range, + is_linear=is_linear, + is_save_input_id=is_save_input_id))) + + +def vllm_mlu__worker__mlu_worker__MLUWorker_V2__get_act_range(self): + act_range = defaultdict(default_act_range_value) + for layer_name, layer_range in self.act_range.items(): + for tensor_key, tensor_value in layer_range.items(): + if isinstance(tensor_value, torch.Tensor): + act_range[layer_name][tensor_key] = tensor_value.to("cpu") + elif tensor_key == "input_id" and isinstance(tensor_value, list): + input_id_len = len(tensor_value) + for i in range(input_id_len): + if isinstance(tensor_value[i], torch.Tensor): + act_range[layer_name][tensor_key].append(tensor_value[i].to("cpu")) + else: + act_range[layer_name][tensor_key].append(tensor_value[i]) + else: + act_range[layer_name][tensor_key] = tensor_value + + return act_range + + +MluHijackObject.apply_hijack(MLUWorker, + MLUWorker.init_device, + vllm__worker__mlu_worker__MLUWorker__init_device) +MluHijackObject.apply_hijack(MLUWorker, + "setup_smooth_hook", + 
vllm_mlu__worker__mlu_worker__MLUWorker_V2__setup_smooth_hook) +MluHijackObject.apply_hijack(MLUWorker, + "get_act_range", + vllm_mlu__worker__mlu_worker__MLUWorker_V2__get_act_range) diff --git a/vllm-v0.6.2/examples/cpu_offload.py b/vllm-v0.6.2/examples/cpu_offload.py new file mode 100644 index 0000000..b152e5b --- /dev/null +++ b/vllm-v0.6.2/examples/cpu_offload.py @@ -0,0 +1,22 @@ +from vllm import LLM, SamplingParams + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Create an LLM. +llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10) +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/vllm-v0.6.2/examples/florence2_inference.py b/vllm-v0.6.2/examples/florence2_inference.py new file mode 100644 index 0000000..b58ac2e --- /dev/null +++ b/vllm-v0.6.2/examples/florence2_inference.py @@ -0,0 +1,44 @@ +''' +Demonstrate prompting of text-to-text +encoder/decoder models, specifically Florence-2 +''' +# TODO(Isotr0py): +# Move to offline_inference_vision_language.py after porting vision backbone +from vllm import LLM, SamplingParams + +dtype = "float" + +# Create a Florence-2 encoder/decoder model instance +llm = LLM( + model="microsoft/Florence-2-base", + tokenizer="facebook/bart-base", + dtype=dtype, + trust_remote_code=True, +) + +prompts = [ + "", "", "", + "", "", "", + "", "", "" +] +# Create a sampling params object. 
+sampling_params = SamplingParams( + temperature=0, + top_p=1.0, + min_tokens=0, + max_tokens=20, +) + +# Generate output tokens from the prompts. The output is a list of +# RequestOutput objects that contain the prompt, generated +# text, and other information. +outputs = llm.generate(prompts, sampling_params) + +# Print the outputs. +for output in outputs: + prompt = output.prompt + encoder_prompt = output.encoder_prompt + generated_text = output.outputs[0].text + print(f"Encoder prompt: {encoder_prompt!r}, " + f"Decoder prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") diff --git a/vllm-v0.6.2/examples/fp8/README.md b/vllm-v0.6.2/examples/fp8/README.md new file mode 100644 index 0000000..181c365 --- /dev/null +++ b/vllm-v0.6.2/examples/fp8/README.md @@ -0,0 +1,96 @@ +# FP8 KV Cache + +This utility extracts the KV cache scaling factors from a quantized HF (Hugging Face) model. The extracted scaling factors are saved to a JSON file, which can later be used by vLLM (variable-length language model) during runtime. This tool is particularly useful when the KV cache data type is FP8 and is intended for use on ROCm (AMD GPU) platforms. + +## Prerequisites + +- Python 3.x +- PyTorch +- NumPy +- Hugging Face Transformers +- Hugging Face Hub +- AMMO + +Before incorporating the FP8 datatype for inference workloads, you must adhere to the following steps: +1. Install all necessary prerequisites and dependencies. +2. Convert HF model into a quantized HF model. +3. Extract KV Cache Scaling Factors from quantized HF model. +4. Load KV Cache Scaling Factors into VLLM. + +### 2. Convert HF model into a quantized HF model. +Note: The following steps are adapted from the [TensorRT-LLM repository](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/README.md). + +`quantize.py` (examples/fp8/quantizer/quantize.py) uses the quantization toolkit (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. 
Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format). + +The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found at `examples/fp8/quantizer/README.md`. + +### 3. Extract KV Cache Scaling Factors from quantized HF model. +`extract_scales.py` (examples/fp8/extract_scales.py) can be utilized to extract the KV cache scaling factors from your quantized HF model; however, at the moment this tool exclusively supports Llama 2 models. It is also important to note the following: +1. **File Structure**: The utility operates under the assumption that all parameters, including KV cache scaling factors, corresponding to a particular Tensor Parallelism (TP) rank are stored in a single file. These files must adhere to a specific naming convention where the TP rank is immediately identified after a specific keyword (e.g., "rank") in the filename. + +2. **TP Decomposition**: The utility assumes consistency between the TP decomposition employed by the quantizer tool and that used by vLLM. + +3. **AMMO Compatibility**: Currently, the generated KV cache scaling factors for AMMO remain uniform across all TP ranks. + +```python +# prerequisites: +# - Quantized HF LLaMa 2 model +python3 examples/fp8/extract_scales.py --help +Usage: extract_scales.py [-h] --quantized-model QUANTIZED_MODEL [--load_format {auto,safetensors,npz,pt}] [--output-dir OUTPUT_DIR] [--output-name OUTPUT_NAME] [--tp-size TP_SIZE] + +KV Scale Extraction Example + +Required arguments: +--quantized-model: Specify the local directory containing a single quantized HF model. It is expected that the quantization format is FP8_E4M3, for use on ROCm (AMD GPU). +Optional arguments: +--cache_dir: Specify a cache directory to use in the event of a HF model download. (Default: None) +--load_format: Specify the format of the model's tensor files containing the KV cache scaling factors. 
(Choices: auto, safetensors, npz, pt; Default: auto) +--revision: Specify the model's revision number. (Default: None) +--output-dir: Specify the output directory. By default the KV cache scaling factors will be saved in the model directory. (Default: None) +--output-name: Specify the output filename. (Default: kv_cache_scales.json) +--tp-size: Specify the tensor-parallel (TP) size that the quantized model should correspond to. If specified, during KV cache scaling factor extraction the observed TP size will be checked against this and an error will be raised if there is a mismatch. (Default: None) +``` +```python +Example: +python3 examples/fp8/extract_scales.py --quantized-model QUANTIZED_MODEL --tp-size TP_SIZE --output-dir OUTPUT_DIR +``` +### 4. Load KV Cache Scaling Factors into VLLM. +The `benchmarks/benchmark_throughput.py` script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for KV cache scaling factors to be utilized for FP8. 
+```python +# prerequisites: +# - LLaMa 2 kv_cache_scales.json file + +python3 benchmarks/benchmark_throughput.py --help +usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET] [--input-len INPUT_LEN] [--output-len OUTPUT_LEN] [--model MODEL] + [--tokenizer TOKENIZER] [--quantization {awq,gptq,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N] + [--use-beam-search] [--num-prompts NUM_PROMPTS] [--seed SEED] [--hf-max-batch-size HF_MAX_BATCH_SIZE] [--trust-remote-code] + [--max-model-len MAX_MODEL_LEN] [--dtype {auto,half,float16,bfloat16,float,float32}] [--enforce-eager] [--kv-cache-dtype {auto,fp8}] + [--quantization-param-path KV_CACHE_quantization_param_path] + +Benchmark Throughput Example +optional arguments: + -h, --help show this help message and exit + --backend {vllm,hf,mii} + --dataset DATASET Path to the dataset. + --input-len INPUT_LEN Input prompt length for each request + --output-len OUTPUT_LEN Output length for each request. Overrides the output length from the dataset. + --model MODEL + --tokenizer TOKENIZER + --quantization {awq,gptq,None}, -q {awq,gptq,None} + --tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE + --n N Number of generated sequences per prompt. + --use-beam-search + --num-prompts NUM_PROMPTS Number of prompts to process. + --seed SEED + --hf-max-batch-size HF_MAX_BATCH_SIZE Maximum batch size for HF backend. + --trust-remote-code trust remote code from huggingface + --max-model-len MAX_MODEL_LEN Maximum length of a sequence (including prompt and output). If None, will be derived from the model. + --dtype {auto,half,float16,bfloat16,float,float32} data type for model weights and activations. The "auto" option will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models. + --enforce-eager enforce eager execution + --kv-cache-dtype {auto,fp8} Data type for kv cache storage. If "auto", will use model data type. 
FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria. + --quantization-param-path QUANT_PARAM_JSON Path to the JSON file containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache scaling factors default to 1.0, which may cause accuracy issues. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria. +``` +```python +Example: +python3 benchmarks/benchmark_throughput.py --input-len INPUT_LEN --output-len OUTPUT_LEN -tp TENSOR_PARALLEL_SIZE --kv-cache-dtype fp8 --quantization-param-path QUANT_PARAM_JSON --model MODEL +``` diff --git a/vllm-v0.6.2/examples/fp8/extract_scales.py b/vllm-v0.6.2/examples/fp8/extract_scales.py new file mode 100644 index 0000000..1dce9d7 --- /dev/null +++ b/vllm-v0.6.2/examples/fp8/extract_scales.py @@ -0,0 +1,367 @@ +import argparse +import glob +import json +import os +from typing import Any, Callable, Dict, List, Optional, Tuple + +import numpy as np +import torch +from safetensors.torch import safe_open + +from vllm.model_executor.layers.quantization.schema import QuantParamSchema + + +# Adapted from vllm/model_executor/model_loader/weight_utils.py +# The main differences are that we add the NPZ format and simplify +# its functionality drastically for our purposes (e.g. we assume that +# the quantized model exists locally and there is no need to download it) +def _prepare_hf_weights( + quantized_model_dir: str, + load_format: str = "auto", + fall_back_to_pt: bool = True, +) -> Tuple[List[str], bool]: + if not os.path.isdir(quantized_model_dir): + raise FileNotFoundError( + f"The quantized model directory `{quantized_model_dir}` " + "does not exist.") + use_safetensors = False + # Some quantized models use .pt files for storing the weights. 
+ if load_format == "auto": + allow_patterns = ["*.safetensors", "*.bin"] + elif load_format == "safetensors": + use_safetensors = True + allow_patterns = ["*.safetensors"] + elif load_format == "pt": + allow_patterns = ["*.pt"] + elif load_format == "npz": + allow_patterns = ["*.npz"] + else: + raise ValueError(f"Unknown load_format: {load_format}") + if fall_back_to_pt: + allow_patterns += ["*.pt"] + + hf_weights_files: List[str] = [] + for pattern in allow_patterns: + hf_weights_files += glob.glob( + os.path.join(quantized_model_dir, pattern)) + if len(hf_weights_files) > 0: + if pattern == "*.safetensors": + use_safetensors = True + break + + if not use_safetensors: + # Exclude files that are not needed for inference. + # https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233 + blacklist = [ + "training_args.bin", + "optimizer.bin", + "optimizer.pt", + "scheduler.pt", + "scaler.pt", + ] + hf_weights_files = [ + f for f in hf_weights_files + if not any(f.endswith(x) for x in blacklist) + ] + + if len(hf_weights_files) == 0: + raise RuntimeError( + f"Cannot find any model weights with `{quantized_model_dir}`") + + return hf_weights_files, use_safetensors + + +# Adapted from vllm/model_executor/model_loader/weight_utils.py +def _hf_tensorfile_iterator(filename: str, load_format: str, + use_safetensors: bool): + if load_format == "npz": + assert not use_safetensors + with np.load(filename) as data: + for name in data.files: + param = torch.from_numpy(data[name]) + yield name, param + elif use_safetensors: + with safe_open(filename, framework="pt") as f: + for name in f.keys(): # NOQA: SIM118 + param = f.get_tensor(name) + yield name, param + else: + state = torch.load(filename, map_location="cpu") + for name, param in state.items(): + yield name, param + del state + torch.cuda.empty_cache() + + +def _kv_scales_extractor( + hf_tensor_files: List[str], + use_safetensors: bool, + rank_keyword: str = "rank", + expected_tp_size: 
Optional[int] = None) -> Dict[int, Dict[int, float]]: + """ + Given a list of files containing tensor data, attempt to extract KV cache + scales from these files. Intended as a helper function taking in the output + from _prepare_hf_weights. + Args: + rank_keyword Matches the number immediately after this keyword in the + tensor filename to determine the TP rank corresponding + to said tensor file + expected_tp_size If specified, the TP size of the tensor files is checked + against this and an error is raised if they don't match. + Returns a dictionary mapping TP ranks to their relevant KV cache scales. + The per-rank scales are themselves represented as a dictionary of layer + indices to the respective per-layer scale. + """ + for char in rank_keyword: + assert not char.isdecimal( + ), f"Rank keyword {rank_keyword} contains a numeric character!" + rank_scales_map: Dict[int, Dict[int, float]] = {} + for tensor_file in hf_tensor_files: + try: + rank_idx = tensor_file.find(rank_keyword) + if rank_idx != -1: + start_idx = rank_idx + len(rank_keyword) + stop_idx = start_idx + while stop_idx < len( + tensor_file) and tensor_file[stop_idx].isdecimal(): + stop_idx += 1 + if stop_idx == start_idx: + raise RuntimeError("Did not find rank # in filename.") + rank = int(tensor_file[start_idx:stop_idx]) + elif len(hf_tensor_files) == 1: + # Since there is only one tensor file, we can assume + # that it's intended for TP rank 0 + rank = 0 + else: + raise RuntimeError( + f"Filename does not contain '{rank_keyword}'.") + except RuntimeError: + print("Unable to determine TP rank " + f"corresponding to file '{tensor_file}'") + raise + + if rank not in rank_scales_map: + layer_scales_map: Dict[int, float] = {} + rank_scales_map[rank] = layer_scales_map + else: + raise RuntimeError( + f"Tensor file '{tensor_file}' shares TP rank {rank} " + "with another tensor file.") + + module_delimiter = ":" if args.load_format == "npz" else "." 
+ for name, param in _hf_tensorfile_iterator(tensor_file, + args.load_format, + use_safetensors): + if "kv_cache_scaling_factor" in name: + nums = [ + int(s) for s in name.split(module_delimiter) + if s.isdecimal() + ] + assert len( + nums) == 1, f"Could not determine layer idx for {name}" + layer_idx = nums[0] + assert layer_idx not in layer_scales_map, f"Duplicate scaling"\ + f" factor corresponding to layer {layer_idx}" + try: + layer_scales_map[layer_idx] = param.item() + except RuntimeError: + print( + "This utility supports only per-tensor scalar scales " + f"for now. The tensor\n {name} = {param} \nis an " + "invalid scale factor.") + raise + + if all( + len(layer_scales_map) == 0 + for layer_scales_map in rank_scales_map.values()): + # Note: this is true even if the rank_scales_map is empty + print("WARNING: No KV cache scale factors found. No output saved.") + return None + empirical_tp_world_size = max(rank_scales_map.keys()) + 1 + if expected_tp_size is not None: + assert expected_tp_size == empirical_tp_world_size, \ + f"User expected TP world size = {expected_tp_size} " \ + "from model but tool is expecting TP world size = " \ + f"{empirical_tp_world_size} from model instead." + for i in range(empirical_tp_world_size): + assert i in rank_scales_map, "Expected TP world size = "\ + f"{empirical_tp_world_size} but did not find KV " \ + f"cache scaling factors for TP rank {i}" + print(f"Found TP world size = {empirical_tp_world_size} " + "when extracting KV cache scales!") + return rank_scales_map + + +def _metadata_extractor(quantized_model_dir: str, + metadata_extract_fns: \ + Dict[str, Callable[[Dict[str, Any]], Any]]) \ + -> Dict[str, Any]: + """ + Given a directory containing quantized model files, this function + aims to extract metadata from the JSON files within this directory. + Each JSON file is expected to represent a dictionary in JSON + format (referred to as a "JSON-dictionary"). 
Metadata extraction is + defined by a dictionary called metadata_extract_fns, where each + metadata field name is mapped to an extraction function. + + These extraction functions are designed to take a JSON-dictionary + as their only argument and return the corresponding metadata. + While extraction functions are permitted to raise exceptions, they + should only raise a KeyError or ValueError if the metadata field + cannot be extracted from the current JSON-dictionary, yet there's + a possibility of finding it in another JSON-dictionary. + + The function returns a dictionary that maps metadata fields to + their extracted data. The keys of this dictionary correspond exactly + to those in metadata_extract_fns. If any fields fail to be extracted, + their corresponding values are set to None, and a warning is printed. + """ + if not os.path.isdir(quantized_model_dir): + raise FileNotFoundError( + f"The quantized model directory `{quantized_model_dir}` " + "does not exist.") + metadata_files = glob.glob(os.path.join(quantized_model_dir, "*.json")) + + result: Dict[str, Any] = {} + for file in metadata_files: + with open(file) as f: + try: + metadata = json.load(f) + except json.JSONDecodeError: + print(f"Could not parse `{file}` as a valid metadata file," + " skipping it.") + continue + if not isinstance(metadata, dict): + print(f"The file `{file}` does not correspond to a " + "JSON-serialized dictionary, skipping it.") + continue + for metadata_name, extract_fn in metadata_extract_fns.items(): + try: + metadata_info = extract_fn(metadata) + if metadata_name not in result: + result[metadata_name] = metadata_info + elif metadata_info != result[metadata_name]: + raise RuntimeError( + "Metadata mismatch! 
Originally found " + f"{metadata_name} = {result[metadata_name]} but " + f"now found {metadata_name} = {metadata_info} in " + f"`{file}`") + except KeyError: + # It is possible that a given file does not contain some + # of our selected metadata as it could be located in some + # other metadata file. + # 'EFINAE': extract_fn failure is not an error. + pass + except ValueError: + # See above. + pass + + # Warn if we cannot find any of the requested metadata + for metadata_name in metadata_extract_fns: + if metadata_name not in result: + print("WARNING: Unable to find requested metadata field " + f"`{metadata_name}`, setting it to None.") + result[metadata_name] = None + + return result + + +def main(args): + """Extract KV cache scaling factors from ``args.quantized_model`` and save them as JSON. + + Returns early, writing nothing, when no scaling factors are found. + """ + metadata_extract_fns = { + "model_type": lambda json_dict: json_dict["layers"][0]["decoder_type"], + "tp_size": lambda json_dict: int(json_dict["tensor_parallel"]), + "model_dtype": lambda json_dict: json_dict["dtype"] + } + recovered_metadata = _metadata_extractor(args.quantized_model, + metadata_extract_fns) + if args.tp_size is not None: + metadata_tp_size = recovered_metadata["tp_size"] + if metadata_tp_size is not None: + assert args.tp_size == metadata_tp_size, \ + f"User expected TP world size = {args.tp_size} " \ + f"but found TP world size = {metadata_tp_size} from metadata!" + expected_tp_size = args.tp_size or recovered_metadata["tp_size"] + rank_keyword = "rank" + hf_tensor_files, use_safetensors = _prepare_hf_weights( + args.quantized_model, args.load_format) + rank_scales_map = _kv_scales_extractor(hf_tensor_files, use_safetensors, + rank_keyword, expected_tp_size) + if rank_scales_map is None: + # The extractor found no KV cache scaling factors and has already + # printed its "No output saved" warning; stop here, because the + # post-processing below calls .items() on the result. 
+ return + # Postprocess: formatting to the current schema. Consider pulling it + # out into a dedicated function should it ever become more complicated. 
+ rank_scales_map = { + rank: {k: scale[k] + for k in sorted(scale.keys())} + for rank, scale in rank_scales_map.items() + } + # TODO: Expand this with activation and weights scaling factors when + # they are used in the future + schema = QuantParamSchema( + model_type=recovered_metadata["model_type"], + kv_cache={ + "dtype": ("float8_e4m3fn" if len(rank_scales_map) > 0 else + recovered_metadata["model_dtype"]), + "scaling_factor": + rank_scales_map + }, + ) + + if args.output_dir is None: + output_file = os.path.join(args.quantized_model, args.output_name) + else: + if not os.path.isdir(args.output_dir): + os.makedirs(args.output_dir, exist_ok=True) + output_file = os.path.join(args.output_dir, args.output_name) + + with open(output_file, 'w') as f: + f.write(schema.model_dump_json(indent=4)) + print(f"Completed! KV cache scaling factors saved to {output_file}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="This simple utility extracts the " + "KV cache scaling factors from a quantized HF model " + "and saves them to a JSON file compatible with later " + "use by vLLM (pass this file to the appropriate " + "runtime typically using the argument " + "--quantization-param-path ). This is only used " + "if the KV cache dtype is FP8 and on ROCm (AMD GPU).") + parser.add_argument( + "--quantized-model", + help="Specify the directory containing a single quantized HF model. " + "It is expected that the quantization format is FP8_E4M3, for use " + "on ROCm (AMD GPU).", + required=True) + parser.add_argument( + "--load_format", + help="Optionally specify the format of the model's tensor files " + "containing the KV cache scaling factors.", + choices=["auto", "safetensors", "npz", "pt"], + default="auto") + parser.add_argument( + "--output-dir", + help="Optionally specify the output directory. 
By default the " + "KV cache scaling factors will be saved in the model directory, " + "however you can override this behavior here.", + default=None) + parser.add_argument( + "--output-name", + help="Optionally specify the output filename.", + # TODO: Change this once additional scaling factors are enabled + default="kv_cache_scales.json") + parser.add_argument( + "--tp-size", + help="Optionally specify the tensor-parallel (TP) size that the " + "quantized model should correspond to. If specified, during KV " + "cache scaling factor extraction the observed TP size will be " + "checked against this and an error will be raised if there is " + "a mismatch. If not specified, the quantized model's expected " + "TP size is instead inferred from the largest TP rank observed. " + "The expected TP size is cross-checked against the TP ranks " + "observed in the quantized model and an error is raised if any " + "discrepancies are found.", + default=None, + type=int) + args = parser.parse_args() + + main(args) diff --git a/vllm-v0.6.2/examples/fp8/quantizer/README.md b/vllm-v0.6.2/examples/fp8/quantizer/README.md new file mode 100644 index 0000000..d0895e9 --- /dev/null +++ b/vllm-v0.6.2/examples/fp8/quantizer/README.md @@ -0,0 +1,32 @@ +### Quantizer Utilities +`quantize.py`: NVIDIA Quantization utilities using TensorRT-Model-Optimizer, ported +from TensorRT-LLM: [`examples/quantization/quantize.py`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py) + +### Prerequisite + +#### AMMO (AlgorithMic Model Optimization) Installation: nvidia-ammo 0.7.1 or later +`pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo` + +#### AMMO Download (code and docs) +`https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.5.0.tar.gz` +`https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.7.1.tar.gz` + +### Usage + +#### Run on H100 system for speed if FP8; number of GPUs 
depends on the model size + +#### Example: quantize Llama2-7b model from HF to FP8 with FP8 KV Cache: +`python quantize.py --model-dir ./ll2-7b --dtype float16 --qformat fp8 --kv-cache-dtype fp8 --output-dir ./ll2_7b_fp8 --calib-size 512 --tp-size 1` + +Outputs: model structure, quantized model & parameters (with scaling factors) are in JSON and Safetensors (npz is generated only for the reference) +``` +# ll ./ll2_7b_fp8/ +total 19998244 +drwxr-xr-x 2 root root 4096 Feb 7 01:08 ./ +drwxrwxr-x 8 1060 1061 4096 Feb 7 01:08 ../ +-rw-r--r-- 1 root root 176411 Feb 7 01:08 llama_tp1.json +-rw-r--r-- 1 root root 13477087480 Feb 7 01:09 llama_tp1_rank0.npz +-rw-r--r-- 1 root root 7000893272 Feb 7 01:08 rank0.safetensors +# +``` + diff --git a/vllm-v0.6.2/examples/fp8/quantizer/quantize.py b/vllm-v0.6.2/examples/fp8/quantizer/quantize.py new file mode 100644 index 0000000..d75cc8b --- /dev/null +++ b/vllm-v0.6.2/examples/fp8/quantizer/quantize.py @@ -0,0 +1,367 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # noqa: E501 +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Adapted from examples/quantization/hf_ptq.py +""" + +import argparse +import copy +import json +import random +import time + +import ammo.torch.quantization as atq +import numpy as np +import torch +from ammo.torch.export import export_model_config +from datasets import load_dataset +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoTokenizer + +RAND_SEED = 1234 +MAX_SEQ_LEN = 2048 + +EMPTY_CFG = { + "quant_cfg": { + "*weight_quantizer": { + "enable": False, + }, + "*input_quantizer": { + "enable": False + }, + "*lm_head*": { + "enable": False + }, + "*output_layer*": { + "enable": False + }, + "default": { + "enable": False + }, + }, + "algorithm": "max", +} + +KV_CACHE_CFG = { + "*.query_key_value.output_quantizer": { + "num_bits": 8, + "axis": None, + "enable": True + }, + "*.Wqkv.output_quantizer": { + "num_bits": 8, + "axis": None, + "enable": True + }, + "*.W_pack.output_quantizer": { + "num_bits": 8, + "axis": None, + "enable": True + }, + "*.c_attn.output_quantizer": { + "num_bits": 8, + "axis": None, + "enable": True + }, + "*.k_proj.output_quantizer": { + "num_bits": 8, + "axis": None, + "enable": True + }, + "*.v_proj.output_quantizer": { + "num_bits": 8, + "axis": None, + "enable": True + }, +} + +QUANT_CFG_CHOICES = { + "int8_sq": atq.INT8_SMOOTHQUANT_CFG, + "fp8": atq.FP8_DEFAULT_CFG, + "int4_awq": atq.INT4_AWQ_CFG, + "w4a8_awq": atq.W4A8_AWQ_BETA_CFG, + "int8_wo": EMPTY_CFG, + "int4_wo": EMPTY_CFG, + "full_prec": EMPTY_CFG, +} + +MODEL_NAME_PATTERN_MAP = { + "GPT2": "gpt2", + "Xverse": "llama", + "Llama": "llama", + "Mistral": "llama", + "GPTJ": "gptj", + "FalconForCausalLM": "falcon", + "RWForCausalLM": "falcon", + "baichuan": "baichuan", + "MPT": "mpt", + "Bloom": "bloom", + "ChatGLM": "chatglm", + "QWen": "qwen", +} + + +def get_tokenizer(ckpt_path, max_seq_len=MAX_SEQ_LEN, model_type=None): + print(f"Initializing tokenizer from {ckpt_path}") + tokenizer = AutoTokenizer.from_pretrained( + ckpt_path, 
+ model_max_length=max_seq_len, + padding_side="left", + trust_remote_code=True, + ) + if model_type and model_type == "qwen": + # qwen use token id 151643 as pad and eos tokens + tokenizer.pad_token = tokenizer.convert_ids_to_tokens(151643) + tokenizer.eos_token = tokenizer.convert_ids_to_tokens(151643) + + # can't set attribute 'pad_token' for "" + if tokenizer.pad_token != "": + tokenizer.pad_token = tokenizer.eos_token + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + assert (tokenizer.pad_token + is not None), f"Pad token for {model_type} cannot be set!" + + return tokenizer + + +def get_model(ckpt_path, dtype="fp16", device="cuda"): + print(f"Initializing model from {ckpt_path}") + if dtype == "bf16" or dtype == "bfloat16": + dtype = torch.bfloat16 + elif dtype == "fp16" or dtype == "float16": + dtype = torch.float16 + elif dtype == "fp32" or dtype == "float32": + dtype = torch.float32 + else: + raise NotImplementedError(f"Unknown dtype {dtype}") + + # model_kwargs = {"torch_dtype": dtype} + model_kwargs = {"torch_dtype": "auto"} + + model = AutoModelForCausalLM.from_pretrained(ckpt_path, + device_map="auto", + **model_kwargs, + trust_remote_code=True) + model.eval() + + model_dtype = next(model.parameters()).dtype + if dtype != model_dtype: + print("[TensorRT-LLM][WARNING] The manually set model data type is " + f"{dtype}, but the data type of the HuggingFace model is " + f"{model_dtype}.") + + return model + + +def get_model_type(model): + for k, v in MODEL_NAME_PATTERN_MAP.items(): + if k.lower() in type(model).__name__.lower(): + return v + return None + + +def get_calib_dataloader(data="cnn_dailymail", + tokenizer=None, + batch_size=1, + calib_size=512, + block_size=512, + device=None): + print("Loading calibration dataset") + if data == "pileval": + dataset = load_dataset( + "json", + data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", + split="train") + dataset = dataset["text"][:calib_size] + elif data == 
"cnn_dailymail": + dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train") + dataset = dataset["article"][:calib_size] + else: + raise NotImplementedError + + batch_encoded = tokenizer.batch_encode_plus(dataset, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=block_size) + if device: + batch_encoded = batch_encoded.to(device) + batch_encoded = batch_encoded["input_ids"] + + calib_dataloader = DataLoader(batch_encoded, + batch_size=batch_size, + shuffle=False) + + return calib_dataloader + + +def quantize_model(model, quant_cfg, calib_dataloader=None): + + def calibrate_loop(): + if calib_dataloader is None: + return + """Adjusts weights and scaling factors based on selected algorithms.""" + for idx, data in enumerate(calib_dataloader): + print(f"Calibrating batch {idx}") + model(data) + + print("Starting quantization...") + start_time = time.time() + atq.quantize(model, quant_cfg, forward_loop=calibrate_loop) + end_time = time.time() + print("Quantization done. Total time used: {:.2f} s.".format(end_time - + start_time)) + + return model + + +def main(args): + if not torch.cuda.is_available(): + raise OSError("GPU is required for inference.") + + random.seed(RAND_SEED) + np.random.seed(RAND_SEED) + + model = get_model(args.model_dir, args.dtype, args.device) + model_type = get_model_type(model) + tokenizer = get_tokenizer(args.model_dir, model_type=model_type) + + if args.qformat in ["full_prec", "int8_wo", "int4_wo" + ] and args.kv_cache_dtype is None: + print(f"No quantization applied, export {args.dtype} model") + else: + if "awq" in args.qformat: + if args.calib_size > 32: + print("AWQ calibration could take longer with calib_size = " + f"{args.calib_size}, Using calib_size=32 instead") + args.calib_size = 32 + print("\nAWQ calibration could take longer than other calibration " + "methods. Please increase the batch size to speed up the " + "calibration process. 
Batch size can be set by adding the " + "argument --batch_size to the command line.\n") + + calib_dataloader = get_calib_dataloader( + tokenizer=tokenizer, + batch_size=args.batch_size, + calib_size=args.calib_size, + device=args.device, + ) + + if args.qformat in QUANT_CFG_CHOICES: + quant_cfg = QUANT_CFG_CHOICES[args.qformat] + else: + raise ValueError( + f"Unsupported quantization format: {args.qformat}") + + if "awq" in args.qformat: + quant_cfg = copy.deepcopy(QUANT_CFG_CHOICES[args.qformat]) + weight_quantizer = quant_cfg["quant_cfg"][ + "*weight_quantizer"] # type: ignore + if isinstance(weight_quantizer, list): + weight_quantizer = weight_quantizer[0] + weight_quantizer["block_sizes"][-1] = args.awq_block_size + + if args.kv_cache_dtype is not None: + if args.kv_cache_dtype == "fp8": + for value in KV_CACHE_CFG.values(): + value.update({"num_bits": (4, 3)}) # type: ignore + quant_cfg["quant_cfg"].update(KV_CACHE_CFG) # type: ignore + + print(quant_cfg) + + model = quantize_model(model, quant_cfg, calib_dataloader) + + with torch.inference_mode(): + if model_type is None: + print(f"Unknown model type {type(model).__name__}. 
Continue " + "exporting...") + model_type = f"unknown:{type(model).__name__}" + + export_path = args.output_dir + start_time = time.time() + + if args.qformat == "int4_awq" and model_type == "qwen": + torch.save(model.state_dict(), export_path) + else: + export_npz = (model_type not in [ + 'gptj', 'falcon', 'chatglm', 'mpt', 'llama', 'baichuan' + ]) + + # export safetensors + export_model_config( + model, + model_type, + getattr(torch, args.dtype), + export_dir=export_path, + inference_tensor_parallel=args.tp_size, + inference_pipeline_parallel=args.pp_size, + # export_tensorrt_llm_config=(not export_npz), + export_tensorrt_llm_config=False, + export_npz=export_npz) + + # Workaround for wo quantization + if args.qformat in ["int8_wo", "int4_wo", "full_prec"]: + with open(f"{export_path}/config.json") as f: + tensorrt_llm_config = json.load(f) + if args.qformat == "int8_wo": + tensorrt_llm_config["quantization"]["quant_algo"] = 'W8A16' + elif args.qformat == "int4_wo": + tensorrt_llm_config["quantization"]["quant_algo"] = 'W4A16' + else: + tensorrt_llm_config["quantization"]["quant_algo"] = None + with open(f"{export_path}/config.json", "w") as f: + json.dump(tensorrt_llm_config, f, indent=4) + + end_time = time.time() + print("Quantized model exported to {} \nTotal time used {:.2f} s.". 
+ format(export_path, end_time - start_time)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--model-dir", + help="Specify where the HuggingFace model is", + required=True) + parser.add_argument("--device", default="cuda") + parser.add_argument("--dtype", help="Model data type.", default="float16") + parser.add_argument( + "--qformat", + help="Quantization format.", + default="full_prec", + choices=[ + "fp8", "int8_sq", "int4_awq", "w4a8_awq", "int8_wo", "int4_wo", + "full_prec" + ], + ) + parser.add_argument("--batch-size", + help="Batch size for calibration.", + type=int, + default=1) + parser.add_argument("--calib-size", + help="Number of samples for calibration.", + type=int, + default=512) + parser.add_argument("--output-dir", default="exported_model") + parser.add_argument("--tp-size", type=int, default=1) + parser.add_argument("--pp-size", type=int, default=1) + parser.add_argument("--awq-block-size", type=int, default=128) + parser.add_argument("--kv-cache-dtype", + help="KV Cache dtype.", + default=None, + choices=["int8", "fp8", None]) + args = parser.parse_args() + + main(args) diff --git a/vllm-v0.6.2/examples/gguf_inference.py b/vllm-v0.6.2/examples/gguf_inference.py new file mode 100644 index 0000000..09a5fcc --- /dev/null +++ b/vllm-v0.6.2/examples/gguf_inference.py @@ -0,0 +1,38 @@ +from huggingface_hub import hf_hub_download + +from vllm import LLM, SamplingParams + + +def run_gguf_inference(model_path): + PROMPT_TEMPLATE = "<|system|>\n{system_message}
\n<|user|>\n{prompt}\n<|assistant|>\n" # noqa: E501 + system_message = "You are a friendly chatbot who always responds in the style of a pirate." # noqa: E501 + # Sample prompts. + prompts = [ + "How many helicopters can a human eat in one sitting?", + "What's the future of AI?", + ] + prompts = [ + PROMPT_TEMPLATE.format(system_message=system_message, prompt=prompt) + for prompt in prompts + ] + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0, max_tokens=128) + + # Create an LLM. + llm = LLM(model=model_path, + tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + gpu_memory_utilization=0.95) + + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + +if __name__ == "__main__": + repo_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF" + filename = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf" + model = hf_hub_download(repo_id, filename=filename) + run_gguf_inference(model) diff --git a/vllm-v0.6.2/examples/gradio_openai_chatbot_webserver.py b/vllm-v0.6.2/examples/gradio_openai_chatbot_webserver.py new file mode 100644 index 0000000..8ceb8f6 --- /dev/null +++ b/vllm-v0.6.2/examples/gradio_openai_chatbot_webserver.py @@ -0,0 +1,82 @@ +import argparse + +import gradio as gr +from openai import OpenAI + +# Argument parser setup +parser = argparse.ArgumentParser( + description='Chatbot Interface with Customizable Parameters') +parser.add_argument('--model-url', + type=str, + default='http://localhost:8000/v1', + help='Model URL') +parser.add_argument('-m', + '--model', + type=str, + required=True, + help='Model name for the chatbot') +parser.add_argument('--temp', + type=float, + default=0.8, + help='Temperature for text generation') +parser.add_argument('--stop-token-ids', + type=str, + default='', + help='Comma-separated stop token IDs') +parser.add_argument("--host", 
type=str, default=None) +parser.add_argument("--port", type=int, default=8001) + +# Parse the arguments +args = parser.parse_args() + +# Set OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = args.model_url + +# Create an OpenAI client to interact with the API server +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + + +def predict(message, history): + # Convert chat history to OpenAI format + history_openai_format = [{ + "role": "system", + "content": "You are a great ai assistant." + }] + for human, assistant in history: + history_openai_format.append({"role": "user", "content": human}) + history_openai_format.append({ + "role": "assistant", + "content": assistant + }) + history_openai_format.append({"role": "user", "content": message}) + + # Create a chat completion request and send it to the API server + stream = client.chat.completions.create( + model=args.model, # Model name to use + messages=history_openai_format, # Chat history + temperature=args.temp, # Temperature for text generation + stream=True, # Stream response + extra_body={ + 'repetition_penalty': + 1, + 'stop_token_ids': [ + int(id.strip()) for id in args.stop_token_ids.split(',') + if id.strip() + ] if args.stop_token_ids else [] + }) + + # Read and return generated text from response stream + partial_message = "" + for chunk in stream: + partial_message += (chunk.choices[0].delta.content or "") + yield partial_message + + +# Create and launch a chat interface with Gradio +gr.ChatInterface(predict).queue().launch(server_name=args.host, + server_port=args.port, + share=True) diff --git a/vllm-v0.6.2/examples/gradio_webserver.py b/vllm-v0.6.2/examples/gradio_webserver.py new file mode 100644 index 0000000..54e9075 --- /dev/null +++ b/vllm-v0.6.2/examples/gradio_webserver.py @@ -0,0 +1,52 @@ +import argparse +import json + +import gradio as gr +import requests + + +def http_bot(prompt): + headers = {"User-Agent": "vLLM 
Client"} + pload = { + "prompt": prompt, + "stream": True, + "max_tokens": 128, + } + response = requests.post(args.model_url, + headers=headers, + json=pload, + stream=True) + + for chunk in response.iter_lines(chunk_size=8192, + decode_unicode=False, + delimiter=b"\0"): + if chunk: + data = json.loads(chunk.decode("utf-8")) + output = data["text"][0] + yield output + + +def build_demo(): + with gr.Blocks() as demo: + gr.Markdown("# vLLM text completion demo\n") + inputbox = gr.Textbox(label="Input", + placeholder="Enter text and press ENTER") + outputbox = gr.Textbox(label="Output", + placeholder="Generated result from the model") + inputbox.submit(http_bot, [inputbox], [outputbox]) + return demo + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default=None) + parser.add_argument("--port", type=int, default=8001) + parser.add_argument("--model-url", + type=str, + default="http://localhost:8000/generate") + args = parser.parse_args() + + demo = build_demo() + demo.queue().launch(server_name=args.host, + server_port=args.port, + share=True) diff --git a/vllm-v0.6.2/examples/llava_example.py b/vllm-v0.6.2/examples/llava_example.py new file mode 100644 index 0000000..4b971de --- /dev/null +++ b/vllm-v0.6.2/examples/llava_example.py @@ -0,0 +1,34 @@ +from vllm import LLM, SamplingParams +from PIL import Image +from dataclasses import dataclass +from typing import Literal + + +@dataclass(frozen=True) +class ImageAssetLocal: + name: Literal["stop_sign", "cherry_blossom"] + @property + def pil_image(self) -> Image.Image: + return Image.open(f"tools/ci/ci_files/{self.name}.jpg") + + +def run_llava(): + llm = LLM(model="/data/AE/llm/models/llava-1.5-7b-hf/") + sampling_params = SamplingParams(max_tokens=100) + + prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" + image = ImageAssetLocal("stop_sign").pil_image + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": { + "image": image 
+ }, + }, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +if __name__ == "__main__": + run_llava() diff --git a/vllm-v0.6.2/examples/llm_engine_example.py b/vllm-v0.6.2/examples/llm_engine_example.py new file mode 100644 index 0000000..60d894a --- /dev/null +++ b/vllm-v0.6.2/examples/llm_engine_example.py @@ -0,0 +1,60 @@ +import argparse +from typing import List, Tuple + +from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams +from vllm.utils import FlexibleArgumentParser + + +def create_test_prompts() -> List[Tuple[str, SamplingParams]]: + """Create a list of test prompts with their sampling parameters.""" + return [ + ("A robot may not injure a human being", + SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)), + ("To be or not to be,", + SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)), + ("What is the meaning of life?", + SamplingParams(n=2, + best_of=5, + temperature=0.8, + top_p=0.95, + frequency_penalty=0.1)), + ] + + +def process_requests(engine: LLMEngine, + test_prompts: List[Tuple[str, SamplingParams]]): + """Continuously process a list of prompts and handle the outputs.""" + request_id = 0 + + while test_prompts or engine.has_unfinished_requests(): + if test_prompts: + prompt, sampling_params = test_prompts.pop(0) + engine.add_request(str(request_id), prompt, sampling_params) + request_id += 1 + + request_outputs: List[RequestOutput] = engine.step() + + for request_output in request_outputs: + if request_output.finished: + print(request_output) + + +def initialize_engine(args: argparse.Namespace) -> LLMEngine: + """Initialize the LLMEngine from the command line arguments.""" + engine_args = EngineArgs.from_cli_args(args) + return LLMEngine.from_engine_args(engine_args) + + +def main(args: argparse.Namespace): + """Main function that sets up and runs the prompt processing.""" + engine = initialize_engine(args) + test_prompts = 
create_test_prompts() + process_requests(engine, test_prompts) + + +if __name__ == '__main__': + parser = FlexibleArgumentParser( + description='Demo on using the LLMEngine class directly') + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + main(args) diff --git a/vllm-v0.6.2/examples/logging_configuration.md b/vllm-v0.6.2/examples/logging_configuration.md new file mode 100644 index 0000000..0d278b0 --- /dev/null +++ b/vllm-v0.6.2/examples/logging_configuration.md @@ -0,0 +1,172 @@ +# Logging Configuration + +vLLM leverages Python's `logging.config.dictConfig` functionality to enable +robust and flexible configuration of the various loggers used by vLLM. + +vLLM offers two environment variables that can be used to accommodate a range +of logging configurations that range from simple-and-inflexible to +more-complex-and-more-flexible. + +- No vLLM logging (simple and inflexible) + - Set `VLLM_CONFIGURE_LOGGING=0` (leaving `VLLM_LOGGING_CONFIG_PATH` unset) +- vLLM's default logging configuration (simple and inflexible) + - Leave `VLLM_CONFIGURE_LOGGING` unset or set `VLLM_CONFIGURE_LOGGING=1` +- Fine-grained custom logging configuration (more complex, more flexible) + - Leave `VLLM_CONFIGURE_LOGGING` unset or set `VLLM_CONFIGURE_LOGGING=1` and + set `VLLM_LOGGING_CONFIG_PATH=<path-to-logging-config.json>` + + +## Logging Configuration Environment Variables + +### `VLLM_CONFIGURE_LOGGING` + +`VLLM_CONFIGURE_LOGGING` controls whether or not vLLM takes any action to +configure the loggers used by vLLM. This functionality is enabled by default, +but can be disabled by setting `VLLM_CONFIGURE_LOGGING=0` when running vLLM. + +If `VLLM_CONFIGURE_LOGGING` is enabled and no value is given for +`VLLM_LOGGING_CONFIG_PATH`, vLLM will use built-in default configuration to +configure the root vLLM logger. By default, no other vLLM loggers are +configured and, as such, all vLLM loggers defer to the root vLLM logger to make +all logging decisions. 
+ +If `VLLM_CONFIGURE_LOGGING` is disabled and a value is given for +`VLLM_LOGGING_CONFIG_PATH`, an error will occur while starting vLLM. + +### `VLLM_LOGGING_CONFIG_PATH` + +`VLLM_LOGGING_CONFIG_PATH` allows users to specify a path to a JSON file of +alternative, custom logging configuration that will be used instead of vLLM's +built-in default logging configuration. The logging configuration should be +provided in JSON format following the schema specified by Python's [logging +configuration dictionary +schema](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details). + +If `VLLM_LOGGING_CONFIG_PATH` is specified, but `VLLM_CONFIGURE_LOGGING` is +disabled, an error will occur while starting vLLM. + + +## Examples + +### Example 1: Customize vLLM root logger + +For this example, we will customize the vLLM root logger to use +[`python-json-logger`](https://github.com/madzak/python-json-logger) to log to +STDOUT of the console in JSON format with a log level of `INFO`. 
+ +To begin, first, create an appropriate JSON logging configuration file: + +**/path/to/logging_config.json:** + +```json +{ + "formatters": { + "json": { + "class": "pythonjsonlogger.jsonlogger.JsonFormatter" + } + }, + "handlers": { + "console": { + "class" : "logging.StreamHandler", + "formatter": "json", + "level": "INFO", + "stream": "ext://sys.stdout" + } + }, + "loggers": { + "vllm": { + "handlers": ["console"], + "level": "INFO", + "propagate": false + } + }, + "version": 1 +} +``` + +Next, install the `python-json-logger` package if it's not already installed: + +```bash +pip install python-json-logger +``` + +Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set +to the path of the custom logging configuration JSON file: + +```bash +VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \ + vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 +``` + + +### Example 2: Silence a particular vLLM logger + +To silence a particular vLLM logger, it is necessary to provide custom logging +configuration for the target logger that configures the logger so that it won't +propagate its log messages to the root vLLM logger. + +When custom configuration is provided for any logger, it is also necessary to +provide configuration for the root vLLM logger since any custom logger +configuration overrides the built-in default logging configuration used by vLLM. 
+ +First, create an appropriate JSON logging configuration file that includes +configuration for the root vLLM logger and for the logger you wish to silence: + +**/path/to/logging_config.json:** + +```json +{ + "formatters": { + "vllm": { + "class": "vllm.logging.NewLineFormatter", + "datefmt": "%m-%d %H:%M:%S", + "format": "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s" + } + }, + "handlers": { + "vllm": { + "class" : "logging.StreamHandler", + "formatter": "vllm", + "level": "INFO", + "stream": "ext://sys.stdout" + } + }, + "loggers": { + "vllm": { + "handlers": ["vllm"], + "level": "DEBUG", + "propagate": false + }, + "vllm.example_noisy_logger": { + "propagate": false + } + }, + "version": 1 +} +``` + +Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set +to the path of the custom logging configuration JSON file: + +```bash +VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \ + vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 +``` + + +### Example 3: Disable vLLM default logging configuration + +To disable vLLM's default logging configuration and silence all vLLM loggers, +simply set `VLLM_CONFIGURE_LOGGING=0` when running vLLM. This will prevent vLLM +from configuring the root vLLM logger, which in turn, silences all other vLLM +loggers. + +```bash +VLLM_CONFIGURE_LOGGING=0 \ + vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 +``` + + +## Additional resources + +- [`logging.config` Dictionary Schema Details](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details) diff --git a/vllm-v0.6.2/examples/lora_with_quantization_inference.py b/vllm-v0.6.2/examples/lora_with_quantization_inference.py new file mode 100644 index 0000000..0c454ea --- /dev/null +++ b/vllm-v0.6.2/examples/lora_with_quantization_inference.py @@ -0,0 +1,134 @@ +""" +This example shows how to use LoRA with different quantization techniques +for offline inference. 
+ +Requires HuggingFace credentials for access. +""" + +import gc +from typing import List, Optional, Tuple + +import torch +from huggingface_hub import snapshot_download + +from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams +from vllm.lora.request import LoRARequest + + +def create_test_prompts( + lora_path: str +) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]: + return [ + # this is an example of using quantization without LoRA + ("My name is", + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128), None), + # the next three examples use quantization with LoRA + ("my name is", + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128), + LoRARequest("lora-test-1", 1, lora_path)), + ("The capital of USA is", + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128), + LoRARequest("lora-test-2", 1, lora_path)), + ("The capital of France is", + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128), + LoRARequest("lora-test-3", 1, lora_path)), + ] + + +def process_requests(engine: LLMEngine, + test_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]]): + """Continuously process a list of prompts and handle the outputs.""" + request_id = 0 + + while test_prompts or engine.has_unfinished_requests(): + if test_prompts: + prompt, sampling_params, lora_request = test_prompts.pop(0) + engine.add_request(str(request_id), + prompt, + sampling_params, + lora_request=lora_request) + request_id += 1 + + request_outputs: List[RequestOutput] = engine.step() + for request_output in request_outputs: + if request_output.finished: + print("----------------------------------------------------") + print(f"Prompt: {request_output.prompt}") + print(f"Output: {request_output.outputs[0].text}") + + +def initialize_engine(model: str, quantization: str, + lora_repo: Optional[str]) -> LLMEngine: + """Initialize the LLMEngine.""" 
+ + if quantization == "bitsandbytes": + # QLoRA (https://arxiv.org/abs/2305.14314) is a quantization technique. + # It quantizes the model when loading, with some config info from the + # LoRA adapter repo. So need to set the parameter of load_format and + # qlora_adapter_name_or_path as below. + engine_args = EngineArgs(model=model, + quantization=quantization, + qlora_adapter_name_or_path=lora_repo, + load_format="bitsandbytes", + enable_lora=True, + max_lora_rank=64) + else: + engine_args = EngineArgs(model=model, + quantization=quantization, + enable_lora=True, + max_loras=4) + return LLMEngine.from_engine_args(engine_args) + + +def main(): + """Main function that sets up and runs the prompt processing.""" + + test_configs = [{ + "name": "qlora_inference_example", + 'model': "huggyllama/llama-7b", + 'quantization': "bitsandbytes", + 'lora_repo': 'timdettmers/qlora-flan-7b' + }, { + "name": "AWQ_inference_with_lora_example", + 'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ', + 'quantization': "awq", + 'lora_repo': 'jashing/tinyllama-colorist-lora' + }, { + "name": "GPTQ_inference_with_lora_example", + 'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ', + 'quantization': "gptq", + 'lora_repo': 'jashing/tinyllama-colorist-lora' + }] + + for test_config in test_configs: + print( + f"~~~~~~~~~~~~~~~~ Running: {test_config['name']} ~~~~~~~~~~~~~~~~" + ) + engine = initialize_engine(test_config['model'], + test_config['quantization'], + test_config['lora_repo']) + lora_path = snapshot_download(repo_id=test_config['lora_repo']) + test_prompts = create_test_prompts(lora_path) + process_requests(engine, test_prompts) + + # Clean up the GPU memory for the next test + del engine + gc.collect() + torch.cuda.empty_cache() + + +if __name__ == '__main__': + main() diff --git a/vllm-v0.6.2/examples/multilora_inference.py b/vllm-v0.6.2/examples/multilora_inference.py new file mode 100644 index 0000000..d88ea97 --- /dev/null +++ b/vllm-v0.6.2/examples/multilora_inference.py @@ 
-0,0 +1,106 @@ +""" +This example shows how to use the multi-LoRA functionality +for offline inference. + +Requires HuggingFace credentials for access to Llama2. +""" + +from typing import List, Optional, Tuple + +from huggingface_hub import snapshot_download + +from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams +from vllm.lora.request import LoRARequest + + +def create_test_prompts( + lora_path: str +) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]: + """Create a list of test prompts with their sampling parameters. + + 2 requests for base model, 4 requests for the LoRA. We define 2 + different LoRA adapters (using the same model for demo purposes). + Since we also set `max_loras=1`, the expectation is that the requests + with the second LoRA adapter will be ran after all requests with the + first adapter have finished. + """ + return [ + ("A robot may not injure a human being", + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128), None), + ("To be or not to be,", + SamplingParams(temperature=0.8, + top_k=5, + presence_penalty=0.2, + max_tokens=128), None), + ( + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128, + stop_token_ids=[32003]), + LoRARequest("sql-lora", 1, lora_path)), + ( + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128, + stop_token_ids=[32003]), + LoRARequest("sql-lora2", 2, lora_path)), + ] + + +def process_requests(engine: 
LLMEngine, + test_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]]): + """Continuously process a list of prompts and handle the outputs.""" + request_id = 0 + + while test_prompts or engine.has_unfinished_requests(): + if test_prompts: + prompt, sampling_params, lora_request = test_prompts.pop(0) + engine.add_request(str(request_id), + prompt, + sampling_params, + lora_request=lora_request) + request_id += 1 + + request_outputs: List[RequestOutput] = engine.step() + + for request_output in request_outputs: + if request_output.finished: + print(request_output) + + +def initialize_engine() -> LLMEngine: + """Initialize the LLMEngine.""" + # max_loras: controls the number of LoRAs that can be used in the same + # batch. Larger numbers will cause higher memory usage, as each LoRA + # slot requires its own preallocated tensor. + # max_lora_rank: controls the maximum supported rank of all LoRAs. Larger + # numbers will cause higher memory usage. If you know that all LoRAs will + # use the same rank, it is recommended to set this as low as possible. + # max_cpu_loras: controls the size of the CPU LoRA cache. 
+ engine_args = EngineArgs(model="/data/AE/llm/models/Llama-2-7b-hf", + enable_lora=True, + max_loras=1, + max_lora_rank=8, + max_cpu_loras=2, + max_num_seqs=256) + return LLMEngine.from_engine_args(engine_args) + + +def main(): + """Main function that sets up and runs the prompt processing.""" + engine = initialize_engine() + lora_path = "/data/vllm/vLLM_ut_hf_models/yard1/llama-2-7b-sql-lora-test" + test_prompts = create_test_prompts(lora_path) + process_requests(engine, test_prompts) + + +if __name__ == '__main__': + main() diff --git a/vllm-v0.6.2/examples/offline_chat_with_tools.py b/vllm-v0.6.2/examples/offline_chat_with_tools.py new file mode 100644 index 0000000..e69a6c0 --- /dev/null +++ b/vllm-v0.6.2/examples/offline_chat_with_tools.py @@ -0,0 +1,138 @@ +# ruff: noqa +import json +import random +import string + +from vllm import LLM +from vllm.sampling_params import SamplingParams + +# This script is an offline demo for function calling +# +# If you want to run a server/client setup, please follow this code: +# +# - Server: +# +# ```bash +# vllm serve mistralai/Mistral-7B-Instruct-v0.3 --tokenizer-mode mistral --load-format mistral --config-format mistral +# ``` +# +# - Client: +# +# ```bash +# curl --location 'http://<your-node-url>:8000/v1/chat/completions' \ +# --header 'Content-Type: application/json' \ +# --header 'Authorization: Bearer token' \ +# --data '{ +# "model": "mistralai/Mistral-7B-Instruct-v0.3", +# "messages": [ +# { +# "role": "user", +# "content": [ +# {"type" : "text", "text": "Describe this image in detail please."}, +# {"type": "image_url", "image_url": {"url": "https://s3.amazonaws.com/cms.ipressroom.com/338/files/201808/5b894ee1a138352221103195_A680%7Ejogging-edit/A680%7Ejogging-edit_hero.jpg"}}, +# {"type" : "text", "text": "and this one as well. 
Answer in French."}, +# {"type": "image_url", "image_url": {"url": "https://www.wolframcloud.com/obj/resourcesystem/images/a0e/a0ee3983-46c6-4c92-b85d-059044639928/6af8cfb971db031b.png"}} +# ] +# } +# ] +# }' +# ``` +# +# Usage: +# python demo.py simple +# python demo.py advanced + +model_name = "mistralai/Mistral-7B-Instruct-v0.3" +# or switch to "mistralai/Mistral-Nemo-Instruct-2407" +# or "mistralai/Mistral-Large-Instruct-2407" +# or any other mistral model with function calling ability + +sampling_params = SamplingParams(max_tokens=8192, temperature=0.0) +llm = LLM(model=model_name, + tokenizer_mode="mistral", + config_format="mistral", + load_format="mistral") + + +def generate_random_id(length=9): + characters = string.ascii_letters + string.digits + random_id = ''.join(random.choice(characters) for _ in range(length)) + return random_id + + +# simulate an API that can be called +def get_current_weather(city: str, state: str, unit: 'str'): + return (f"The weather in {city}, {state} is 85 degrees {unit}. It is " + "partly cloudly, with highs in the 90's.") + + +tool_funtions = {"get_current_weather": get_current_weather} + +tools = [{ + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": + "string", + "description": + "The city to find the weather for, e.g. 'San Francisco'" + }, + "state": { + "type": + "string", + "description": + "the two-letter abbreviation for the state that the city is" + " in, e.g. 'CA' which would mean 'California'" + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"] + } + }, + "required": ["city", "state", "unit"] + } + } +}] + +messages = [{ + "role": + "user", + "content": + "Can you tell me what the temperate will be in Dallas, in fahrenheit?" 
+}] + +outputs = llm.chat(messages, sampling_params=sampling_params, tools=tools) +output = outputs[0].outputs[0].text.strip() + +# append the assistant message +messages.append({ + "role": "assistant", + "content": output, +}) + +# let's now actually parse and execute the model's output simulating an API call by using the +# above defined function +tool_calls = json.loads(output) +tool_answers = [ + tool_funtions[call['name']](**call['arguments']) for call in tool_calls +] + +# append the answer as a tool message and let the LLM give you an answer +messages.append({ + "role": "tool", + "content": "\n\n".join(tool_answers), + "tool_call_id": generate_random_id(), +}) + +outputs = llm.chat(messages, sampling_params, tools=tools) + +print(outputs[0].outputs[0].text.strip()) +# yields +# 'The weather in Dallas, TX is 85 degrees fahrenheit. ' +# 'It is partly cloudly, with highs in the 90's.' diff --git a/vllm-v0.6.2/examples/offline_inference.py b/vllm-v0.6.2/examples/offline_inference.py new file mode 100644 index 0000000..d855a28 --- /dev/null +++ b/vllm-v0.6.2/examples/offline_inference.py @@ -0,0 +1,22 @@ +from vllm import LLM, SamplingParams + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Create an LLM. +llm = LLM(model="/data/AE/llm/models/Llama-2-7b-hf", enforce_eager=True, dtype='float16') +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. 
from vllm import LLM, SamplingParams

# Prompts to complete.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# One sampling configuration shared by every prompt.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Build the engine for the Arctic checkpoint.
llm = LLM(
    model="snowflake/snowflake-arctic-instruct",
    quantization="deepspeedfp",
    tensor_parallel_size=8,
    trust_remote_code=True,
)

# generate() returns a list of RequestOutput objects carrying the prompt,
# the generated text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Show each prompt next to its first completion.
for request_output in outputs:
    prompt = request_output.prompt
    generated_text = request_output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
def main(args):
    """Run the audio-language demo described by the parsed CLI arguments.

    Args:
        args: Namespace providing ``model_type`` (a key of
            ``model_example_map``), ``num_audios`` (a key of
            ``question_per_audio_count``) and ``num_prompts`` (how many
            copies of the prompt to submit).

    Raises:
        ValueError: If ``num_prompts`` is not positive, or ``model_type`` is
            not a key of ``model_example_map``.
    """
    # Validate the cheap argument first: the model runner below builds the
    # LLM, so a bad value should fail before that happens. An explicit raise
    # is used because ``assert`` is stripped under ``python -O``.
    if args.num_prompts <= 0:
        raise ValueError(
            f"--num-prompts must be positive, got {args.num_prompts}.")

    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    audio_count = args.num_audios
    llm, prompt, stop_token_ids = model_example_map[model](
        question_per_audio_count[audio_count], audio_count)

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=stop_token_ids)

    # Attach the first ``audio_count`` assets; stays empty for text-only runs.
    mm_data = {}
    if audio_count > 0:
        mm_data = {
            "audio": [
                asset.audio_and_sample_rate
                for asset in audio_assets[:audio_count]
            ]
        }

    inputs = {"prompt": prompt, "multi_modal_data": mm_data}
    if args.num_prompts > 1:
        # Batch inference
        inputs = [inputs] * args.num_prompts

    outputs = llm.generate(inputs, sampling_params=sampling_params)

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
def print_outputs(outputs):
    """Print every prompt with its first completion, then a divider line.

    Args:
        outputs: Iterable of results exposing ``prompt`` and
            ``outputs[0].text``.
    """
    divider = "-" * 80
    for request_output in outputs:
        shown_prompt = request_output.prompt
        first_text = request_output.outputs[0].text
        print(f"Prompt: {shown_prompt!r}, "
              f"Generated text: {first_text!r}")
    print(divider)
class LLMPredictor:
    """Callable that holds one LLM instance and completes batches of text."""

    def __init__(self):
        # Create an LLM.
        self.llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
                       tensor_parallel_size=tensor_parallel_size)

    def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]:
        """Generate completions for ``batch["text"]``.

        Returns a dict with two parallel lists: ``"prompt"`` and
        ``"generated_text"``, where the texts of all completions for one
        prompt are joined with single spaces.
        """
        # generate() yields RequestOutput objects that contain the prompt,
        # generated text, and other information.
        results = self.llm.generate(batch["text"], sampling_params)
        pairs = [(result.prompt,
                  ' '.join(candidate.text for candidate in result.outputs))
                 for result in results]
        return {
            "prompt": [prompt for prompt, _ in pairs],
            "generated_text": [text for _, text in pairs],
        }
from vllm import LLM

# Prompts to embed.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Build the embedding model.
model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True)

# encode() returns a list of EmbeddingRequestOutputs.
outputs = model.encode(prompts)

# Print one embedding per prompt.
for embedding_output in outputs:
    vector = embedding_output.outputs.embedding  # list of 4096 floats
    print(vector)
+ +enc_dec_prompt1 = ExplicitEncoderDecoderPrompt( + # Pass encoder prompt string directly, & + # pass decoder prompt tokens + encoder_prompt=single_text_prompt_raw, + decoder_prompt=single_tokens_prompt, +) +enc_dec_prompt2 = ExplicitEncoderDecoderPrompt( + # Pass TextPrompt to encoder, and + # pass decoder prompt string directly + encoder_prompt=single_text_prompt, + decoder_prompt=single_text_prompt_raw, +) +enc_dec_prompt3 = ExplicitEncoderDecoderPrompt( + # Pass encoder prompt tokens directly, and + # pass TextPrompt to decoder + encoder_prompt=single_tokens_prompt, + decoder_prompt=single_text_prompt, +) + +# - Finally, here's a useful helper function for zipping encoder and +# decoder prompts together into a list of ExplicitEncoderDecoderPrompt +# instances +zipped_prompt_list = zip_enc_dec_prompts( + ['An encoder prompt', 'Another encoder prompt'], + ['A decoder prompt', 'Another decoder prompt']) + +# - Let's put all of the above example prompts together into one list +# which we will pass to the encoder/decoder LLM. +prompts = [ + single_text_prompt_raw, single_text_prompt, single_tokens_prompt, + enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3 +] + zipped_prompt_list + +print(prompts) + +# Create a sampling params object. +sampling_params = SamplingParams( + temperature=0, + top_p=1.0, + min_tokens=0, + max_tokens=20, +) + +# Generate output tokens from the prompts. The output is a list of +# RequestOutput objects that contain the prompt, generated +# text, and other information. +outputs = llm.generate(prompts, sampling_params) + +# Print the outputs. 
def time_generation(llm: "LLM", prompts: List[str],
                    sampling_params: "SamplingParams"):
    """Print the mean seconds per generated token, then every completion.

    ``llm.generate`` is called twice as warmup and once more under the timer.
    The timed duration is divided by the total number of token ids in the
    first completion of each output.

    Args:
        llm: Engine exposing ``generate(prompts, sampling_params)``.
        prompts: Prompts passed to every ``generate`` call.
        sampling_params: Sampling configuration passed to every call.
    """
    # Warmup first
    llm.generate(prompts, sampling_params)
    llm.generate(prompts, sampling_params)

    # perf_counter() is monotonic and high resolution; time.time() follows
    # the wall clock and can jump if the clock is adjusted mid-run.
    start = time.perf_counter()
    outputs = llm.generate(prompts, sampling_params)
    elapsed = time.perf_counter() - start

    num_tokens = sum(len(o.outputs[0].token_ids) for o in outputs)
    if num_tokens:
        print(elapsed / num_tokens)
    else:
        # A per-token latency is undefined when nothing was generated;
        # report that instead of raising ZeroDivisionError.
        print(f"No tokens generated in {elapsed:.3f}s")

    # Print the outputs.
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"text: {generated_text!r}")
import os

from vllm import LLM, SamplingParams

# Bucket sizes shared by both environment variables below.
BUCKETS = "128,512,1024,2048"
# The max_model_len and block_size arguments are required to be same as
# max sequence length when targeting neuron device.
MAX_SEQ_LEN = 2048

# creates XLA hlo graphs for all the context length buckets.
os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = BUCKETS
# creates XLA hlo graphs for all the token gen buckets.
os.environ['NEURON_TOKEN_GEN_BUCKETS'] = BUCKETS

# Prompts to complete.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# One sampling configuration shared by every prompt.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Build the engine.
llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_num_seqs=8,
    # Currently, this is a known limitation in continuous batching support
    # in transformers-neuronx.
    # TODO(liangfu): Support paged-attention in transformers-neuronx.
    max_model_len=MAX_SEQ_LEN,
    block_size=MAX_SEQ_LEN,
    # The device can be automatically detected when AWS Neuron SDK is
    # installed. The device argument can be either unspecified for automated
    # detection, or explicitly assigned.
    device="neuron",
    tensor_parallel_size=2,
)

# generate() returns a list of RequestOutput objects carrying the prompt,
# the generated text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Show each prompt next to its first completion.
for request_output in outputs:
    prompt = request_output.prompt
    generated_text = request_output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+ # The device argument can be either unspecified for automated detection, + # or explicitly assigned. + device="neuron", + quantization="neuron_quant", + override_neuron_config={ + "cast_logits_dtype": "bfloat16", + }, + tensor_parallel_size=2) +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/vllm-v0.6.2/examples/offline_inference_openai.md b/vllm-v0.6.2/examples/offline_inference_openai.md new file mode 100644 index 0000000..4c64197 --- /dev/null +++ b/vllm-v0.6.2/examples/offline_inference_openai.md @@ -0,0 +1,205 @@ +# Offline Inference with the OpenAI Batch file format + + **NOTE:** This is a guide to performing batch inference using the OpenAI batch file format, **NOT** the complete Batch (REST) API. + + ## File Format + + The OpenAI batch file format consists of a series of json objects on new lines. + + [See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/openai_example_batch.jsonl) + + Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. + + **NOTE:** We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints (completions coming soon). + + ## Pre-requisites + +* Ensure you are using `vllm >= 0.4.3`. You can check by running `python -c "import vllm; print(vllm.__version__)"`. +* The examples in this document use `meta-llama/Meta-Llama-3-8B-Instruct`. + - Create a [user access token](https://huggingface.co/docs/hub/en/security-tokens) + - Install the token on your machine (Run `huggingface-cli login`). 
+ - Get access to the gated model by [visiting the model card](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and agreeing to the terms and conditions. + + + ## Example 1: Running with a local file + + ### Step 1: Create your batch file + + To follow along with this example, you can download the example batch, or create your own batch file in your working directory. + + ``` + wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl + ``` + + Once you've created your batch file it should look like this + + ``` + $ cat openai_example_batch.jsonl +{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} + ``` + + ### Step 2: Run the batch + +The batch running tool is designed to be used from the command line. + +You can run the batch with the following command, which will write its results to a file called `results.jsonl` + +``` +python -m vllm.entrypoints.openai.run_batch -i openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +``` + +### Step 3: Check your results + +You should now have your results at `results.jsonl`. 
You can check your results by running `cat results.jsonl` + +``` +$ cat results.jsonl +{"id":"vllm-383d1c59835645aeb2e07d004d62a826","custom_id":"request-1","response":{"id":"cmpl-61c020e54b964d5a98fa7527bfcdd378","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"Hello! It's great to meet you! I'm here to help with any questions or tasks you may have. What's on your mind today?"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":25,"total_tokens":56,"completion_tokens":31}},"error":null} +{"id":"vllm-42e3d09b14b04568afa3f1797751a267","custom_id":"request-2","response":{"id":"cmpl-f44d049f6b3a42d4b2d7850bb1e31bcc","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"*silence*"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":27,"total_tokens":32,"completion_tokens":5}},"error":null} +``` + +## Example 2: Using remote files + +The batch runner supports remote input and output urls that are accessible via http/https. + +For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl`, you can run + +``` +python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +``` + +## Example 3: Integrating with AWS S3 + +To integrate with cloud blob storage, we recommend using presigned urls. + +[Learn more about S3 presigned urls here](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-presigned-url.html) + +### Additional prerequisites + +* [Create an S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html).
+* The `awscli` package (Run `pip install awscli`) to configure your credentials and interactively use s3. + - [Configure your credentials](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-quickstart.html). +* The `boto3` python package (Run `pip install boto3`) to generate presigned urls. + +### Step 1: Upload your input script + +To follow along with this example, you can download the example batch, or create your own batch file in your working directory. + + ``` + wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl + ``` + + Once you've created your batch file it should look like this + + ``` + $ cat openai_example_batch.jsonl +{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} + ``` + +Now upload your batch file to your S3 bucket. + +``` +aws s3 cp openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl +``` + + +### Step 2: Generate your presigned urls + +Presigned urls can only be generated via the SDK. You can run the following python script to generate your presigned urls. Be sure to replace the `MY_BUCKET`, `MY_INPUT_FILE.jsonl`, and `MY_OUTPUT_FILE.jsonl` placeholders with your bucket and file names. 
def generate_presigned_url(s3_client, client_method, method_parameters, expires_in):
    """
    Generate a presigned Amazon S3 URL that can be used to perform an action.

    :param s3_client: A Boto3 Amazon S3 client.
    :param client_method: The name of the client method that the URL performs.
    :param method_parameters: The parameters of the specified client method.
    :param expires_in: The number of seconds the presigned URL is valid for.
    :return: The presigned URL.
    :raises: Whatever the client call raises (for example botocore's
        ``ClientError``) propagates to the caller unchanged.
    """
    # No try/except: a handler that only re-raises is a no-op, so errors are
    # simply left to propagate.
    return s3_client.generate_presigned_url(
        ClientMethod=client_method, Params=method_parameters, ExpiresIn=expires_in
    )
+ +``` +python -m vllm.entrypoints.openai.run_batch \ + -i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \ + -o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \ + --model meta-llama/Meta-Llama-3-8B-Instruct +``` + +### Step 4: View your results + +Your results are now on S3. You can view them in your terminal by running + +``` +aws s3 cp s3://MY_BUCKET/MY_OUTPUT_FILE.jsonl - +``` + +## Example 4: Using embeddings endpoint + +### Additional prerequisites + +* Ensure you are using `vllm >= 0.5.5`. + +### Step 1: Create your batch file + + Add embedding requests to your batch file. The following is an example: + + ``` + {"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are an unhelpful assistant."}} +``` + + You can even mix chat completion and embedding requests in the batch file, as long as the model you are using supports both chat completion and embeddings (note that all requests must use the same model). + + + ### Step 2: Run the batch + +You can run the batch using the same command as in earlier examples. 
+ + +### Step 3: Check your results + +You can check your results by running `cat results.jsonl` + +``` +$ cat results.jsonl +{"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null} +... +``` diff --git a/vllm-v0.6.2/examples/offline_inference_pixtral.py b/vllm-v0.6.2/examples/offline_inference_pixtral.py new file mode 100644 index 0000000..c12ff70 --- /dev/null +++ b/vllm-v0.6.2/examples/offline_inference_pixtral.py @@ -0,0 +1,165 @@ +# ruff: noqa +import argparse + +from vllm import LLM +from vllm.sampling_params import SamplingParams + +# This script is an offline demo for running Pixtral. +# +# If you want to run a server/client setup, please follow this code: +# +# - Server: +# +# ```bash +# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384 +# ``` +# +# - Client: +# +# ```bash +# curl --location 'http://<your-node-url>:8000/v1/chat/completions' \ +# --header 'Content-Type: application/json' \ +# --header 'Authorization: Bearer token' \ +# --data '{ +# "model": "mistralai/Pixtral-12B-2409", +# "messages": [ +# { +# "role": "user", +# "content": [ +# {"type" : "text", "text": "Describe this image in detail please."}, +# {"type": "image_url", "image_url": {"url": "https://s3.amazonaws.com/cms.ipressroom.com/338/files/201808/5b894ee1a138352221103195_A680%7Ejogging-edit/A680%7Ejogging-edit_hero.jpg"}}, +# {"type" : "text", "text": "and this one as well. 
#             Answer in French."},
#             {"type": "image_url", "image_url": {"url": "https://www.wolframcloud.com/obj/resourcesystem/images/a0e/a0ee3983-46c6-4c92-b85d-059044639928/6af8cfb971db031b.png"}}
#           ]
#         }
#       ]
#   }'
# ```
#
# Usage:
#     python demo.py simple
#     python demo.py advanced


def _text_part(text):
    """Return a chat-message content entry carrying plain text."""
    return {"type": "text", "text": text}


def _image_part(url):
    """Return a chat-message content entry pointing at an image URL."""
    return {"type": "image_url", "image_url": {"url": url}}


def run_simple_demo():
    """Ask Pixtral to describe one remote image and print the reply."""
    sampling_params = SamplingParams(max_tokens=8192)

    # Lower max_num_seqs or max_model_len on low-VRAM GPUs.
    llm = LLM(model="mistralai/Pixtral-12B-2409", tokenizer_mode="mistral")

    user_turn = {
        "role": "user",
        "content": [
            _text_part("Describe this image in one sentence."),
            _image_part("https://picsum.photos/id/237/200/300"),
        ],
    }
    outputs = llm.chat([user_turn], sampling_params=sampling_params)

    print(outputs[0].outputs[0].text)


def run_advanced_demo():
    """Run a multi-turn, multi-image Pixtral conversation and print the reply."""
    images_per_prompt = 5
    tokens_per_image = 4096

    sampling_params = SamplingParams(max_tokens=8192, temperature=0.7)
    llm = LLM(
        model="mistralai/Pixtral-12B-2409",
        tokenizer_mode="mistral",
        limit_mm_per_prompt={"image": images_per_prompt},
        # Context budget is sized from the per-prompt image limit.
        max_model_len=images_per_prompt * tokens_per_image,
    )

    first_url, second_url, third_url = (
        "https://huggingface.co/datasets/patrickvonplaten/random_img/resolve/main/yosemite.png",
        "https://picsum.photos/seed/picsum/200/300",
        "https://picsum.photos/id/32/512/512",
    )

    conversation = [
        {
            "role": "user",
            "content": [
                _text_part("Describe the following image."),
                _image_part(first_url),
                _image_part(second_url),
            ],
        },
        {"role": "assistant", "content": "The images show nature."},
        {
            "role": "user",
            "content": "More details please and answer only in French!.",
        },
        {"role": "user", "content": [_image_part(third_url)]},
    ]

    outputs = llm.chat(messages=conversation, sampling_params=sampling_params)
    print(outputs[0].outputs[0].text)


def main():
    """Read the demo mode from the command line and run the matching demo."""
    parser = argparse.ArgumentParser(
        description="Run a demo in simple or advanced mode.")
    parser.add_argument(
        "mode",
        choices=["simple", "advanced"],
        help="Specify the demo mode: 'simple' or 'advanced'",
    )
    mode = parser.parse_args().mode

    # Each mode maps to its banner and the demo to run.
    demos = {
        "simple": ("Running simple demo...", run_simple_demo),
        "advanced": ("Running advanced demo...", run_advanced_demo),
    }
    selected = demos.get(mode)
    if selected is not None:
        banner, demo = selected
        print(banner)
        demo()


if __name__ == "__main__":
    main()
`top_p` should be 1.0. +sampling_params = SamplingParams(temperature=0.7, + top_p=1.0, + n=N, + max_tokens=16) + +# Set `enforce_eager=True` to avoid ahead-of-time compilation. +# In real workloads, `enforce_eager` should be `False`. +llm = LLM(model="google/gemma-2b", enforce_eager=True) +outputs = llm.generate(prompts, sampling_params) +for output, answer in zip(outputs, answers): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text.startswith(answer) diff --git a/vllm-v0.6.2/examples/offline_inference_vision_language.py b/vllm-v0.6.2/examples/offline_inference_vision_language.py new file mode 100644 index 0000000..07b1eaa --- /dev/null +++ b/vllm-v0.6.2/examples/offline_inference_vision_language.py @@ -0,0 +1,537 @@ +""" +This example shows how to use vLLM for running offline inference with +the correct prompt format on vision language models for text generation. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +from transformers import AutoTokenizer + +from vllm import LLM, SamplingParams +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset +from vllm.utils import FlexibleArgumentParser +from vllm_mlu._mlu_utils import USE_PAGED + +# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on +# lower-end GPUs. +# Unless specified, these settings have been tested to work on a single L4. 
# LLaVA-1.5
def run_llava(question: str, modality: str):
    """Build the LLaVA-1.5 engine and prompt for a single-image question.

    :param question: The question to ask about the image.
    :param modality: Input modality; only ``"image"`` is accepted.
    :return: ``(llm, prompt, stop_token_ids)``; ``stop_token_ids`` is ``None``.
    :raises AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    # The prompt must carry the ``<image>`` placeholder; it had been dropped,
    # leaving "USER: \n...". Format per the llava-hf model card.
    prompt = f"USER: <image>\n{question}\nASSISTANT:"

    llm = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096)
    stop_token_ids = None
    return llm, prompt, stop_token_ids


# LLaVA-1.6/LLaVA-NeXT
def run_llava_next(question: str, modality: str):
    """Build the LLaVA-NeXT (1.6) engine and prompt for a single-image question.

    :param question: The question to ask about the image.
    :param modality: Input modality; only ``"image"`` is accepted.
    :return: ``(llm, prompt, stop_token_ids)``; ``stop_token_ids`` is ``None``.
    :raises AssertionError: If ``modality`` is not ``"image"``.
    """
    assert modality == "image"

    # Same repair as for LLaVA-1.5: restore the ``<image>`` placeholder
    # inside the Mistral-style ``[INST] ... [/INST]`` wrapper.
    prompt = f"[INST] <image>\n{question} [/INST]"
    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
    stop_token_ids = None
    return llm, prompt, stop_token_ids