diff --git a/docker-images/mlu370_launch_service b/docker-images/mlu370_launch_service
new file mode 100644
index 0000000..c459fb3
--- /dev/null
+++ b/docker-images/mlu370_launch_service
@@ -0,0 +1,69 @@
+#!/bin/bash
+
+cat /proc/cpuinfo | tail -n 50
+cnmon
+unset CUDA_VISIBLE_DEVICES
+export
+date
+
+DEFAULT_HOST="0.0.0.0"
+DEFAULT_PORT="8000"
+DEFAULT_SERVED_MODEL_NAME="llm"
+DEFAULT_MODEL_PATH="/model"
+DEFAULT_MAX_MODEL_LEN="8192"
+DEFAULT_TENSOR_PARALLEL_SIZE="1"
+DEFAULT_MAX_NUM_SEQS="64"
+DEFAULT_ENFORCE_EAGER="true"
+DEFAULT_DISABLE_LOG_REQUESTS="true"
+DEFAULT_PREFIX_CACHING="false"
+
+HOST_VAL=${HOST:-$DEFAULT_HOST}
+PORT_VAL=${PORT:-$DEFAULT_PORT}
+SERVED_MODEL_NAME_VAL=${SERVED_MODEL_NAME:-$DEFAULT_SERVED_MODEL_NAME}
+MODEL_PATH_VAL=${MODEL_PATH:-$DEFAULT_MODEL_PATH}
+MAX_MODEL_LEN_VAL=${MAX_MODEL_LEN:-$DEFAULT_MAX_MODEL_LEN}
+TENSOR_PARALLEL_SIZE_VAL=${TENSOR_PARALLEL_SIZE:-$DEFAULT_TENSOR_PARALLEL_SIZE}
+MAX_NUM_SEQS_VAL=${MAX_NUM_SEQS:-$DEFAULT_MAX_NUM_SEQS}
+INCLUDE_ENFORCE_EAGER_FLAG=${ENFORCE_EAGER:-$DEFAULT_ENFORCE_EAGER}
+INCLUDE_DISABLE_LOG_REQUESTS_FLAG=${DISABLE_LOG_REQUESTS:-$DEFAULT_DISABLE_LOG_REQUESTS}
+INCLUDE_PREFIX_CACHING_FLAG=${PREFIX_CACHING:-$DEFAULT_PREFIX_CACHING}
+
+CMD_ARGS=()
+CMD_ARGS+=(--host "$HOST_VAL")
+CMD_ARGS+=(--port "$PORT_VAL")
+
+if [[ "$INCLUDE_ENFORCE_EAGER_FLAG" != "false" && "$INCLUDE_ENFORCE_EAGER_FLAG" != "0" ]]; then
+    CMD_ARGS+=(--enforce-eager)
+fi
+if [[ "$INCLUDE_DISABLE_LOG_REQUESTS_FLAG" != "false" && "$INCLUDE_DISABLE_LOG_REQUESTS_FLAG" != "0" ]]; then
+    CMD_ARGS+=(--disable-log-requests)
+fi
+if [[ "$INCLUDE_PREFIX_CACHING_FLAG" != "false" && "$INCLUDE_PREFIX_CACHING_FLAG" != "0" ]]; then
+    CMD_ARGS+=(--enable-prefix-caching)
+fi
+
+CMD_ARGS+=(--served-model-name "$SERVED_MODEL_NAME_VAL")
+CMD_ARGS+=(--model "$MODEL_PATH_VAL")
+CMD_ARGS+=(--max-model-len "$MAX_MODEL_LEN_VAL")
+CMD_ARGS+=(--tensor-parallel-size "$TENSOR_PARALLEL_SIZE_VAL")
+CMD_ARGS+=(--max-num-seqs "$MAX_NUM_SEQS_VAL")
+CMD_ARGS+=(--trust-remote-code --dtype float16 --block-size "$MAX_MODEL_LEN_VAL")
+echo "--------------------------------------------------"
+echo "Starting VLLM OpenAI API Server..."
+echo "Using effective arguments:"
+echo " Host (--host): $HOST_VAL"
+echo " Port (--port): $PORT_VAL"
+echo " Enforce Eager (--enforce-eager):" $([[ "$INCLUDE_ENFORCE_EAGER_FLAG" != "false" && "$INCLUDE_ENFORCE_EAGER_FLAG" != "0" ]] && echo "Enabled" || echo "Disabled (Env: ENFORCE_EAGER=$ENFORCE_EAGER)")
+echo " Disable Log Req (--disable-log-requests):" $([[ "$INCLUDE_DISABLE_LOG_REQUESTS_FLAG" != "false" && "$INCLUDE_DISABLE_LOG_REQUESTS_FLAG" != "0" ]] && echo "Enabled" || echo "Disabled (Env: DISABLE_LOG_REQUESTS=$DISABLE_LOG_REQUESTS)")
+echo " Served Model Name (--served-model-name): $SERVED_MODEL_NAME_VAL"
+echo " Model Path (--model): $MODEL_PATH_VAL"
+echo " Max Model Length (--max-model-len): $MAX_MODEL_LEN_VAL"
+echo " Tensor Parallel Size (--tensor-parallel-size): $TENSOR_PARALLEL_SIZE_VAL"
+echo " Max Num Seqs (--max-num-seqs): $MAX_NUM_SEQS_VAL"
+echo "--------------------------------------------------"
+echo "Full cmd:"
+echo "python3 -m vllm.entrypoints.openai.api_server ${CMD_ARGS[*]}"
+echo "--------------------------------------------------"
+
+python3 -m vllm.entrypoints.openai.api_server "${CMD_ARGS[@]}"
+
diff --git a/docker-images/template.jinja b/docker-images/template.jinja
new file mode 100644
index 0000000..d307227
--- /dev/null
+++ b/docker-images/template.jinja
@@ -0,0 +1,15 @@
+{% for m in messages %}
+{% if m['role'] == 'system' %}{{ bos_token }}<|system|>
+{{ m['content'] }}
+<|end|>
+{% elif m['role'] == 'user' %}{{ bos_token }}<|user|>
+{{ m['content'] }}
+<|end|>
+{% elif m['role'] == 'assistant' %}{{ bos_token }}<|assistant|>
+{{ m['content'] }}
+<|end|>
+{% endif %}
+{% endfor %}
+{% if add_generation_prompt %}{{ bos_token }}<|assistant|>
+{% endif %}
+