feature: add
This commit is contained in:
69
docker-images/mlu370_launch_service
Normal file
69
docker-images/mlu370_launch_service
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
#!/bin/bash
#
# Launch script for the vLLM OpenAI API server.
# Settings are read from environment variables further down, each falling
# back to a built-in default.

# --- Startup diagnostics (written to stdout) ---------------------------------

# Last 50 lines of the kernel's CPU description.
tail -n 50 /proc/cpuinfo

# NOTE(review): presumably the accelerator vendor's monitoring CLI; if it is
# not installed the shell reports "command not found" and the script continues.
cnmon

# Drop any CUDA device mask that may be present in the environment.
unset CUDA_VISIBLE_DEVICES

# List every exported variable.
# NOTE(review): this also prints any secrets present in the environment.
export

date

# --- Defaults -----------------------------------------------------------------
# Used when the matching environment variable is unset or empty.
DEFAULT_HOST="0.0.0.0"
DEFAULT_PORT="8000"
DEFAULT_SERVED_MODEL_NAME="llm"
DEFAULT_MODEL_PATH="/model"
DEFAULT_MAX_MODEL_LEN="8192"
DEFAULT_TENSOR_PARALLEL_SIZE="1"
DEFAULT_MAX_NUM_SEQS="64"
DEFAULT_ENFORCE_EAGER="true"
DEFAULT_DISABLE_LOG_REQUESTS="true"
DEFAULT_PREFIX_CACHING="false"

# --- Effective values ---------------------------------------------------------
# Environment override first, default otherwise (":-" also replaces an empty
# value with the default).
HOST_VAL=${HOST:-$DEFAULT_HOST}
PORT_VAL=${PORT:-$DEFAULT_PORT}
SERVED_MODEL_NAME_VAL=${SERVED_MODEL_NAME:-$DEFAULT_SERVED_MODEL_NAME}
MODEL_PATH_VAL=${MODEL_PATH:-$DEFAULT_MODEL_PATH}
MAX_MODEL_LEN_VAL=${MAX_MODEL_LEN:-$DEFAULT_MAX_MODEL_LEN}
TENSOR_PARALLEL_SIZE_VAL=${TENSOR_PARALLEL_SIZE:-$DEFAULT_TENSOR_PARALLEL_SIZE}
MAX_NUM_SEQS_VAL=${MAX_NUM_SEQS:-$DEFAULT_MAX_NUM_SEQS}
INCLUDE_ENFORCE_EAGER_FLAG=${ENFORCE_EAGER:-$DEFAULT_ENFORCE_EAGER}
INCLUDE_DISABLE_LOG_REQUESTS_FLAG=${DISABLE_LOG_REQUESTS:-$DEFAULT_DISABLE_LOG_REQUESTS}
INCLUDE_PREFIX_CACHING_FLAG=${PREFIX_CACHING:-$DEFAULT_PREFIX_CACHING}

#######################################
# Decide whether an on/off switch is turned on.
# Arguments: $1 - switch value
# Returns:   1 when the value is exactly "false" or "0" (case-sensitive),
#            0 for anything else.
#######################################
flag_enabled() {
  case "$1" in
    false | 0) return 1 ;;
    *) return 0 ;;
  esac
}

# --- Server argument list -----------------------------------------------------
# Kept as an array so every value stays a single word when passed on.
CMD_ARGS=(--host "$HOST_VAL" --port "$PORT_VAL")

if flag_enabled "$INCLUDE_ENFORCE_EAGER_FLAG"; then
  CMD_ARGS+=(--enforce-eager)
fi
if flag_enabled "$INCLUDE_DISABLE_LOG_REQUESTS_FLAG"; then
  CMD_ARGS+=(--disable-log-requests)
fi
if flag_enabled "$INCLUDE_PREFIX_CACHING_FLAG"; then
  CMD_ARGS+=(--enable-prefix-caching)
fi

# NOTE(review): --block-size is deliberately tied to the max model length
# here; presumably a requirement of the target backend — confirm before
# changing it.
CMD_ARGS+=(
  --served-model-name "$SERVED_MODEL_NAME_VAL"
  --model "$MODEL_PATH_VAL"
  --max-model-len "$MAX_MODEL_LEN_VAL"
  --tensor-parallel-size "$TENSOR_PARALLEL_SIZE_VAL"
  --max-num-seqs "$MAX_NUM_SEQS_VAL"
  --trust-remote-code
  --dtype float16
  --block-size "$MAX_MODEL_LEN_VAL"
)

#######################################
# Describe the state of an on/off switch for the launch summary.
# Arguments: $1 - effective switch value
#            $2 - name of the environment variable that controls it
# Outputs:   "Enabled" unless $1 is exactly "false" or "0"; otherwise
#            "Disabled (Env: NAME=<raw value of that variable>)".
#######################################
describe_flag() {
  local env_name=$2
  if [[ "$1" != "false" && "$1" != "0" ]]; then
    echo "Enabled"
  else
    echo "Disabled (Env: ${env_name}=${!env_name})"
  fi
}

#######################################
# Print the effective settings and the full launch command to stdout.
# Globals:   HOST_VAL, PORT_VAL, SERVED_MODEL_NAME_VAL, MODEL_PATH_VAL,
#            MAX_MODEL_LEN_VAL, TENSOR_PARALLEL_SIZE_VAL, MAX_NUM_SEQS_VAL,
#            INCLUDE_*_FLAG, CMD_ARGS (all read only)
#######################################
print_launch_summary() {
  local rule="--------------------------------------------------"
  local launch_cmd

  echo "$rule"
  echo "Starting VLLM OpenAI API Server..."
  echo "Using effective arguments:"
  echo " Host (--host): $HOST_VAL"
  echo " Port (--port): $PORT_VAL"
  echo " Enforce Eager (--enforce-eager): $(describe_flag "$INCLUDE_ENFORCE_EAGER_FLAG" ENFORCE_EAGER)"
  echo " Disable Log Req (--disable-log-requests): $(describe_flag "$INCLUDE_DISABLE_LOG_REQUESTS_FLAG" DISABLE_LOG_REQUESTS)"
  # Prefix caching is passed to the server as well, so report it like the
  # other two switches.
  echo " Prefix Caching (--enable-prefix-caching): $(describe_flag "$INCLUDE_PREFIX_CACHING_FLAG" PREFIX_CACHING)"
  echo " Served Model Name (--served-model-name): $SERVED_MODEL_NAME_VAL"
  echo " Model Path (--model): $MODEL_PATH_VAL"
  echo " Max Model Length (--max-model-len): $MAX_MODEL_LEN_VAL"
  echo " Tensor Parallel Size (--tensor-parallel-size): $TENSOR_PARALLEL_SIZE_VAL"
  echo " Max Num Seqs (--max-num-seqs): $MAX_NUM_SEQS_VAL"
  echo "$rule"
  echo "Full cmd:"
  # %q shell-quotes each word, so values containing spaces or special
  # characters are shown unambiguously and the line can be pasted back
  # into a shell.
  printf -v launch_cmd '%q ' python3 -m vllm.entrypoints.openai.api_server "${CMD_ARGS[@]}"
  printf '%s\n' "${launch_cmd% }"
  echo "$rule"
}

print_launch_summary

# Replace this shell with the server process instead of running it as a child:
# signals sent to the script's PID (e.g. the SIGTERM issued when a container is
# stopped) then reach the server directly, and the server's exit status becomes
# the script's exit status.
exec python3 -m vllm.entrypoints.openai.api_server "${CMD_ARGS[@]}"
|
||||||
|
|
||||||
15
docker-images/template.jinja
Normal file
15
docker-images/template.jinja
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
{% for m in messages %}
|
||||||
|
{% if m['role'] == 'system' %}{{ bos_token }}<|system|>
|
||||||
|
{{ m['content'] }}
|
||||||
|
<|end|>
|
||||||
|
{% elif m['role'] == 'user' %}{{ bos_token }}<|user|>
|
||||||
|
{{ m['content'] }}
|
||||||
|
<|end|>
|
||||||
|
{% elif m['role'] == 'assistant' %}{{ bos_token }}<|assistant|>
|
||||||
|
{{ m['content'] }}
|
||||||
|
<|end|>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% if add_generation_prompt %}{{ bos_token }}<|assistant|>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
Reference in New Issue
Block a user