# Files
# enginex-mlu370-vllm/vllm-v0.6.2/tests/mlu_cases_list.sh
# 2026-02-04 17:22:39 +08:00
#
# 442 lines
# 13 KiB
# Bash
#!/bin/bash
LINK_MODELS() {
  # Populate the working directory with vendor-style folders mirroring the
  # HuggingFace hub layout (e.g. "meta-llama/Llama-2-7b-hf") and symlink the
  # locally stored checkpoints into them, so the tests can resolve model ids
  # without network access.
  mkdir -p meta-llama openai-community Qwen NousResearch mistralai THUDM baichuan-inc llava-hf
  ln -s /data/AE/llm/models/Llama-2-7b meta-llama/
  ln -s /data/AE/llm/models/Llama-2-7b-hf meta-llama/
  # Extra lowercase alias: some tests reference "llama-2-7b-hf".
  ln -s /data/AE/llm/models/Llama-2-7b-hf meta-llama/llama-2-7b-hf
  ln -s /data/AE/llm/models/Llama-2-7b-chat-hf meta-llama/Llama-2-7b-chat-hf
  ln -s /data/AE/llm/models/Llama-2-13b-chat-hf meta-llama/Llama-2-13b-chat-hf
  ln -s /data/AE/llm/models/Meta-Llama-3-8B meta-llama/
  ln -s /data/AE/llm/models/Meta-Llama-3-8B-Instruct meta-llama/
  ln -s /data/vllm/models/LLM-Research/Llama-3.2-1B-Instruct meta-llama/
  ln -s /data/AE/llm/models/Meta-Llama-3-8B-Instruct NousResearch/
  ln -s /data/AE/llm/models/Qwen1.5-7B Qwen/
  ln -s /data/AE/llm/models/Qwen2-7B-Instruct Qwen/
  ln -s /data/AE/llm/models/Mistral-7B-v0.1 mistralai/
  ln -s /data/vllm/models/LLM-Research/Mixtral-8x7B-Instruct-v0.1 mistralai/
  ln -s /data/AE/llm/models/chatglm3-6b THUDM/
  ln -s /data/vllm/models/LLM-Research/Baichuan-7B baichuan-inc/
  ln -s /data/vllm/vLLM_ut_hf_models/gpt2 openai-community/
  ln -s /data/AE/llm/models/llava-1.5-7b-hf llava-hf/
  # Record the names of the prepackaged test models ($() instead of legacy
  # backticks) and expose them all at the top level of the working directory.
  LOCAL_MODEL=$(ls /data/vllm/vLLM_ut_hf_models/)
  ln -s /data/vllm/vLLM_ut_hf_models/* .
  # create huggingface cache dir if not exists
  mkdir -p ~/.cache/huggingface/datasets
  ln -s /data/vllm/vLLM_ut_hf_models/gsm8k/ ~/.cache/huggingface/datasets/gsm8k
}
UNLINK_MODELS() {
  # Tear down everything LINK_MODELS created in the working directory plus the
  # HuggingFace dataset cache, so a fresh LINK_MODELS run starts clean.
  LOCAL_MODEL=$(ls /data/vllm/vLLM_ut_hf_models/)
  # Word-splitting of ${LOCAL_MODEL} is intentional: it holds one entry per
  # linked model directory (names contain no whitespace).
  # shellcheck disable=SC2086
  rm -rf meta-llama openai-community Qwen NousResearch mistralai THUDM baichuan-inc llava-hf ${LOCAL_MODEL}
  rm -rf ~/.cache/huggingface/datasets
}
### async_engine ###
# Pytest targets for the async engine suite; consumed by run_ut via ONLINE_CASES.
ASYNC_ENGINE_CASES=(
async_engine/test_api_server.py
async_engine/test_async_llm_engine.py
async_engine/test_openapi_server.py
async_engine/test_request_tracker.py
)
### basic_correctness ###
# Skip cases
# - test_chunked_prefill.py::test_models_with_fp8_kv_cache
# - test_basic_correctness::test_model_with_failure
# before test: export VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
# after test: unset VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT
BASIC_CORRECTNESS_CASES=(
basic_correctness/test_basic_correctness.py::test_vllm_gc_ed
basic_correctness/test_basic_correctness.py::test_models
basic_correctness/test_basic_correctness.py::test_models_distributed
basic_correctness/test_chunked_prefill.py::test_models_distributed
basic_correctness/test_chunked_prefill.py::test_models
basic_correctness/test_chunked_prefill.py::test_with_prefix_caching
basic_correctness/test_cpu_offload.py
basic_correctness/test_preemption.py::test_chunked_prefill_recompute
basic_correctness/test_preemption.py::test_preemption
basic_correctness/test_preemption.py::test_preemption_infeasible
)
### benchmark ###
# before test: export VLLM_LATENCY_DEBUG=1
# after test: unset VLLM_LATENCY_DEBUG
BENCHMARK_CASES=(
benchmark/test_benchmark_latency.py
)
### compile ###
# FIXME: Pytorch 2.4 not support torch.compile, skip vllm compile cases.
# Add this back when upgrade pytorch to 2.5.
# COMPILE_CASES=(
# compile/test_full_graph.py
# compile/test_wrapper.py
# )
### core ###
CORE_CASES=(
core/test_chunked_prefill_scheduler.py
core/test_num_computed_tokens_update.py
core/test_scheduler_encoder_decoder.py
core/test_scheduler.py
core/test_serialization.py
core/block/test_block_manager.py
core/block/test_block_table.py
core/block/test_common.py
core/block/test_cpu_gpu_block_allocator.py
core/block/test_naive_block.py
core/block/test_prefix_caching_block.py
core/block/e2e/test_correctness.py::test_block_manager_with_preemption
core/block/e2e/test_correctness.py::test_lookahead_greedy_equality_with_preemption
core/block/e2e/test_correctness.py::test_chunked_prefill_block_manager
core/block/e2e/test_correctness.py::test_block_manager_prefix_caching_enabled_with_preemption
core/block/e2e/test_correctness.py::test_auto_prefix_caching_with_preemption
# NOTE(review): "evition" looks like a typo but matches the upstream test name
# in the vllm repo -- verify against the tests tree before "fixing" it here.
core/block/e2e/test_correctness.py::test_auto_prefix_caching_after_evition_start
core/block/e2e/test_correctness_sliding_window.py
)
### distributed ###
# Skip cases
# - test_custom_all_reduce.py
# - test_distributed_oot.py
# - test_multi_node_assignment.py
# - test_same_node.py
DISTRIBUTED_CASES=(
distributed/test_pipeline_parallel.py
distributed/test_pipeline_partition.py
distributed/test_pp_cudagraph.py
distributed/test_shm_broadcast.py
distributed/test_utils.py
)
# These cases must run against the packaged models, not the local symlinks:
# before test: UNLINK_MODELS
# after test: LINK_MODELS
DISTRIBUTED_NEED_PACK_CASES=(
distributed/test_comm_ops.py
)
### engine ###
ENGINE_CASES=(
engine/test_arg_utils.py
engine/test_computed_prefix_blocks.py
engine/test_custom_executor.py
engine/test_detokenization.py
engine/test_multiproc_workers.py
engine/test_short_mm_context.py
engine/test_skip_tokenizer_init.py
engine/test_stop_reason.py
engine/test_stop_strings.py
engine/output_processor/test_multi_step.py
engine/output_processor/test_stop_checker.py
)
### entrypoints ###
# Skip cases
# - entrypoints/llm/test_encode.py
# - entrypoints/llm/test_generate_multiple_loras.py
# - entrypoints/openai/test_accuracy.py
# - entrypoints/openai/test_audio.py
# - entrypoints/openai/test_chat.py
# - entrypoints/openai/test_completion.py
# - entrypoints/openai/test_embedding.py
# - entrypoints/openai/test_encoder_decoder.py
# - entrypoints/openai/test_metrics.py
# - entrypoints/openai/test_models.py
# - entrypoints/openai/test_oot_registration.py
# - entrypoints/openai/test_return_tokens_as_ids.py
# - entrypoints/openai/test_shutdown.py
# - entrypoints/openai/test_tokenization.py
# - entrypoints/openai/test_vision.py
# - entrypoints/openai/test_run_batch.py::test_embeddings
ENTRYPOINTS_CASES=(
entrypoints/test_chat_utils.py
entrypoints/llm/test_chat.py
entrypoints/llm/test_generate.py
entrypoints/llm/test_guided_generate.py
entrypoints/llm/test_lazy_outlines.py
entrypoints/llm/test_prompt_validation.py
entrypoints/offline_mode/test_offline_mode.py
entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
entrypoints/openai/test_basic.py
entrypoints/openai/test_chat_template.py
entrypoints/openai/test_chunked_prompt.py
entrypoints/openai/test_cli_args.py
entrypoints/openai/test_prompt_validation.py
entrypoints/openai/test_run_batch.py::test_empty_file
entrypoints/openai/test_run_batch.py::test_completions
entrypoints/openai/test_run_batch.py::test_completions_invalid_input
entrypoints/openai/test_serving_chat.py
entrypoints/openai/test_serving_engine.py
)
### kernels ###
# Skip cases
# - other op/layer test
KERNELS_CASES=(
kernels/bt_torch_ops
kernels/test_advance_step.py
kernels/test_feed_forward.py
)
### lora ###
# triton kernel tests are in TRITON_CASE
# - lora/test_llama.py::test_llama_lora_warmup
# - lora/test_tokenizer_group.py
# NOTE: The following tests requires 4 gpus, which can not run
# in ci environment. We should check these tests after we do
# some modifications for lora.
# lora/test_long_context.py::test_batched_rope_kernel
# lora/test_long_context.py::test_self_consistency
# lora/test_long_context.py::test_quality
# lora/test_long_context.py::test_max_len
LORA_CASES=(
lora/test_layers.py
lora/test_lora_checkpoints.py
lora/test_lora_huggingface.py
lora/test_lora_manager.py
lora/test_utils.py
lora/test_worker.py
lora/test_baichuan.py
lora/test_chatglm3.py
lora/test_llama.py::test_llama_lora[1]
lora/test_llama.py::test_llama_lora[2]
lora/test_long_context.py::test_rotary_emb_replaced
)
### metrics ###
METRICS_CASES=(
metrics/test_metrics.py
)
### model_executor ###
# Skip cases
# - weight_utils.py::test_download_weights_from_hf
MODEL_EXECUTOR_CASES=(
model_executor/test_enabled_custom_ops.py
model_executor/test_guided_processors.py
model_executor/weight_utils.py::test_hf_transfer_auto_activation
)
### models ###
# Skip cases
# - test_oot_registration.py
# - part of cases in decoder_only
# - all cases in embedding
# - all cases in encoder_only
MODELS_CASES=(
models/test_registry.py
models/decoder_only/language/test_big_models.py
models/decoder_only/language/test_models.py
)
### mq_llm_engine ###
# Whole directory is passed to pytest (no per-file selection needed).
MQ_LLM_ENGINE=(
mq_llm_engine
)
### multi_step ###
MULTI_STEP_CASES=(
multi_step/test_correctness_async_llm.py
multi_step/test_correctness_llm.py
)
### multimodal ###
MULTIMODAL_CASES=(
multimodal/test_inputs.py
multimodal/test_mapper.py
multimodal/test_processor_kwargs.py
)
### prefix_caching ###
# Skip cases
# - prefix_caching/test_disable_sliding_window.py
PREFIX_CACHING_CASES=(
prefix_caching/test_prefix_caching.py
)
### prompt_adapter ###
# Skip all cases
### quantization ###
# Skip all cases
### sampler ###
# Whole directory is passed to pytest.
SAMPLER_CASES=(
samplers
)
### spec_decode ###
# Skip cases
# spec_decode/test_multi_step_worker.py
# spec_decode/test_scorer.py
# spec_decode/e2e/test_eagle_correctness.py
# spec_decode/e2e/test_integration.py
# spec_decode/e2e/test_integration_dist_tp2.py
# spec_decode/e2e/test_integration_dist_tp4.py
# spec_decode/e2e/test_logprobs.py
# spec_decode/e2e/test_medusa_correctness.py
# spec_decode/e2e/test_mlp_correctness.py
# spec_decode/e2e/test_multistep_correctness.py
# spec_decode/e2e/test_ngram_correctness.py
# spec_decode/e2e/test_seed.py
SPEC_DECODE_CASES=(
spec_decode/e2e/test_compatibility.py
spec_decode/test_batch_expansion.py
spec_decode/test_dynamic_spec_decode.py
spec_decode/test_metrics.py
spec_decode/test_ngram_worker.py
spec_decode/test_spec_decode_worker.py
spec_decode/test_utils.py
)
### tensorizer_loader
TENSORIZER_LOADER_CASES=(
tensorizer_loader
)
### tokenization ###
# Skip cases
# - test_get_eos.py
# - test_tokenizer.py
TOKENIZATION_CASES=(
tokenization/test_cached_tokenizer.py
tokenization/test_detokenize.py
tokenization/test_tokenizer_group.py
)
### tool_use ###
# Skip all cases
TOOL_USE_CASES=(
tool_use/test_chat_completion_request_validations.py
)
### tpu ###
# Skip all cases
### tracing ###
# Skip all cases
### weight_loading ###
WEIGHT_LOADING_CASES=(
weight_loading/test_weight_loading.py
)
### worker ###
WORKER_CASES=(
worker/test_encoder_decoder_model_runner.py
worker/test_model_input.py
worker/test_model_runner.py
worker/test_swap.py
)
### . ###
# Skip cases
# - test_embedded_commit.py
# - test_scalartype.py
GLOBAL_CASES=(
test_cache_block_hashing.py
test_config.py
test_inputs.py
test_logger.py
test_logits_processor.py
test_regression.py
test_sampling_params.py
# NOTE(review): test_scalartype.py is listed in the "Skip cases" comment above
# yet included here -- confirm which of the two is stale and reconcile.
test_scalartype.py
test_sequence.py
test_sharded_state_loader.py::test_filter_subtensors
test_utils.py
)
# Case groupings consumed by run_ut: one online-serving bucket plus three
# offline shards (split to balance CI runtime). Expansions are quoted
# ("${ARR[@]}") so each element stays a single word even if a future case id
# ever contains whitespace (SC2068); empty/unset arrays still contribute
# zero elements.
ONLINE_CASES=(
"${ASYNC_ENGINE_CASES[@]}"
"${ENTRYPOINTS_CASES[@]}"
)
OFFLINE_CASES0=(
# "${COMPILE_CASES[@]}"
"${CORE_CASES[@]}"
"${DISTRIBUTED_CASES[@]}"
"${ENGINE_CASES[@]}"
"${KERNELS_CASES[@]}"
)
OFFLINE_CASES1=(
"${LORA_CASES[@]}"
"${METRICS_CASES[@]}"
"${MODEL_EXECUTOR_CASES[@]}"
"${MODELS_CASES[@]}"
"${MQ_LLM_ENGINE[@]}"
"${MULTI_STEP_CASES[@]}"
"${MULTIMODAL_CASES[@]}"
"${PREFIX_CACHING_CASES[@]}"
)
OFFLINE_CASES2=(
"${SAMPLER_CASES[@]}"
"${SPEC_DECODE_CASES[@]}"
"${TENSORIZER_LOADER_CASES[@]}"
"${TOKENIZATION_CASES[@]}"
"${TOOL_USE_CASES[@]}"
"${WEIGHT_LOADING_CASES[@]}"
"${WORKER_CASES[@]}"
"${GLOBAL_CASES[@]}"
)
# examples/cambricon_custom_func cases
CAMBRICON_CUSTOM_FUNC_CASES=(
expert_parallel/test_expert_parallel.py
context_parallel/test_context_parallel.py
context_parallel/test_context_parallel_kv8.py
)
# Runner configuration: base pytest invocation and ANSI colors for the
# PASS/FAIL banners printed by run_ut.
pytest_cmd="pytest -s -v"
RED='\033[0;31m'
GREEN='\033[0;32m'
NC='\033[0m'
# Refresh test-only python dependencies before running any suite.
# NOTE(review): failures here are ignored -- presumably acceptable in CI where
# the packages are pre-cached; confirm if a hard failure is wanted instead.
pip uninstall datasets -y
pip install datasets
pip install modelscope
pip install pytest_asyncio
run_ut() {
  # Run a list of pytest targets one by one, aborting the whole script on the
  # first failure.
  # Globals:   pytest_cmd, CI_WORK_DIR, RED, GREEN, NC (all read)
  # Arguments: $1 - seconds to sleep before each case
  #            $2... - pytest targets (file, directory, or node id)
  # Outputs:   progress banners on stdout; one junit xml per case (overwritten)
  # Returns:   0 if every case passes; otherwise exits with pytest's status
  local interval=$1
  shift
  local case_list=("$@")
  local case_len=${#case_list[@]}
  local i ut ret_val
  echo "Total ${case_len} cases"
  for ((i = 0; i < case_len; i++)); do
    ut=${case_list[i]}
    echo "###############################"
    echo "Run ${i}/${case_len}, ${ut} ..."
    echo "###############################"
    sleep "${interval}"
    # ${pytest_cmd} and ${ut} are deliberately unquoted: pytest_cmd carries
    # its own flags and must word-split. The eval from the original version
    # is dropped -- no further shell interpretation of case names is needed,
    # and eval would mis-execute ids containing metacharacters.
    # shellcheck disable=SC2086
    ${pytest_cmd} ${ut} --junit-xml "${CI_WORK_DIR}/ut_test.xml"
    ret_val=$?
    if [ "${ret_val}" -ne 0 ]; then
      echo "###############################"
      echo -e "${RED}FAILED: ${ut} ... ${NC}"
      echo "###############################"
      exit "${ret_val}"
    else
      echo "###############################"
      echo -e "${GREEN}PASS: ${ut} ... ${NC}"
      echo "###############################"
    fi
  done
}