#!/bin/bash

#######################################
# Expose locally mirrored model weights under HuggingFace-style
# "org/model" directory names by symlinking them into the current
# working directory, and link the gsm8k dataset into the HF cache.
# Globals:   none
# Arguments: none
# Outputs:   creates org dirs + symlinks in $PWD and ~/.cache/huggingface
#######################################
LINK_MODELS() {
    mkdir -p meta-llama openai-community Qwen NousResearch mistralai THUDM baichuan-inc llava-hf
    # -snf: force-refresh an existing link and do not dereference a
    # link-to-directory destination, so the function is idempotent
    # across repeated CI runs instead of failing with "File exists".
    ln -snf /data/AE/llm/models/Llama-2-7b meta-llama/
    ln -snf /data/AE/llm/models/Llama-2-7b-hf meta-llama/
    # Lowercase alias for tests that reference the lowercased model id.
    ln -snf /data/AE/llm/models/Llama-2-7b-hf meta-llama/llama-2-7b-hf
    ln -snf /data/AE/llm/models/Llama-2-7b-chat-hf meta-llama/Llama-2-7b-chat-hf
    ln -snf /data/AE/llm/models/Llama-2-13b-chat-hf meta-llama/Llama-2-13b-chat-hf
    ln -snf /data/AE/llm/models/Meta-Llama-3-8B meta-llama/
    ln -snf /data/AE/llm/models/Meta-Llama-3-8B-Instruct meta-llama/
    ln -snf /data/vllm/models/LLM-Research/Llama-3.2-1B-Instruct meta-llama/
    ln -snf /data/AE/llm/models/Meta-Llama-3-8B-Instruct NousResearch/
    ln -snf /data/AE/llm/models/Qwen1.5-7B Qwen/
    ln -snf /data/AE/llm/models/Qwen2-7B-Instruct Qwen/
    ln -snf /data/AE/llm/models/Mistral-7B-v0.1 mistralai/
    ln -snf /data/vllm/models/LLM-Research/Mixtral-8x7B-Instruct-v0.1 mistralai/
    ln -snf /data/AE/llm/models/chatglm3-6b THUDM/
    ln -snf /data/vllm/models/LLM-Research/Baichuan-7B baichuan-inc/
    ln -snf /data/vllm/vLLM_ut_hf_models/gpt2 openai-community/
    ln -snf /data/AE/llm/models/llava-1.5-7b-hf llava-hf/
    # Link every local UT model into the working directory as well.
    # (An unused LOCAL_MODEL=`ls ...` assignment was removed here; it
    # was never read inside this function.)
    ln -snf /data/vllm/vLLM_ut_hf_models/* .
    # create huggingface cache dir if not exists
    mkdir -p ~/.cache/huggingface/datasets
    ln -snf /data/vllm/vLLM_ut_hf_models/gsm8k/ ~/.cache/huggingface/datasets/gsm8k
}

#######################################
# Remove everything LINK_MODELS created: the org directories, the
# per-model links in $PWD, and the HF datasets cache.
#######################################
UNLINK_MODELS() {
    local local_models
    local_models=$(ls /data/vllm/vLLM_ut_hf_models/)
    # shellcheck disable=SC2086 -- word-splitting ${local_models} into
    # individual model names is intentional here.
    rm -rf meta-llama openai-community Qwen NousResearch mistralai THUDM baichuan-inc llava-hf ${local_models}
    rm -rf ~/.cache/huggingface/datasets
}

### async_engine ###
ASYNC_ENGINE_CASES=(
    async_engine/test_api_server.py
    async_engine/test_async_llm_engine.py
    async_engine/test_openapi_server.py
    async_engine/test_request_tracker.py
)

### basic_correctness ###
# Skip cases
# - test_chunked_prefill.py::test_models_with_fp8_kv_cache
# - test_basic_correctness::test_model_with_failure
# before test: export VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
# after test: unset VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT
BASIC_CORRECTNESS_CASES=(
    basic_correctness/test_basic_correctness.py::test_vllm_gc_ed
    basic_correctness/test_basic_correctness.py::test_models
    basic_correctness/test_basic_correctness.py::test_models_distributed
    basic_correctness/test_chunked_prefill.py::test_models_distributed
    basic_correctness/test_chunked_prefill.py::test_models
    basic_correctness/test_chunked_prefill.py::test_with_prefix_caching
    basic_correctness/test_cpu_offload.py
    basic_correctness/test_preemption.py::test_chunked_prefill_recompute
    basic_correctness/test_preemption.py::test_preemption
    basic_correctness/test_preemption.py::test_preemption_infeasible
)

### benchmark ###
# before test: export VLLM_LATENCY_DEBUG=1
# after test: unset VLLM_LATENCY_DEBUG
BENCHMARK_CASES=(
    benchmark/test_benchmark_latency.py
)

### compile ###
# FIXME: Pytorch 2.4 not support torch.compile, skip vllm compile cases.
# Add this back when upgrade pytorch to 2.5.
# COMPILE_CASES=(
#     compile/test_full_graph.py
#     compile/test_wrapper.py
# )

### core ###
CORE_CASES=(
    core/test_chunked_prefill_scheduler.py
    core/test_num_computed_tokens_update.py
    core/test_scheduler_encoder_decoder.py
    core/test_scheduler.py
    core/test_serialization.py
    core/block/test_block_manager.py
    core/block/test_block_table.py
    core/block/test_common.py
    core/block/test_cpu_gpu_block_allocator.py
    core/block/test_naive_block.py
    core/block/test_prefix_caching_block.py
    core/block/e2e/test_correctness.py::test_block_manager_with_preemption
    core/block/e2e/test_correctness.py::test_lookahead_greedy_equality_with_preemption
    core/block/e2e/test_correctness.py::test_chunked_prefill_block_manager
    core/block/e2e/test_correctness.py::test_block_manager_prefix_caching_enabled_with_preemption
    core/block/e2e/test_correctness.py::test_auto_prefix_caching_with_preemption
    core/block/e2e/test_correctness.py::test_auto_prefix_caching_after_evition_start
    core/block/e2e/test_correctness_sliding_window.py
)

### distributed ###
# Skip cases
# - test_custom_all_reduce.py
# - test_distributed_oot.py
# - test_multi_node_assignment.py
# - test_same_node.py
DISTRIBUTED_CASES=(
    distributed/test_pipeline_parallel.py
    distributed/test_pipeline_partition.py
    distributed/test_pp_cudagraph.py
    distributed/test_shm_broadcast.py
    distributed/test_utils.py
)

# These cases need the links removed first, then restored:
# before test: UNLINK_MODELS
# after test: LINK_MODELS
DISTRIBUTED_NEED_PACK_CASES=(
    distributed/test_comm_ops.py
)

### engine ###
ENGINE_CASES=(
    engine/test_arg_utils.py
    engine/test_computed_prefix_blocks.py
    engine/test_custom_executor.py
    engine/test_detokenization.py
    engine/test_multiproc_workers.py
    engine/test_short_mm_context.py
    engine/test_skip_tokenizer_init.py
    engine/test_stop_reason.py
    engine/test_stop_strings.py
    engine/output_processor/test_multi_step.py
    engine/output_processor/test_stop_checker.py
)

### entrypoints ###
# Skip cases
# - entrypoints/llm/test_encode.py
# - entrypoints/llm/test_generate_multiple_loras.py
# - entrypoints/openai/test_accuracy.py
# - entrypoints/openai/test_audio.py
# - entrypoints/openai/test_chat.py
# - entrypoints/openai/test_completion.py
# - entrypoints/openai/test_embedding.py
# - entrypoints/openai/test_encoder_decoder.py
# - entrypoints/openai/test_metrics.py
# - entrypoints/openai/test_models.py
# - entrypoints/openai/test_oot_registration.py
# - entrypoints/openai/test_return_tokens_as_ids.py
# - entrypoints/openai/test_shutdown.py
# - entrypoints/openai/test_tokenization.py
# - entrypoints/openai/test_vision.py
# - entrypoints/openai/test_run_batch.py::test_embeddings
ENTRYPOINTS_CASES=(
    entrypoints/test_chat_utils.py
    entrypoints/llm/test_chat.py
    entrypoints/llm/test_generate.py
    entrypoints/llm/test_guided_generate.py
    entrypoints/llm/test_lazy_outlines.py
    entrypoints/llm/test_prompt_validation.py
    entrypoints/offline_mode/test_offline_mode.py
    entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
    entrypoints/openai/test_basic.py
    entrypoints/openai/test_chat_template.py
    entrypoints/openai/test_chunked_prompt.py
    entrypoints/openai/test_cli_args.py
    entrypoints/openai/test_prompt_validation.py
    entrypoints/openai/test_run_batch.py::test_empty_file
    entrypoints/openai/test_run_batch.py::test_completions
    entrypoints/openai/test_run_batch.py::test_completions_invalid_input
    entrypoints/openai/test_serving_chat.py
    entrypoints/openai/test_serving_engine.py
)

### kernels ###
# Skip cases
# - other op/layer test
KERNELS_CASES=(
    kernels/bt_torch_ops
    kernels/test_advance_step.py
    kernels/test_feed_forward.py
)

### lora ###
# triton kernel tests are in TRITON_CASE
# - lora/test_llama.py::test_llama_lora_warmup
# - lora/test_tokenizer_group.py
# NOTE: The following tests require 4 gpus, which can not run
# in ci environment. We should check these tests after we do
# some modifications for lora.
# lora/test_long_context.py::test_batched_rope_kernel
# lora/test_long_context.py::test_self_consistency
# lora/test_long_context.py::test_quality
# lora/test_long_context.py::test_max_len
LORA_CASES=(
    lora/test_layers.py
    lora/test_lora_checkpoints.py
    lora/test_lora_huggingface.py
    lora/test_lora_manager.py
    lora/test_utils.py
    lora/test_worker.py
    lora/test_baichuan.py
    lora/test_chatglm3.py
    # Quoted: "[1]" / "[2]" are pytest parametrize ids, not globs.
    "lora/test_llama.py::test_llama_lora[1]"
    "lora/test_llama.py::test_llama_lora[2]"
    lora/test_long_context.py::test_rotary_emb_replaced
)

### metrics ###
METRICS_CASES=(
    metrics/test_metrics.py
)

### model_executor ###
# Skip cases
# - weight_utils.py::test_download_weights_from_hf
MODEL_EXECUTOR_CASES=(
    model_executor/test_enabled_custom_ops.py
    model_executor/test_guided_processors.py
    model_executor/weight_utils.py::test_hf_transfer_auto_activation
)

### models ###
# Skip cases
# - test_oot_registration.py
# - part of cases in decoder_only
# - all cases in embedding
# - all cases in encoder_only
MODELS_CASES=(
    models/test_registry.py
    models/decoder_only/language/test_big_models.py
    models/decoder_only/language/test_models.py
)

### mq_llm_engine ###
MQ_LLM_ENGINE=(
    mq_llm_engine
)

### multi_step ###
MULTI_STEP_CASES=(
    multi_step/test_correctness_async_llm.py
    multi_step/test_correctness_llm.py
)

### multimodal ###
MULTIMODAL_CASES=(
    multimodal/test_inputs.py
    multimodal/test_mapper.py
    multimodal/test_processor_kwargs.py
)

### prefix_caching ###
# Skip cases
# - prefix_caching/test_disable_sliding_window.py
PREFIX_CACHING_CASES=(
    prefix_caching/test_prefix_caching.py
)

### prompt_adapter ###
# Skip all cases

### quantization ###
# Skip all cases

### sampler ###
SAMPLER_CASES=(
    samplers
)

### spec_decode ###
# Skip cases
# spec_decode/test_multi_step_worker.py
# spec_decode/test_scorer.py
# spec_decode/e2e/test_eagle_correctness.py
# spec_decode/e2e/test_integration.py
# spec_decode/e2e/test_integration_dist_tp2.py
# spec_decode/e2e/test_integration_dist_tp4.py
# spec_decode/e2e/test_logprobs.py
# spec_decode/e2e/test_medusa_correctness.py
# spec_decode/e2e/test_mlp_correctness.py
# spec_decode/e2e/test_multistep_correctness.py
# spec_decode/e2e/test_ngram_correctness.py
# spec_decode/e2e/test_seed.py
SPEC_DECODE_CASES=(
    spec_decode/e2e/test_compatibility.py
    spec_decode/test_batch_expansion.py
    spec_decode/test_dynamic_spec_decode.py
    spec_decode/test_metrics.py
    spec_decode/test_ngram_worker.py
    spec_decode/test_spec_decode_worker.py
    spec_decode/test_utils.py
)

### tensorizer_loader
TENSORIZER_LOADER_CASES=(
    tensorizer_loader
)

### tokenization ###
# Skip cases
# - test_get_eos.py
# - test_tokenizer.py
TOKENIZATION_CASES=(
    tokenization/test_cached_tokenizer.py
    tokenization/test_detokenize.py
    tokenization/test_tokenizer_group.py
)

### tool_use ###
# Skip all cases
TOOL_USE_CASES=(
    tool_use/test_chat_completion_request_validations.py
)

### tpu ###
# Skip all cases

### tracing ###
# Skip all cases

### weight_loading ###
WEIGHT_LOADING_CASES=(
    weight_loading/test_weight_loading.py
)

### worker ###
WORKER_CASES=(
    worker/test_encoder_decoder_model_runner.py
    worker/test_model_input.py
    worker/test_model_runner.py
    worker/test_swap.py
)

### . ###
# Skip cases
# - test_embedded_commit.py
# - test_scalartype.py
GLOBAL_CASES=(
    test_cache_block_hashing.py
    test_config.py
    test_inputs.py
    test_logger.py
    test_logits_processor.py
    test_regression.py
    test_sampling_params.py
    # NOTE(review): test_scalartype.py is listed under "Skip cases"
    # above yet is still included here — confirm which is intended.
    test_scalartype.py
    test_sequence.py
    test_sharded_state_loader.py::test_filter_subtensors
    test_utils.py
)

# Aggregate groups consumed by the CI pipeline stages. Expansions are
# quoted so case ids containing [] parametrize brackets survive intact.
ONLINE_CASES=(
    "${ASYNC_ENGINE_CASES[@]}"
    "${ENTRYPOINTS_CASES[@]}"
)
OFFLINE_CASES0=(
    # "${COMPILE_CASES[@]}"
    "${CORE_CASES[@]}"
    "${DISTRIBUTED_CASES[@]}"
    "${ENGINE_CASES[@]}"
    "${KERNELS_CASES[@]}"
)
OFFLINE_CASES1=(
    "${LORA_CASES[@]}"
    "${METRICS_CASES[@]}"
    "${MODEL_EXECUTOR_CASES[@]}"
    "${MODELS_CASES[@]}"
    "${MQ_LLM_ENGINE[@]}"
    "${MULTI_STEP_CASES[@]}"
    "${MULTIMODAL_CASES[@]}"
    "${PREFIX_CACHING_CASES[@]}"
)
OFFLINE_CASES2=(
    "${SAMPLER_CASES[@]}"
    "${SPEC_DECODE_CASES[@]}"
    "${TENSORIZER_LOADER_CASES[@]}"
    "${TOKENIZATION_CASES[@]}"
    "${TOOL_USE_CASES[@]}"
    "${WEIGHT_LOADING_CASES[@]}"
    "${WORKER_CASES[@]}"
    "${GLOBAL_CASES[@]}"
)

# examples/cambricon_custom_func cases
CAMBRICON_CUSTOM_FUNC_CASES=(
    expert_parallel/test_expert_parallel.py
    context_parallel/test_context_parallel.py
    context_parallel/test_context_parallel_kv8.py
)

pytest_cmd="pytest -s -v"

# ANSI colors for pass/fail banners.
RED='\033[0;31m'
GREEN='\033[0;32m'
NC='\033[0m'

# Test-time Python deps: reinstall datasets to a known-good state and
# make sure modelscope / pytest-asyncio are available.
pip uninstall datasets -y
pip install datasets
pip install modelscope
pip install pytest_asyncio

#######################################
# Run each pytest case sequentially, aborting on the first failure.
# Globals:   pytest_cmd, CI_WORK_DIR, RED, GREEN, NC (all read)
# Arguments: $1  - seconds to sleep before each case
#            $2… - pytest case paths / node ids
# Outputs:   progress banners on stdout; junit XML per case (NOTE:
#            each case overwrites the same ut_test.xml)
# Returns:   exits with pytest's status on the first failing case
#######################################
run_ut() {
    local interval=$1
    shift
    local -a case_list=("$@")
    local case_len=${#case_list[@]}
    local i ut ret_val
    echo "Total ${case_len} cases"
    for ((i = 0; i < case_len; i++)); do
        ut=${case_list[i]}
        echo "###############################"
        # i is 0-based; display 1-based progress.
        echo "Run $((i + 1))/${case_len}, ${ut} ..."
        echo "###############################"
        sleep "${interval}"
        # ${pytest_cmd} is intentionally unquoted so "pytest -s -v"
        # splits into command + flags; ${ut} is quoted so [] node ids
        # are not glob-expanded. eval is unnecessary here.
        # shellcheck disable=SC2086
        ${pytest_cmd} "${ut}" --junit-xml "${CI_WORK_DIR}/ut_test.xml"
        ret_val=$?
        if [ "${ret_val}" -ne 0 ]; then
            echo "###############################"
            echo -e "${RED}FAILED: ${ut} ... ${NC}"
            echo "###############################"
            exit "${ret_val}"
        else
            echo "###############################"
            echo -e "${GREEN}PASS: ${ut} ... ${NC}"
            echo "###############################"
        fi
    done
}