From a692c623576810a3e16eb7297f2aec67d9ed9104 Mon Sep 17 00:00:00 2001 From: wangjing Date: Thu, 7 Aug 2025 07:16:36 +0000 Subject: [PATCH] init --- vllm/_C.py | 266 ++++ vllm/__init__.py | 29 + vllm/__pycache__/_C.cpython-310.pyc | Bin 0 -> 9011 bytes vllm/__pycache__/__init__.cpython-310.pyc | Bin 0 -> 901 bytes vllm/__pycache__/_moe_C.cpython-310.pyc | Bin 0 -> 402 bytes vllm/__pycache__/block.cpython-310.pyc | Bin 0 -> 2749 bytes vllm/__pycache__/config.cpython-310.pyc | Bin 0 -> 21036 bytes vllm/__pycache__/logger.cpython-310.pyc | Bin 0 -> 1787 bytes vllm/__pycache__/outputs.cpython-310.pyc | Bin 0 -> 4871 bytes vllm/__pycache__/prefix.cpython-310.pyc | Bin 0 -> 3451 bytes .../sampling_params.cpython-310.pyc | Bin 0 -> 11436 bytes vllm/__pycache__/sequence.cpython-310.pyc | Bin 0 -> 18214 bytes vllm/__pycache__/test_utils.cpython-310.pyc | Bin 0 -> 1093 bytes vllm/__pycache__/utils.cpython-310.pyc | Bin 0 -> 9700 bytes vllm/_moe_C.py | 5 + vllm/block.py | 72 + vllm/config.py | 689 ++++++++++ vllm/core/__init__.py | 0 .../core/__pycache__/__init__.cpython-310.pyc | Bin 0 -> 154 bytes .../__pycache__/block_manager.cpython-310.pyc | Bin 0 -> 9409 bytes vllm/core/__pycache__/policy.cpython-310.pyc | Bin 0 -> 1779 bytes .../__pycache__/scheduler.cpython-310.pyc | Bin 0 -> 12463 bytes vllm/core/block_manager.py | 330 +++++ vllm/core/policy.py | 47 + vllm/core/scheduler.py | 498 +++++++ vllm/engine/__init__.py | 0 .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 156 bytes .../__pycache__/arg_utils.cpython-310.pyc | Bin 0 -> 10190 bytes .../async_llm_engine.cpython-310.pyc | Bin 0 -> 19415 bytes .../__pycache__/llm_engine.cpython-310.pyc | Bin 0 -> 28319 bytes .../__pycache__/metrics.cpython-310.pyc | Bin 0 -> 6661 bytes .../__pycache__/ray_utils.cpython-310.pyc | Bin 0 -> 4764 bytes vllm/engine/arg_utils.py | 341 +++++ vllm/engine/async_llm_engine.py | 689 ++++++++++ vllm/engine/llm_engine.py | 1209 ++++++++++++++++ vllm/engine/metrics.py | 225 +++ vllm/engine/ray_utils.py | 157 +++ vllm/entrypoints/__init__.py | 0 .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 161 bytes .../__pycache__/api_server.cpython-310.pyc | Bin 0 -> 3505 bytes .../__pycache__/llm.cpython-310.pyc | Bin 0 -> 8819 bytes vllm/entrypoints/api_server.py | 105 ++ vllm/entrypoints/llm.py | 220 +++ vllm/entrypoints/openai/__init__.py | 0 .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 168 bytes .../__pycache__/api_server.cpython-310.pyc | Bin 0 -> 7136 bytes .../__pycache__/protocol.cpython-310.pyc | Bin 0 -> 11959 bytes .../__pycache__/serving_chat.cpython-310.pyc | Bin 0 -> 7459 bytes .../serving_completion.cpython-310.pyc | Bin 0 -> 8424 bytes .../serving_engine.cpython-310.pyc | Bin 0 -> 5855 bytes vllm/entrypoints/openai/api_server.py | 251 ++++ vllm/entrypoints/openai/protocol.py | 323 +++++ vllm/entrypoints/openai/serving_chat.py | 307 +++++ vllm/entrypoints/openai/serving_completion.py | 361 +++++ vllm/entrypoints/openai/serving_engine.py | 172 +++ vllm/logger.py | 61 + vllm/lora/__init__.py | 0 .../lora/__pycache__/__init__.cpython-310.pyc | Bin 0 -> 154 bytes vllm/lora/__pycache__/layers.cpython-310.pyc | Bin 0 -> 24686 bytes vllm/lora/__pycache__/lora.cpython-310.pyc | Bin 0 -> 5211 bytes vllm/lora/__pycache__/models.cpython-310.pyc | Bin 0 -> 19005 bytes vllm/lora/__pycache__/punica.cpython-310.pyc | Bin 0 -> 4393 bytes vllm/lora/__pycache__/request.cpython-310.pyc | Bin 0 -> 1433 bytes vllm/lora/__pycache__/utils.cpython-310.pyc | Bin 0 -> 1493 bytes .../worker_manager.cpython-310.pyc | Bin 0 -> 8409 
bytes vllm/lora/layers.py | 979 +++++++++++++ vllm/lora/lora.py | 160 +++ vllm/lora/models.py | 620 +++++++++ vllm/lora/punica.py | 170 +++ vllm/lora/request.py | 32 + vllm/lora/utils.py | 39 + vllm/lora/worker_manager.py | 238 ++++ vllm/model_executor/__init__.py | 10 + .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 440 bytes .../guided_decoding.cpython-310.pyc | Bin 0 -> 2930 bytes .../guided_logits_processors.cpython-310.pyc | Bin 0 -> 4206 bytes .../input_metadata.cpython-310.pyc | Bin 0 -> 1852 bytes .../__pycache__/model_loader.cpython-310.pyc | Bin 0 -> 2892 bytes .../neuron_model_loader.cpython-310.pyc | Bin 0 -> 1941 bytes .../sampling_metadata.cpython-310.pyc | Bin 0 -> 6385 bytes .../__pycache__/utils.cpython-310.pyc | Bin 0 -> 1777 bytes .../__pycache__/weight_utils.cpython-310.pyc | Bin 0 -> 8864 bytes vllm/model_executor/guided_decoding.py | 99 ++ .../guided_logits_processors.py | 129 ++ vllm/model_executor/input_metadata.py | 54 + vllm/model_executor/layers/__init__.py | 0 .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 171 bytes .../__pycache__/activation.cpython-310.pyc | Bin 0 -> 7700 bytes .../__pycache__/attention.cpython-310.pyc | Bin 0 -> 8623 bytes .../__pycache__/layernorm.cpython-310.pyc | Bin 0 -> 6656 bytes .../layers/__pycache__/linear.cpython-310.pyc | Bin 0 -> 20727 bytes .../rejection_sampler.cpython-310.pyc | Bin 0 -> 12132 bytes .../rotary_embedding.cpython-310.pyc | Bin 0 -> 13686 bytes .../__pycache__/sampler.cpython-310.pyc | Bin 0 -> 13950 bytes .../vocab_parallel_embedding.cpython-310.pyc | Bin 0 -> 5061 bytes vllm/model_executor/layers/activation.py | 237 ++++ vllm/model_executor/layers/attention.py | 542 ++++++++ .../layers/fused_moe/__init__.py | 5 + .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 285 bytes .../__pycache__/fused_moe.cpython-310.pyc | Bin 0 -> 10707 bytes .../layers/fused_moe/fused_moe.py | 377 +++++ vllm/model_executor/layers/layernorm.py | 216 +++ vllm/model_executor/layers/linear.py | 754 ++++++++++ .../layers/quantization/__init__.py | 28 + .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 1105 bytes .../__pycache__/awq.cpython-310.pyc | Bin 0 -> 4836 bytes .../__pycache__/base_config.cpython-310.pyc | Bin 0 -> 2789 bytes .../__pycache__/gptq.cpython-310.pyc | Bin 0 -> 5816 bytes .../__pycache__/marlin.cpython-310.pyc | Bin 0 -> 5354 bytes .../__pycache__/smoothquant.cpython-310.pyc | Bin 0 -> 4733 bytes .../__pycache__/squeezellm.cpython-310.pyc | Bin 0 -> 4529 bytes .../model_executor/layers/quantization/awq.py | 170 +++ .../layers/quantization/base_config.py | 64 + .../layers/quantization/gptq.py | 218 +++ .../layers/quantization/marlin.py | 210 +++ .../layers/quantization/smoothquant.py | 111 ++ .../layers/quantization/squeezellm.py | 129 ++ .../layers/rejection_sampler.py | 392 ++++++ .../model_executor/layers/rotary_embedding.py | 562 ++++++++ vllm/model_executor/layers/sampler.py | 598 ++++++++ .../layers/triton_kernel/__init__.py | 0 .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 185 bytes .../prefix_prefill.cpython-310.pyc | Bin 0 -> 10306 bytes .../layers/triton_kernel/prefix_prefill.py | 745 ++++++++++ .../layers/vocab_parallel_embedding.py | 151 ++ vllm/model_executor/model_loader.py | 137 ++ vllm/model_executor/models/__init__.py | 107 ++ .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 3244 bytes .../__pycache__/baichuan.cpython-310.pyc | Bin 0 -> 10074 bytes .../models/__pycache__/bloom.cpython-310.pyc | Bin 0 -> 8148 bytes .../__pycache__/chatglm.cpython-310.pyc | Bin 0 -> 9522 bytes 
.../models/__pycache__/cpm.cpython-310.pyc | Bin 0 -> 8929 bytes .../models/__pycache__/decilm.cpython-310.pyc | Bin 0 -> 3318 bytes .../__pycache__/deepseek.cpython-310.pyc | Bin 0 -> 11230 bytes .../models/__pycache__/falcon.cpython-310.pyc | Bin 0 -> 10180 bytes .../models/__pycache__/gemma.cpython-310.pyc | Bin 0 -> 8525 bytes .../models/__pycache__/gpt2.cpython-310.pyc | Bin 0 -> 7257 bytes .../__pycache__/gpt_bigcode.cpython-310.pyc | Bin 0 -> 7284 bytes .../models/__pycache__/gpt_j.cpython-310.pyc | Bin 0 -> 7529 bytes .../__pycache__/gpt_neox.cpython-310.pyc | Bin 0 -> 7613 bytes .../__pycache__/internlm2.cpython-310.pyc | Bin 0 -> 8707 bytes .../models/__pycache__/llama.cpython-310.pyc | Bin 0 -> 9430 bytes .../__pycache__/llama_smooth.cpython-310.pyc | Bin 0 -> 9875 bytes .../__pycache__/mixtral.cpython-310.pyc | Bin 0 -> 11106 bytes .../__pycache__/mixtral_quant.cpython-310.pyc | Bin 0 -> 10419 bytes .../models/__pycache__/mpt.cpython-310.pyc | Bin 0 -> 8122 bytes .../models/__pycache__/olmo.cpython-310.pyc | Bin 0 -> 8949 bytes .../models/__pycache__/opt.cpython-310.pyc | Bin 0 -> 8548 bytes .../models/__pycache__/phi.cpython-310.pyc | Bin 0 -> 7414 bytes .../models/__pycache__/qwen.cpython-310.pyc | Bin 0 -> 7879 bytes .../models/__pycache__/qwen2.cpython-310.pyc | Bin 0 -> 8565 bytes .../__pycache__/stablelm.cpython-310.pyc | Bin 0 -> 8390 bytes .../__pycache__/starcoder2.cpython-310.pyc | Bin 0 -> 8064 bytes vllm/model_executor/models/baichuan.py | 386 ++++++ vllm/model_executor/models/bloom.py | 330 +++++ vllm/model_executor/models/chatglm.py | 396 ++++++ vllm/model_executor/models/cpm.py | 368 +++++ vllm/model_executor/models/decilm.py | 127 ++ vllm/model_executor/models/deepseek.py | 444 ++++++ vllm/model_executor/models/falcon.py | 447 ++++++ vllm/model_executor/models/gemma.py | 346 +++++ vllm/model_executor/models/gpt2.py | 273 ++++ vllm/model_executor/models/gpt_bigcode.py | 279 ++++ vllm/model_executor/models/gpt_j.py | 284 ++++ vllm/model_executor/models/gpt_neox.py | 294 ++++ vllm/model_executor/models/internlm2.py | 325 +++++ vllm/model_executor/models/llama.py | 391 ++++++ vllm/model_executor/models/llama_smooth.py | 409 ++++++ vllm/model_executor/models/mixtral.py | 454 ++++++ vllm/model_executor/models/mixtral_quant.py | 412 ++++++ vllm/model_executor/models/mpt.py | 298 ++++ vllm/model_executor/models/olmo.py | 380 +++++ vllm/model_executor/models/opt.py | 354 +++++ vllm/model_executor/models/phi.py | 305 ++++ vllm/model_executor/models/qwen.py | 288 ++++ vllm/model_executor/models/qwen2.py | 340 +++++ vllm/model_executor/models/stablelm.py | 303 ++++ vllm/model_executor/models/starcoder2.py | 310 +++++ vllm/model_executor/neuron_model_loader.py | 66 + .../model_executor/parallel_utils/__init__.py | 0 .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 179 bytes .../communication_op.cpython-310.pyc | Bin 0 -> 5435 bytes .../__pycache__/cupy_utils.cpython-310.pyc | Bin 0 -> 4048 bytes .../custom_all_reduce.cpython-310.pyc | Bin 0 -> 4980 bytes .../parallel_state.cpython-310.pyc | Bin 0 -> 7106 bytes .../__pycache__/utils.cpython-310.pyc | Bin 0 -> 1635 bytes .../parallel_utils/communication_op.py | 213 +++ .../parallel_utils/cupy_utils.py | 130 ++ .../parallel_utils/custom_all_reduce.py | 247 ++++ .../parallel_utils/parallel_state.py | 245 ++++ vllm/model_executor/parallel_utils/utils.py | 48 + vllm/model_executor/sampling_metadata.py | 239 ++++ vllm/model_executor/utils.py | 52 + vllm/model_executor/weight_utils.py | 300 ++++ vllm/outputs.py | 141 ++ 
vllm/prefix.py | 87 ++ vllm/sampling_params.py | 279 ++++ vllm/sequence.py | 497 +++++++ vllm/test_utils.py | 41 + vllm/transformers_utils/__init__.py | 0 .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 168 bytes .../__pycache__/config.cpython-310.pyc | Bin 0 -> 1408 bytes .../__pycache__/tokenizer.cpython-310.pyc | Bin 0 -> 5729 bytes vllm/transformers_utils/config.py | 52 + vllm/transformers_utils/configs/__init__.py | 16 + .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 600 bytes .../__pycache__/chatglm.cpython-310.pyc | Bin 0 -> 1753 bytes .../configs/__pycache__/cpm.cpython-310.pyc | Bin 0 -> 2716 bytes .../__pycache__/falcon.cpython-310.pyc | Bin 0 -> 1870 bytes .../configs/__pycache__/mpt.cpython-310.pyc | Bin 0 -> 10529 bytes .../__pycache__/starcoder2.cpython-310.pyc | Bin 0 -> 5931 bytes vllm/transformers_utils/configs/chatglm.py | 68 + vllm/transformers_utils/configs/cpm.py | 113 ++ vllm/transformers_utils/configs/falcon.py | 87 ++ vllm/transformers_utils/configs/mpt.py | 232 ++++ vllm/transformers_utils/configs/starcoder2.py | 127 ++ vllm/transformers_utils/tokenizer.py | 245 ++++ .../transformers_utils/tokenizers/__init__.py | 5 + .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 288 bytes .../__pycache__/baichuan.cpython-310.pyc | Bin 0 -> 7886 bytes .../transformers_utils/tokenizers/baichuan.py | 263 ++++ vllm/utils.py | 311 +++++ vllm/worker/__init__.py | 0 .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 156 bytes .../__pycache__/cache_engine.cpython-310.pyc | Bin 0 -> 5608 bytes .../__pycache__/model_runner.cpython-310.pyc | Bin 0 -> 17928 bytes .../__pycache__/neuron_worker.cpython-310.pyc | Bin 0 -> 5424 bytes .../worker/__pycache__/worker.cpython-310.pyc | Bin 0 -> 8054 bytes vllm/worker/cache_engine.py | 215 +++ vllm/worker/model_runner.py | 1223 +++++++++++++++++ vllm/worker/neuron_worker.py | 191 +++ vllm/worker/worker.py | 354 +++++ 232 files changed, 29270 insertions(+) create mode 100644 vllm/_C.py create mode 100644 vllm/__init__.py create mode 100644 vllm/__pycache__/_C.cpython-310.pyc create mode 100644 vllm/__pycache__/__init__.cpython-310.pyc create mode 100644 vllm/__pycache__/_moe_C.cpython-310.pyc create mode 100644 vllm/__pycache__/block.cpython-310.pyc create mode 100644 vllm/__pycache__/config.cpython-310.pyc create mode 100644 vllm/__pycache__/logger.cpython-310.pyc create mode 100644 vllm/__pycache__/outputs.cpython-310.pyc create mode 100644 vllm/__pycache__/prefix.cpython-310.pyc create mode 100644 vllm/__pycache__/sampling_params.cpython-310.pyc create mode 100644 vllm/__pycache__/sequence.cpython-310.pyc create mode 100644 vllm/__pycache__/test_utils.cpython-310.pyc create mode 100644 vllm/__pycache__/utils.cpython-310.pyc create mode 100644 vllm/_moe_C.py create mode 100644 vllm/block.py create mode 100644 vllm/config.py create mode 100644 vllm/core/__init__.py create mode 100644 vllm/core/__pycache__/__init__.cpython-310.pyc create mode 100644 vllm/core/__pycache__/block_manager.cpython-310.pyc create mode 100644 vllm/core/__pycache__/policy.cpython-310.pyc create mode 100644 vllm/core/__pycache__/scheduler.cpython-310.pyc create mode 100644 vllm/core/block_manager.py create mode 100644 vllm/core/policy.py create mode 100644 vllm/core/scheduler.py create mode 100644 vllm/engine/__init__.py create mode 100644 vllm/engine/__pycache__/__init__.cpython-310.pyc create mode 100644 vllm/engine/__pycache__/arg_utils.cpython-310.pyc create mode 100644 vllm/engine/__pycache__/async_llm_engine.cpython-310.pyc create mode 100644 
vllm/engine/__pycache__/llm_engine.cpython-310.pyc create mode 100644 vllm/engine/__pycache__/metrics.cpython-310.pyc create mode 100644 vllm/engine/__pycache__/ray_utils.cpython-310.pyc create mode 100644 vllm/engine/arg_utils.py create mode 100644 vllm/engine/async_llm_engine.py create mode 100644 vllm/engine/llm_engine.py create mode 100644 vllm/engine/metrics.py create mode 100644 vllm/engine/ray_utils.py create mode 100644 vllm/entrypoints/__init__.py create mode 100644 vllm/entrypoints/__pycache__/__init__.cpython-310.pyc create mode 100644 vllm/entrypoints/__pycache__/api_server.cpython-310.pyc create mode 100644 vllm/entrypoints/__pycache__/llm.cpython-310.pyc create mode 100644 vllm/entrypoints/api_server.py create mode 100644 vllm/entrypoints/llm.py create mode 100644 vllm/entrypoints/openai/__init__.py create mode 100644 vllm/entrypoints/openai/__pycache__/__init__.cpython-310.pyc create mode 100644 vllm/entrypoints/openai/__pycache__/api_server.cpython-310.pyc create mode 100644 vllm/entrypoints/openai/__pycache__/protocol.cpython-310.pyc create mode 100644 vllm/entrypoints/openai/__pycache__/serving_chat.cpython-310.pyc create mode 100644 vllm/entrypoints/openai/__pycache__/serving_completion.cpython-310.pyc create mode 100644 vllm/entrypoints/openai/__pycache__/serving_engine.cpython-310.pyc create mode 100644 vllm/entrypoints/openai/api_server.py create mode 100644 vllm/entrypoints/openai/protocol.py create mode 100644 vllm/entrypoints/openai/serving_chat.py create mode 100644 vllm/entrypoints/openai/serving_completion.py create mode 100644 vllm/entrypoints/openai/serving_engine.py create mode 100644 vllm/logger.py create mode 100644 vllm/lora/__init__.py create mode 100644 vllm/lora/__pycache__/__init__.cpython-310.pyc create mode 100644 vllm/lora/__pycache__/layers.cpython-310.pyc create mode 100644 vllm/lora/__pycache__/lora.cpython-310.pyc create mode 100644 vllm/lora/__pycache__/models.cpython-310.pyc create mode 100644 vllm/lora/__pycache__/punica.cpython-310.pyc create mode 100644 vllm/lora/__pycache__/request.cpython-310.pyc create mode 100644 vllm/lora/__pycache__/utils.cpython-310.pyc create mode 100644 vllm/lora/__pycache__/worker_manager.cpython-310.pyc create mode 100644 vllm/lora/layers.py create mode 100644 vllm/lora/lora.py create mode 100644 vllm/lora/models.py create mode 100644 vllm/lora/punica.py create mode 100644 vllm/lora/request.py create mode 100644 vllm/lora/utils.py create mode 100644 vllm/lora/worker_manager.py create mode 100644 vllm/model_executor/__init__.py create mode 100644 vllm/model_executor/__pycache__/__init__.cpython-310.pyc create mode 100644 vllm/model_executor/__pycache__/guided_decoding.cpython-310.pyc create mode 100644 vllm/model_executor/__pycache__/guided_logits_processors.cpython-310.pyc create mode 100644 vllm/model_executor/__pycache__/input_metadata.cpython-310.pyc create mode 100644 vllm/model_executor/__pycache__/model_loader.cpython-310.pyc create mode 100644 vllm/model_executor/__pycache__/neuron_model_loader.cpython-310.pyc create mode 100644 vllm/model_executor/__pycache__/sampling_metadata.cpython-310.pyc create mode 100644 vllm/model_executor/__pycache__/utils.cpython-310.pyc create mode 100644 vllm/model_executor/__pycache__/weight_utils.cpython-310.pyc create mode 100644 vllm/model_executor/guided_decoding.py create mode 100644 vllm/model_executor/guided_logits_processors.py create mode 100644 vllm/model_executor/input_metadata.py create mode 100644 vllm/model_executor/layers/__init__.py create mode 100644 
vllm/model_executor/layers/__pycache__/__init__.cpython-310.pyc create mode 100644 vllm/model_executor/layers/__pycache__/activation.cpython-310.pyc create mode 100644 vllm/model_executor/layers/__pycache__/attention.cpython-310.pyc create mode 100644 vllm/model_executor/layers/__pycache__/layernorm.cpython-310.pyc create mode 100644 vllm/model_executor/layers/__pycache__/linear.cpython-310.pyc create mode 100644 vllm/model_executor/layers/__pycache__/rejection_sampler.cpython-310.pyc create mode 100644 vllm/model_executor/layers/__pycache__/rotary_embedding.cpython-310.pyc create mode 100644 vllm/model_executor/layers/__pycache__/sampler.cpython-310.pyc create mode 100644 vllm/model_executor/layers/__pycache__/vocab_parallel_embedding.cpython-310.pyc create mode 100644 vllm/model_executor/layers/activation.py create mode 100644 vllm/model_executor/layers/attention.py create mode 100644 vllm/model_executor/layers/fused_moe/__init__.py create mode 100644 vllm/model_executor/layers/fused_moe/__pycache__/__init__.cpython-310.pyc create mode 100644 vllm/model_executor/layers/fused_moe/__pycache__/fused_moe.cpython-310.pyc create mode 100644 vllm/model_executor/layers/fused_moe/fused_moe.py create mode 100644 vllm/model_executor/layers/layernorm.py create mode 100644 vllm/model_executor/layers/linear.py create mode 100644 vllm/model_executor/layers/quantization/__init__.py create mode 100644 vllm/model_executor/layers/quantization/__pycache__/__init__.cpython-310.pyc create mode 100644 vllm/model_executor/layers/quantization/__pycache__/awq.cpython-310.pyc create mode 100644 vllm/model_executor/layers/quantization/__pycache__/base_config.cpython-310.pyc create mode 100644 vllm/model_executor/layers/quantization/__pycache__/gptq.cpython-310.pyc create mode 100644 vllm/model_executor/layers/quantization/__pycache__/marlin.cpython-310.pyc create mode 100644 vllm/model_executor/layers/quantization/__pycache__/smoothquant.cpython-310.pyc create mode 100644 vllm/model_executor/layers/quantization/__pycache__/squeezellm.cpython-310.pyc create mode 100644 vllm/model_executor/layers/quantization/awq.py create mode 100644 vllm/model_executor/layers/quantization/base_config.py create mode 100644 vllm/model_executor/layers/quantization/gptq.py create mode 100644 vllm/model_executor/layers/quantization/marlin.py create mode 100644 vllm/model_executor/layers/quantization/smoothquant.py create mode 100644 vllm/model_executor/layers/quantization/squeezellm.py create mode 100644 vllm/model_executor/layers/rejection_sampler.py create mode 100644 vllm/model_executor/layers/rotary_embedding.py create mode 100644 vllm/model_executor/layers/sampler.py create mode 100644 vllm/model_executor/layers/triton_kernel/__init__.py create mode 100644 vllm/model_executor/layers/triton_kernel/__pycache__/__init__.cpython-310.pyc create mode 100644 vllm/model_executor/layers/triton_kernel/__pycache__/prefix_prefill.cpython-310.pyc create mode 100644 vllm/model_executor/layers/triton_kernel/prefix_prefill.py create mode 100644 vllm/model_executor/layers/vocab_parallel_embedding.py create mode 100644 vllm/model_executor/model_loader.py create mode 100644 vllm/model_executor/models/__init__.py create mode 100644 vllm/model_executor/models/__pycache__/__init__.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/baichuan.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/bloom.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/chatglm.cpython-310.pyc create mode 
100644 vllm/model_executor/models/__pycache__/cpm.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/decilm.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/deepseek.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/falcon.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/gemma.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/gpt2.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/gpt_bigcode.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/gpt_j.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/gpt_neox.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/internlm2.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/llama.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/llama_smooth.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/mixtral.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/mixtral_quant.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/mpt.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/olmo.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/opt.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/phi.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/qwen.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/qwen2.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/stablelm.cpython-310.pyc create mode 100644 vllm/model_executor/models/__pycache__/starcoder2.cpython-310.pyc create mode 100644 vllm/model_executor/models/baichuan.py create mode 100644 vllm/model_executor/models/bloom.py create mode 100644 vllm/model_executor/models/chatglm.py create mode 100644 vllm/model_executor/models/cpm.py create mode 100644 vllm/model_executor/models/decilm.py create mode 100644 vllm/model_executor/models/deepseek.py create mode 100644 vllm/model_executor/models/falcon.py create mode 100644 vllm/model_executor/models/gemma.py create mode 100644 vllm/model_executor/models/gpt2.py create mode 100644 vllm/model_executor/models/gpt_bigcode.py create mode 100644 vllm/model_executor/models/gpt_j.py create mode 100644 vllm/model_executor/models/gpt_neox.py create mode 100644 vllm/model_executor/models/internlm2.py create mode 100644 vllm/model_executor/models/llama.py create mode 100644 vllm/model_executor/models/llama_smooth.py create mode 100644 vllm/model_executor/models/mixtral.py create mode 100644 vllm/model_executor/models/mixtral_quant.py create mode 100644 vllm/model_executor/models/mpt.py create mode 100644 vllm/model_executor/models/olmo.py create mode 100644 vllm/model_executor/models/opt.py create mode 100644 vllm/model_executor/models/phi.py create mode 100644 vllm/model_executor/models/qwen.py create mode 100644 vllm/model_executor/models/qwen2.py create mode 100644 vllm/model_executor/models/stablelm.py create mode 100644 vllm/model_executor/models/starcoder2.py create mode 100644 vllm/model_executor/neuron_model_loader.py create mode 100644 vllm/model_executor/parallel_utils/__init__.py create mode 100644 vllm/model_executor/parallel_utils/__pycache__/__init__.cpython-310.pyc create mode 100644 vllm/model_executor/parallel_utils/__pycache__/communication_op.cpython-310.pyc create mode 100644 
vllm/model_executor/parallel_utils/__pycache__/cupy_utils.cpython-310.pyc create mode 100644 vllm/model_executor/parallel_utils/__pycache__/custom_all_reduce.cpython-310.pyc create mode 100644 vllm/model_executor/parallel_utils/__pycache__/parallel_state.cpython-310.pyc create mode 100644 vllm/model_executor/parallel_utils/__pycache__/utils.cpython-310.pyc create mode 100644 vllm/model_executor/parallel_utils/communication_op.py create mode 100644 vllm/model_executor/parallel_utils/cupy_utils.py create mode 100644 vllm/model_executor/parallel_utils/custom_all_reduce.py create mode 100644 vllm/model_executor/parallel_utils/parallel_state.py create mode 100644 vllm/model_executor/parallel_utils/utils.py create mode 100644 vllm/model_executor/sampling_metadata.py create mode 100644 vllm/model_executor/utils.py create mode 100644 vllm/model_executor/weight_utils.py create mode 100644 vllm/outputs.py create mode 100644 vllm/prefix.py create mode 100644 vllm/sampling_params.py create mode 100644 vllm/sequence.py create mode 100644 vllm/test_utils.py create mode 100644 vllm/transformers_utils/__init__.py create mode 100644 vllm/transformers_utils/__pycache__/__init__.cpython-310.pyc create mode 100644 vllm/transformers_utils/__pycache__/config.cpython-310.pyc create mode 100644 vllm/transformers_utils/__pycache__/tokenizer.cpython-310.pyc create mode 100644 vllm/transformers_utils/config.py create mode 100644 vllm/transformers_utils/configs/__init__.py create mode 100644 vllm/transformers_utils/configs/__pycache__/__init__.cpython-310.pyc create mode 100644 vllm/transformers_utils/configs/__pycache__/chatglm.cpython-310.pyc create mode 100644 vllm/transformers_utils/configs/__pycache__/cpm.cpython-310.pyc create mode 100644 vllm/transformers_utils/configs/__pycache__/falcon.cpython-310.pyc create mode 100644 vllm/transformers_utils/configs/__pycache__/mpt.cpython-310.pyc create mode 100644 vllm/transformers_utils/configs/__pycache__/starcoder2.cpython-310.pyc create mode 100644 vllm/transformers_utils/configs/chatglm.py create mode 100644 vllm/transformers_utils/configs/cpm.py create mode 100644 vllm/transformers_utils/configs/falcon.py create mode 100644 vllm/transformers_utils/configs/mpt.py create mode 100644 vllm/transformers_utils/configs/starcoder2.py create mode 100644 vllm/transformers_utils/tokenizer.py create mode 100644 vllm/transformers_utils/tokenizers/__init__.py create mode 100644 vllm/transformers_utils/tokenizers/__pycache__/__init__.cpython-310.pyc create mode 100644 vllm/transformers_utils/tokenizers/__pycache__/baichuan.cpython-310.pyc create mode 100644 vllm/transformers_utils/tokenizers/baichuan.py create mode 100644 vllm/utils.py create mode 100644 vllm/worker/__init__.py create mode 100644 vllm/worker/__pycache__/__init__.cpython-310.pyc create mode 100644 vllm/worker/__pycache__/cache_engine.cpython-310.pyc create mode 100644 vllm/worker/__pycache__/model_runner.cpython-310.pyc create mode 100644 vllm/worker/__pycache__/neuron_worker.cpython-310.pyc create mode 100644 vllm/worker/__pycache__/worker.cpython-310.pyc create mode 100644 vllm/worker/cache_engine.py create mode 100644 vllm/worker/model_runner.py create mode 100644 vllm/worker/neuron_worker.py create mode 100644 vllm/worker/worker.py

diff --git a/vllm/_C.py b/vllm/_C.py
new file mode 100644
index 0000000..0980701
--- /dev/null
+++ b/vllm/_C.py
@@ -0,0 +1,266 @@
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+
+import ixformer
+import ixformer.functions as ixf_F
+from ixformer._C import ReduceOp
+from ixformer._C import _distributed as cdist
+from ixformer._C._distributed import is_initialized, get_default_comm_group
+from ixformer.contrib.torch.extension import ixformer_torch as ixft
+from ixformer.contrib.torch.data_type_mapping import torch_to_ixformer_dtype
+
+
+class ops():
+    # activations
+    @staticmethod
+    def silu_and_mul(output, x):
+        ixf_F.silu_and_mul(x, output)
+
+    @staticmethod
+    def gelu_and_mul(output, x):
+        ixf_F.gelu_and_mul(x, output)
+
+    @staticmethod
+    def gelu_new(output, x):
+        return F.gelu(x, approximate="tanh")
+
+    @staticmethod
+    def gelu_fast(output, x):
+        return F.gelu(x, approximate="tanh")
+
+    # rms norm
+    @staticmethod
+    def rms_norm(output, x, weight, epsilon):
+        ixf_F.rms_norm(x, weight, output, epsilon)
+
+    @staticmethod
+    def fused_add_rms_norm(input, residual, weight, epsilon, scale):
+        ixf_F.fused_add_rms_norm(input, residual, weight, epsilon, scale)
+
+    # rotary embedding
+    @staticmethod
+    def rotary_embedding(positions, query, key, head_size,
+                         cos_sin_cache, is_neox_style):
+        ixf_F.vllm_rotary_embedding_neox(positions, query, key, head_size,
+                                         cos_sin_cache, is_neox_style)
+
+    # paged attention
+    @staticmethod
+    def paged_attention_v1(
+        output,
+        query,
+        key_cache,
+        value_cache,
+        head_mapping,
+        scale,
+        block_tables,
+        context_lens,
+        block_size,
+        max_context_len,
+        alibi_slopes=None,
+        kv_cache_dtype=None,
+    ):
+        return ixf_F.vllm_single_query_cached_kv_attention(
+            output,
+            query,
+            key_cache,
+            value_cache,
+            head_mapping,
+            scale,
+            block_tables,
+            context_lens,
+            block_size,
+            max_context_len,
+            alibi_slopes,
+        )
+
+    @staticmethod
+    def paged_attention_v2(
+        output,
+        exp_sums,
+        max_logits,
+        tmp_output,
+        query,
+        key_cache,
+        value_cache,
+        head_mapping,
+        scale,
+        block_tables,
+        context_lens,
+        block_size,
+        max_context_len,
+        alibi_slopes=None,
+        kv_cache_dtype=None,
+        use_sqrt_alibi=False,
+    ):
+        return ixf_F.vllm_single_query_cached_kv_attention_v2(
+            output,
+            256,
+            exp_sums,
+            max_logits,
+            tmp_output,
+            query,
+            key_cache,
+            value_cache,
+            head_mapping,
+            scale,
+            block_tables,
+            context_lens,
+            block_size,
+            max_context_len,
+            alibi_slopes,
+            use_sqrt_alibi,
+        )
+
+    # awq
+    @staticmethod
+    def awq_gemm(x, qweight, scales, qzeros, pack_factor):
+        return ixf_F.quantized_linear(x, qweight, scales, "awq", 32 // pack_factor, qzeros, None, group_size=128)
+
+    @staticmethod
+    def awq_dequantize(qweight, scales, qzeros, holder1, holder2, holder3):
+        raise NotImplementedError()
+
+    # gptq
+    @staticmethod
+    def gptq_shuffle(qweights, g_idx, weight_bits):
+        return ixf_F.vllm_gptq_shuffle(qweights, g_idx)
+
+    @staticmethod
+    def gptq_gemm(x, qweight, qzeros, scales, idx, status, weight_bits):
+        batch = x.shape[0]
+        if batch <= 8:
+            return ixf_F.quantized_linear(x, qweight, scales, "gptq", 4, qzeros, None, group_size=128)
+        o_dtype_str = "fp16" if x.dtype == torch.half else "bf16"
+        deq_w = ixf_F.quantized_weight_dequant(qweight, scales, "gptq", o_dtype_str, 4, qzeros, group_size=128)
+        return torch.matmul(x, deq_w)
+
+    # squeezellm
+    @staticmethod
+    def squeezellm_gemm(reshaped_x, qweight, out_f, lookup_table):
+        raise NotImplementedError()
+
+    # marlin
+    @staticmethod
+    def marlin_gemm(x_2d, qweight, scales, workspace, size_m, size_n, size_k):
+        raise NotImplementedError()
+
+    # moe
+    @staticmethod
+    def moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids,
+                             expert_ids, num_tokens_post_pad):
+        raise NotImplementedError()
+
+    # smoothquant
+    @staticmethod
+    def quant(output, input, scale):
+        ixf_F.vllm_smooth_quant(output, input, scale)
+        return output
+
+    @staticmethod
+    def dequant(output, x, scale, global_scale):
+        ixf_F.vllm_smooth_dequant(output, x, scale, global_scale)
+        return output
+
+    @staticmethod
+    def dequant_add_residual(output, x, residual, scale, global_scale):
+        if isinstance(x, torch.Tensor):
+            ixf_F.vllm_smooth_dequant_add_residual(output, x, residual, scale, global_scale)
+        return output
+
+    @staticmethod
+    def dequant_silu_and_mul_quant(output, x, gate_scale, up_scale, scale, temp=None):
+        ixf_F.vllm_smooth_dequant_silu_and_mul_quant(output, x, gate_scale, up_scale, scale, temp)
+
+    @staticmethod
+    def rms_norm_quant(output, input, weight, epsilon):
+        return ixf_F.vllm_smooth_rms_norm_quant(output, input, weight, epsilon)
+
+    @staticmethod
+    def fused_add_rms_norm_quant(output, input, residual, weight, epsilon):
+        ixf_F.vllm_smooth_fused_add_rms_norm_quant(output, input, residual, weight, epsilon)
+
+    @staticmethod
+    def dequant_fused_add_rms_norm_quant(output, input, residual, weight, epsilon, scale, global_scale):
+        ixf_F.vllm_smooth_dequant_fused_add_rms_norm_quant(output, input, residual, weight, epsilon, scale, global_scale)
+
+    @staticmethod
+    def dequant_rotary_embedding(positions, query, key, head_size,
+                                 cos_sin_cache, query_out, key_out, query_scale, key_scale, is_neox_style):
+        ixf_F.vllm_smooth_dequant_rotary_embedding_neox(positions, query, key, head_size,
+                                                        cos_sin_cache, query_out, key_out, query_scale, key_scale, is_neox_style)
+
+    @staticmethod
+    def linear_a8_w8_o32_(x, weight, output):
+        return ixf_F.linear_i8w8o32(x, weight, output)
+
+
+class cache_ops():
+
+    @staticmethod
+    def reshape_and_cache(key, value, key_cache, value_cache, slot_mapping):
+        ixf_F.vllm_cache_ops_reshape_and_cache(
+            key, value, key_cache, value_cache, slot_mapping
+        )
+
+    @staticmethod
+    def copy_blocks(key_caches, value_caches, block_mapping):
+        ixf_F.vllm_copy_cache(
+            key_caches, value_caches, block_mapping
+        )
+
+    @staticmethod
+    def swap_blocks(src_key_cache, dst_key_cache, src_to_dst):
+        ixf_F.vllm_swap_blocks(
+            src_key_cache, dst_key_cache, src_to_dst
+        )
+
+class custom_ar():
+
+    IS_INIT: bool = False
+
+    @staticmethod
+    def is_init():
+        return_status = custom_ar.IS_INIT
+        custom_ar.IS_INIT = True
+        return return_status
+
+    @staticmethod
+    def init_cumtom_ar():
+        if not is_initialized(get_default_comm_group()):
+            group = ixft.create_ixformer_group_from_pg()
+            ixformer.cuda.set_device(torch.cuda.current_device())
+            cdist.update_default_comm_group(group)
+            cdist.ipc.init_communicator_by_nccl()
+
+    @staticmethod
+    def all_reduce_reg(ptr, tensor, out=None):
+        raise NotImplementedError()
+
+    @staticmethod
+    def all_reduce_unreg(ptr, tensor, buffer, out=None):
+        dtype = tensor.dtype
+        if torch.is_tensor(tensor):
+            dtype = torch_to_ixformer_dtype(dtype)
+
+        if out is None:
+            out = tensor
+        cdist.ipc.allreduce(
+            tensor.data_ptr(), out.data_ptr(), dtype, tensor.numel(), ReduceOp.SUM
+        )
+        return out
+
+    @staticmethod
+    def dispose():
+        ixformer.distributed.destroy_process_group()
+
+    @staticmethod
+    def should_custom_ar(tensor: torch.Tensor, max_size, world_size, full_nvlink):
+        return cdist.ipc.should_custom_ar(tensor.numel(), tensor.element_size(), max_size, world_size)
+
+class cuda_utils():
+    @staticmethod
+    def get_max_shared_memory_per_block_device_attribute(gpu):
+        return 100000000
diff --git a/vllm/__init__.py b/vllm/__init__.py
new file mode 100644
index 0000000..96f1f26
--- /dev/null
+++ b/vllm/__init__.py
@@ -0,0 +1,29 @@
+"""vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
+import os
+
+# By default, to avoid memory fragmentation, disable UMD mempool
+if os.getenv("UMD_ENABLEMEMPOOL") is None:
+    os.environ["UMD_ENABLEMEMPOOL"] = "0"
+os.environ["NCCL_FORCESYNC_DISABLE"] = "1"
+
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.llm_engine import LLMEngine
+from vllm.engine.ray_utils import initialize_cluster
+from vllm.entrypoints.llm import LLM
+from vllm.outputs import CompletionOutput, RequestOutput
+from vllm.sampling_params import SamplingParams
+
+__version__ = "0.3.3"
+
+__all__ = [
+    "LLM",
+    "SamplingParams",
+    "RequestOutput",
+    "CompletionOutput",
+    "LLMEngine",
+    "EngineArgs",
+    "AsyncLLMEngine",
+    "AsyncEngineArgs",
+    "initialize_cluster",
+]
diff --git a/vllm/__pycache__/_C.cpython-310.pyc b/vllm/__pycache__/_C.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..54c973055cad6afe9f38441a8f414fc3c11eeb1b
Binary files /dev/null and b/vllm/__pycache__/_C.cpython-310.pyc differ
zVH4?^5Pw?#MI3zskpKMTkcr@#PGFP){Vo~<{ZKnIWBuGf!AMtDjLQED{mi6-T~AXh z1O1HaFXyPZoR=?9$YW@tg+q_4hc&t<7>uHTy81mLiBcrH3c*UZq9_YdRC!a!+yfHR zP1qG2l_O}|MWoK2uL?Av@bu&Fz|d12O#K-^Qrbho%2R4UZ}<~aa=w(?b*}@~{5M8M zba*AQ?|KE9VF?uz^Ha0X-yEOS^t1^r!0fR)l8GU8JU1rgx}ak8B=y{Vt$js$u!-TR z=&I25qA0zb%}LeBgdvA*N2nBdpJEXu0rIMu^)U}74w7_}G>P>y4bh3RfkwIsXab`C zI?jiRWA-OY(Lvbrr5JHR{W+GhEibKSrd3!oWqgI|)l`77YpBRz665v#9^|m|9HH5C z9Y(#tTBtv0$^nvR^~_f}AzGZa{bbe>b1w}2WKqEHcW`;bcQitzenDY?Iwf#M;QIuA zK!Bu9*#wA&%ec*k^Iy>!QN&l%EQ13rav5a`jqq5&3+pS%DeztXXp@x?^=O1DRIU+~$ zto{NWNDfnH1D6cj$nTohVc1UjI*B}7Y9dy-y1LcrATjN@>HmaCj84a{joXUuZ&}Ce zm7zx%4Ug>VaAcI$V(OzY;xQW&MY_V$Z*S!Kw(|eMV(gmi+oLgbhXrPCr8MHM3uYPj zXNnA{iU91l=8R>VM*qWN6ppe1q!^#WlGJ|_r!He*Qh7hkVZKhH5>d`?X%!THDu#uM Uuca?BY!tNbHGZ?Pw(|D>0rt6JZvX%Q literal 0 HcmV?d00001 diff --git a/vllm/__pycache__/__init__.cpython-310.pyc b/vllm/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84f6784d1764e73444efec533a9ea1a040966d6e GIT binary patch literal 901 zcmY*WO>fgM7>?8LH0`?2(~xrQVacXm5SkF9>m;OpOshEH16D}f)LwAMeD2q8>|VNK*w#`Vr|$qyhA(8fsWx0 z?J^g-hIeU?^Y^{bcH!iBkbWcAu0A&8TV8J@ki;j=J4?C z>=sStH?OCY*<^OVSWHWBS%NEgHXo0t=x*^ao-E(Z$LRLWQYQ`V^58~n(`aJa-S9+| z&VN1xW#4S85!I?%>FftbBqfrBlssY-(LzYfRZ2~zsMdpV#&U`!$uR(kcr2}r9bjV!w&B#J>1pM)u6ANw=^^~G&R&Uv=t1`$`(R#79mvH znJAkHmN?y%Ed_*UDLma!%KXYQKjaB2B%z{ep!>n|CkfgRq$*HVVWz>e{gk~;eW<4c zc(~mS(1(L3yF$sgc}7wxd{u4qLDy_$#v4&-)M|GnVrLD?jW@=PvWw6LbD=C%VwLKU zQiOh5{8-f)`1?F41P^Eyg*2e#Ly&LfI!j0X)ny=v#20xOeF_sS0zEuHNZ%kre!eZ+ i*DQ+*ieCcNOq?sITW!#`BOG z0f7#>PVzvHz%GShV&#MfCJmRh6oHPK3Z!Y6ZNathfqXBH`MD%*@I=|VQbfrI;?}UR z=3;z)MuG%!>Z*EKH6oBlr8HsM3N{(L^=3Of*g5EGm9`Hlgg*8~w6dTDSYI#bfn3e4 s*TRqIt*Jp;6X@Q5S;S*@*S+c1*^%_;W^kn~Zt%K&=3!^UlprmNB{r; literal 0 HcmV?d00001 diff --git a/vllm/__pycache__/block.cpython-310.pyc b/vllm/__pycache__/block.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0fc582a0cfb99fe167ccc4b8228680665a755592 GIT binary patch literal 2749 zcma)8Pmdcl6t_LjB$-XJ-R*Yiwv-MnAcESYfGRk&s4eAhtAOZ=LoTD)v6IYZG862X zij9^FZTSj4AS5{U8}Ln#=E|vOZlzG(^CX+yCM)5{Z~W|M`}dxI_DfQ)I|RzP$A9!6 zmkIe3mBYmXBxVG-=N7hB}3>eOulS1Nrm?lmN#L7aNot zQ1wqBB&-J1MZHq48i(Bh1-KVQdWVeCq~&Fpw={ zl-JC;cyTt`kt%lzO@|Yi+e)UHiaP-kV^*-upneQhUjVUB_lZaL8AzJ4KHX;<1lpFs zelEy8@`QC11BeQ`Ulnz(qO(76L(ydq3ZE9Y89 zz4^>1L^WW6XII;qR&5BrAGM=!r#;?F2TA;X>)k7D9j5Z~*bjz&U+VV#C>phmTdnb4 zUh%vz4pYxN0gLDg2)ks{8guC6%)+s@X2t>HpA~E$gaEJ9KG}u)F=Y3Ms)E|2ss^7Y zRMp{Q&=z79)VvaEPmadv-dWHzuCsjWnc4AM<0iN@RC*fYOgl|xqHFe7YcT%u?kD>) z#j`FTcsfc_{TkT)g+OX_^5$X;mdyBnm`p%U!n%M8u@=ylp(=y3y)24eN2^mHCR>X* zrZ#E?#sh>qD@aKsmZ9XQ&`5!l4}gBZ(%;CCYUfMxBWSzq2Zm(bwScUwuz7_%V%H#m z97cOex^)_bm_0k(#3 z8`P$o@SB`p^zq1yg(#z=#!2KZp1d$;|A=iJz-1;p$De;4PxIiWy@QxAlqZ&=IbwS} z8sv{bLx)_Kr@_WR6Twas#MZQm7vQOR2BvJl;4S6o8txnUdJgn5;^ZJ26p&tAjN8JD zW&>|}?2n}9<&Ni#5|KsNu6f>fnIBC@%AO~Z0J^O(P8EhV=Q~LfsdWTr4t*0_7$bEW z#TE*@x`$31-y?;WTa`fI6JYLCg>cGj#@N5iV*fDiJa4QttcJB|t=sFCu?@iXNWll` z22}kah$RURQ(Or%Z?9r9F7Mi+B+HPns=|R3RS`CXdnI50diwZpW45_va@3V?X6ZxE z^9QC6f10~qP7t{*Om3b(z6$Ym0z8UY!8V4+5c5@N98L)P79;^k*d`$$NzMDn)kUzI zubNkt?0G?w#c7ilH|waup;((R<3U}8TD;%34L8bRle3FHAOC=35;BmHy$EoZ{0w}B zFNH6_-evIRzmy@IrOxEc;@WQ)!MQ$Jy;Q_+nnSNoYM0ythps=PO?WQm{@^>Xl=i?^ z1qbALH_I$?=ij=_RXroxT|w0&>eLig@wXJVR&unRH?U6HFTW|6VVGW z@E%l+Bsatb-p+{2p0kqk_j*mCRR^}i^>aJj8^d1>b2qvUX4zJjhLJ8dM*N22W>VKN i6x{ZABDrk;ymGC$p&#K3p%EO1bW~%U!HsFwHva?lzh035 literal 0 HcmV?d00001 diff --git a/vllm/__pycache__/config.cpython-310.pyc b/vllm/__pycache__/config.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..239477cd4a8fc0ef3289baec2ae27996d06e9ced GIT binary patch literal 21036 zcmbt+*^?VbnjcUooQ+1GY@Q-Z)QqUsP@qUkvX<6CsYz-MvB}|~v9U9YsYVsL(Zt29 zLQ@T+W3)?p_Hb;3*E_ot_EbwO3k8^zZj& z6;MEB$2 zXeBGjmQgWUsY)uzI-_Z}(v`I2Q_V~zgF3UBZRIMtur6Q8qb}Vnv}P(ZQkQ8KTeFo} z$!D9T)?8&y^10@GYoW5xTC6O#mMTlF<;rq0;m*2;9wsX*PQG&ZorF_xW*#M+nTJN@ z2uej~7NuE~j-pg@=1`hL=@?4$&H_pcC>=*>(OE)i38fQm8s%l@5Xy%h8W{U2H@|%{ z{G7r!KbEud_tTZ9ox{$NO`~$!IqJ?h$6UjG25n!d8%Gn)ami(z6V6kQ%*t1tljw2E zO}fuY+q3nIV>wTw<&2ZMomf5n30HPC736Mo{YIy4H-q%uHZnnOt!aDSd$w9l26@N! z?HbGLC}rPsm4`0K&fiq7uk1$Kb=ErVdSerHGmUn`uQofIn=YR%Y`T86rW?(mB&BYz z+HzYRwHss_UUjR{4f6PIyFG;$A7YpGPOVn0^&GppqhHnln3rn=HvacC3YYQmUIi++ ziAvH*R17CsN#%{@bBQ1t?OO8>uWCPQc8RxvS;vB{@7m+*S97S#xxns_88cS?2}Rti)={&Nx;vx2#TG9@^?{ZZ_JRb-NZ0VEG-Z=egzK!+z(U z+ivt-_2mN$yB<79m9Sj0?nHwMY-Q`T-Sa!Atp|-}lW!N$>bB=&pz!Thqdo+QbFz2r zM$_JCx@RrB?O3P1X6HdTfZcpx?|Q>=*fJU?HZj%n{Hg+~`!3FafW<0HS6QsGo^#8a z<+D~@bz0Wf^;%uSTCG|Sw%qm@Ku+gDyVU~SDgmd^-4pzH6JIA z>lQW1qa091d*C)Uw|wuc<+ydb*W|*mejI|Zsb<%J4j4Hc6k8qJ<`EDaybA1T**4xxcl;5ajOHzncl{7s4gz@pizh~i!SE$R#d?Nw zm>8}JmeDn8h1U}L7NdkZ|Lxyz|qUDc^Knq#kY{M{}{ zz!h*Ui&^P6j%=KsUGp0|LgBo!U}uPI9btuIdp#r@*KVGF1+R2#4YEKP_}a~j7vv~O zo4f{~^965Ti+3IykdRBW?mt>C4{LeJx~12F1$l>ZF-@meYiwYu{_d!|a(4t0eY~&} zwMM;B)AZO1c_$H_1lRh8k4Q#n zb*tA7U*bC0pExxQ>BiV#Ap|3MN%BAurve>Shw#e8hipd^%T9Z2_>f*&B$yzc8wXDj z^O%^{*g*f6bBtDYG_N5AK4@~o4$-sIc4Z8v_zvv{{}+IGY&0%pMA`T4O3Qv<*k-fZ zblaD#>-PJNRxBW(6&g3QQ+p&_>a{a9iIhE~O4{YJRD0Qsx5lu_v zFpW(+WG-=7fc>7=gP<}5V90`OUXL&yX*H9XlaBJVLMR+ zClZK+-Q3uX&B7avN7%4VZ6NOud$zE3u`!5VO7;4>T9B4)5Awq!7LGJzgTkCz9@kCNWC!93oc9^{?uZ>RPa+QuNcw=h`1I0;Ej zNt6DeL1H@rsPcotL&CP=A4bi}Ap4L|AMuY$sVJpmgT&+GA0?6re4hxvp9;TE^4m#3 zSIMp4sVu$s?z`8kcW+;btksZrr>Yq~!<%CU)9Y&;}FGJr}&T-SxIQel^<{cnQId)NA+XqyBO7qS}qi8 zb&-dpS^tuCBYaaAkDyJgpe#eBp!bN2@o_?(N4L*jIoI>lxn`$kH_tU28|NUdZgtuh z%jaJ_=QVuyg)Ss|kOuGEPP5rM2NDBoM7g^gIMNL1g#DBd+BnZ-c7;AQA`z+ zX41gF+!v;)dld{LnR!w&GN{cw$r-t1E?G?Wm#<^3LY=uBy`)CWi~e|8`AOm9{Rjz9 z!{g*fiM`||%vOJx+%q;4zsYlK>?G9@-*9j$wi7=|IH?a)50k%^^ivNL>cSxT8;Ko! 
z|9YaHbYR7Jlp7dV5)YGmDLj!GB+)-RNc|-7p()ScoNdFtv6*<7+B2Q}z{Imx2T7-Z z^L!Og%&?cH9de?I>#OO&q;?z_UB5CDQza_dP;CH7qyXPPbhmx44J}j@HOgnA*!8cJ zt?N{4!QsK~sokC7cxTR9XSzPq*qAJaqaq6d^Nb9AraZQE23E_&hlkJOY1_o&eJ{5M zz_A!f1%mTC#(fws1UV#GH3FmjVvwwVoUEKwom#63PG0qT-EK$ut^9SjUVz|MI@O7Hacm1!R_u}9NXsb+{MN!za}{8y@5K}>D%#g|oFSum&Pmhj`3*{GLjXpMVD){V~@@h(R zA}+al4GG4Ug%$oJ3nYmw&h7GMaE$Vg*rK&q9&N7ewWAIDGV zRg&_-rm-h<%1QK+YAISB@_wK7DbOg=p41>EJ=T2_x%5#`ETMho09IJgqL)auK$~7d zYxXF<9ndR|hNlwhb&+KfZO9aqAnv8qx(}*0FhT7g%?On&e3%wmrk_cwN79!&w%l3t|`VR8puQSQcR(m z^Q4+7`@#Ld-0b@ID+Legk=u8{e1c5NRxn6_l(sq@e@nQidIQU;E+eTd#o21e+ACUA zgjt+gj(U^*=3x`pBZY!#|Khi&XhcJGJhFqe$d@Q&g9BR6^>44j;OA=c19A|PjQcgPfiWVwHS{LP?FwTN~d zKp^|y-S-Gpw0f@z0}xF$Em)EqYuEL)5eKtCF|>2=6h*p4yVHIl1Qi`*mAvw5UJD>{ zz+UGbk$eRy4}^^8P1ClFc4q|H2exX%)D)7p%F2j(#TTJ+Y6|}u>o5nQPAa1PA60}X zFmbj?h<=KXcLxb2kOBx_2?TK7$WbzRk}-@gOprmgknoZ8nW@a@Ab=UxXaZ?I$&hG1 zF^qzcMice9C+442;mPIc6wO4a0qc_@-^;oAIJpjz zMp0SIrKwh<{c%#0w22v?L#Ka(k4JAH=aqqER(R4sIStN{Od8B}ZScsZ+l-{<(#^19 zO^!s3vPJYqA~z&-4ho#cHUzUla@zn2GX|*@kUkKEBYq0vnL*)h|jhO!j;@k z@1;Ss(qaD>(H~{>%nZ__{z5#m>!P9bpT5)a?Iz8s8$eQsP2it6UoI1beCro9Vi8U{Ugz@Xhm*9GBn zCx->&ms7C>NUK8_s{&)NtU;+ppi~KpoSk5S{q@VM`sKk&^p5fVc!RLDZvg(vQJ>3F zo$@mEF4}{`qxP}E)jC>70-;C^!SPUJqj`DsPE8&EE&6+>k>nui7jZ}^J!Bv~*;-VcZ%@!0N9Niuag&i^(*= zX9aK$z`6Y~x_KlDGXewtPuX1=2*sAUn6o?9qjh^p%P_L;NeHTMb z9_JOJz=)nju>T3VdMA5D?jBtti0e#n=AA7tXW^tfO@eW)>hM;sp#Bk(ukr*Gn`zAc|srGz9nq0m&2Gj!%Rcjd(uBKc_8B5_f3;DNX`VUc={|5^Ys)X z8kdq?SJ%-+xJMjnb%V9zQi%>=;d?bddx}7EK`D0ar+1@{tJ!t_U0eUyP~6w`k4;5p zr@qNVYF3ZXSVf0iwHg$v#IjzK`C^r_Vl!;XR$Qts#k~#E;>*@1 z!BnH|tBY)+#;m9wD$0?HqLsSIjLYizNOB2S z%wE+pXDE68%z!j%q>RrE)A-*;+W5;%#`w(08lM((#$O;b=YM4i#{bOB7=I4z{osd~vyyl7E@S(<=o<2x|C@{mr6u z86R&8$#^)BbI3^{)W<}?jt&FLAPfj$KM3Z@A#kVYrt`q-t7>K*VYJcXjX$9yfD!erP=?j8-LBnLfp!Faj)?iM>O88%Q80t40#xB@^B6k+BaHK;K5<0avvd+iAuarnUIA>V{PnT=MawIViU#dDBefWylxs@YJ zI`9Ii70+!ZmjS(^d7GRhlLGBpUK`=?baOTy9;tlGC_s!g9tmlsVTUGZ@Qu{^m~=v- zMCpW5okx3r0}p&J|6OBGlq`r@u)F;3$dtR96uJgF{8!L9NP&F^Y2n`!6N2T|Y1Fjk zc8&AmrXEZ5&&KBgU0OA29y|c`PRvj{d=NHckX#l)X49~=w!Eo=S}{J*ThZH=jsB5? 
z-Zr)fT)83FwNM~Cf>vp&68P?f(O)z~bA1!t@b|Mf`xj#8Hg>j$dSTP`D0snpdcOQ( z*?I@Y5kS6{wYlIXo-%5P>bAp zKUKa^@6VItTHH%;dm>c-6{*o085woR{(<)}kwLG=LPtVCjER>w>+<(s-|Vm4=^!kV z5gPG!YxD2@UH<)tH~Xhp5;^e$-rrD{Rw+=HYEl?mlH)o4Zffea0lCE7E`agnr&)a*~AO-Uqkh{V4P3kg2E^|9$J1F z+n$4{oEOoUmmAWREW}`5Y%n1tQ$d^;)y4h;@ZIU+E+U*l1=xwplOj;QH5NTS-6n`= z@S(UH^%z+uMRkt}z|*Xd@ew8srjyS<=%{-@E)dbGvUkS0RMH>>ei3~jN891W&uV~C8hY*>=H^)3Lzkpit0frN)k-~AF!g1)8HN59h zp9)vR#F{<@M+{a`SfWXG*hWlSC{{loYfG~YEr$2naD+Ev?U^_$6?O{u3{6`7=WkB> zLmFOdzgS7no)QqhfVC&%#!2Kl9Gm&9!9fXbvc!<&m5!B(WxYhE4+m@hK9NQKI zO_&3CfOS6^7-1}(CL(~#5DNs?23Tl8N%fGq873kY>@jzc*78%L-t*`YYAsKQaP$u{ zWT%kQIrGVUM(1)*a$oe1Pa2`8V%zrTqtJ^}m+ zGQz)?A!3ek@C;01Fm1skR`|4ZTo~y-o=vIgGCm$fr8v)a4BS>jJOvC|>Cm8sC`~6@ zFTj+Q2Y=3hPm^bx;MrMl>e*oKHi$i~i7GTweGK@9bz!VPXs()Sog$T_5T@CN1B*2- zO-X5&1K?*jXo`i3Z$82Zq@msruj zV{bd)CBl#XUn1Ve2B(^|>9#oy%#*>>F`Jj>E)Lo9)VXwR-4;#-C5{13s~NOjlwf80 z@(0DJo=c@2>QWVrs_n`(+FznS=2Q5r;p1IGf|w@8@#HkOYG-`J5<8*CKE|gagIMBm zddPo_^-qA`6yEUBlPYE< zFYbBzg7#rUjGSvyj>vtCF{z*l0jHMsB~<(oy8KH5asnBl;zfqo#GM*Ortn#5QL$$Y zpRvC*N|*7GV8tQzhCbz{99X^TQ1G1$v?@9zvQF+1RDIBG6V$p86t9RqL!-GExPpu} zafAn&bBO+!A0Frp6d0rkliLv=2>#gkh{C6co%fIX6q}Ys^hu!v<{`orgLP=5K)i~W z-eB!^AQVv`CBpkCvvpJhUzR|n7>yZZw5-{GY902iVJxeL7l_2@Rr33!`djRsg5D0^ zzm2>+UMV8VZFti*h)g-+f2BMd6mRtq5z`7IO@9kjuxnB?R?JV6OaB{pjrM8y@Nrd$0!*6m9aMt zFS=uhm(T%=F@dIhtsrNR1OJ4@!{61S{fJmfPAKMYqVpf|Wgx^r(`Cb0E+rLf_n{=Q zW zP^KR(fy?eQZmDjB27@`LY&e|$Fu0c1H#ZS81Tk=AhH!4-I|;vuOs_WLX<3` z<=$jHH9Hd!@MDN73mv@jy#VPK}X3;KG+CKnPUT$G`iXcz{N&5ee5<8`an zEV&AS;CBe4aDv}w?(Z=9T_ll}cd@IjiY_8x-xt8W%E@8f)E%E71;K2!iU<7g_Qx!i z!sCfC4cdT5@sNd7JimxGS_VgahrW;zqJm(=28947(IU0|frUK#Ja9bB9u*qwA+QSs zwm_pWb{zVujQkZ~sLCwO@x6oiB{q zgeWQSdE%K^&8Z&&4jmd$KqMi;`f&vqDgkRH=uD{caTd8X5uN=5R*Yy57cn>zHsKM z0_5*r(MqexSNf`VWE&4vbXi8ZE2*tYXAcsKqA<+YUbkL(`Q;1no6wyU?SfW(!RHJu z>ckiDBfvSifG6S$FgfL~aS_RQBK4ML>H`*U^4ZnJNq$0MCzz|^-WYsVdJPag!qcX; zS-}d~dby}22lB^kHJNooj2LmFzm0|zfhYSdAaVzRY_PHyB}%z; zXfdzYV-;f0mffreS+SF!e?|QOwPESvg&?;PuUG$&jTwE$1bemZjc5@4QQFGlBWRHU z0c7}|j5i&NK%)fSJI)6@>w`!&;z+=@SWL9-M=FOg2*g}aZlb{6$0yZyQtxjHM z6i{Ox{LxjbtTf@0)%joSl9hDVR<3?-4L<-w`*q)>=Aymkr5G(F>ZzYH5hD34bN>Jd zW_;>l<9SH;^f2DC1JnWei#_mBEDolHDR|OJa$|!agaOba3WOP!wE}DtwAGedG(% zP#jAK!BOKHBetYK6fxvfosd%n%YfWkbd0e`QSdXkDH&VJIMEdN)^YGQV~~D`6L-G= zoW*H`&)-n5;MqLob*8<18{((b{0eGjqz2s8DT=z2cV+>{Hd;z(A(Xja z5?3w8PpJ2C=94~7BZFCB6EVPAehbge$@!d@Q2qruZHpgfIbPIvb*caSJMaQg8P~L3 zY@fm#&v;}iv7b~Z2Z+t9ujVR;Dc(S$z%Qb7+Eo|PUD_U0nQ0HJJw&~t7xZb`s&Lat zXK;1CGOt-Lo6*FYrLe2~Rzp--iapOx*>H$;^&8mUe*X@Pv=N{hC(GL&tkJ`JL#Xue ziz}EMuH0zXa=ilwGGf*@;f=vhA>sH?*HgI9p^d&B)xgg_!6blp$7UWqF0aEaAs9p6 z;;%7qAlix}FeaXYxkOH9P^3mw4MPvt{qNmTjG3 z`x&`nH|&0fb0`PqE^hZ!(4-+3(R0e)^iSUPT=l}}wiH)aZv=l8LGEk--Z29SrxyMA z#YFedlCSrF#NfVY>aiV&OhpXvz)ejt4Z?hbE0?tJ$OClu-@>U9~!aADKW^WiI?-5_yhy*8e%Xl)_y`Ks7g6{R_Tl zBy|W2`5Nl(lM?<4GC70^=jIqa4=VQozai{H+((E9?t~h08OlsE0Dx^mE!h*HAGq-W zH9)>WJBG?aE?iS0{FaE1X${JKUYKQyP1`5?8qw`+I~}xqn8uFX&P2 z)QJN@XDhrmr*WOLw7~)Tq-r&tMH(ulHkVLDKFFWO6kQIZl zR~G$jFqWy#$|CHSGCb~d_P728_ej@#@&bh9;X!xz$=>s)J4xqYZ|_NW&(K?w`>l;! 
z9E-e2hays8Ya>-Xoh|u(mW#!~T~NykQIrw?1(b$DsdNB8Oq#9XW{Zcd$9G%%cUy-& zG7v@|_Rxp(fjnmKAxw1@h$csL#-?OS zr)=&3&g781qut0c)ulyKyKu$$Ns<+rP7*kGmB{C08-Vs@cJO8`W9xK_- z%U+td^X#|w=&gQP7Pq%<{m@pK7Mr8A_X_Y+?c+Qjg5F>t5PW-dx|dg1^NkaK4WX}C_V5vz%1 zRp}aHXfR=oKvK9Y@+4j^p%IAjptc|_rnZr#%t5vS#(9=#`QiXQ&3I zGD@XZ7McZV;C*XAFarvEtw{q~r&p;?Mgs)LOb^9aAK&3kX~jrb6I-;?jWC9f`ZKi>c`WcQFtPJhCYN3XCQ* zsu`k5iFP>|6B%kiReML(IZa($k9_SWFs&xEH)xVspk2$Y=(=uj`Ug9K|7Xo9!ClTY zNmj88Bc~ftxdN7UqcsRnmZeS>#bSIVbO#&P=EhVW?fm?F&#Wa}^iybpxV mz2wYAjT`^oqDSr{stQANnNRUsV=i;)x)U^A>O1bb2mJq!lcl@> literal 0 HcmV?d00001 diff --git a/vllm/__pycache__/outputs.cpython-310.pyc b/vllm/__pycache__/outputs.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..597cfd9ec5d4944bd885383667db9f8c9585b90b GIT binary patch literal 4871 zcmbVQ-EZ8+5$88~yic8EJ8GSl3Hsq{WhrUe6mWB{gBT48C`w@kXh5K4soRx~GVgfa z<=N4J_hbjnOP>1>1dxEf=r1b3*FNn!zoaPtL}u~Pfw=`g{udxbA!@7)R9_uvzNR9t#k#LUtH;K`^v!|gTLas- z2afLyDt<-GF=BU6^{cA#zCui5JyMAEQ1ffR*~9_P0j^H8U8Pg`7wn>RwA|Q^c$zyO zj?yR@hVf3v%o~4T$zYUjCkG>z?D4#@8xBS>o$>WudT&gJJ^DUN#v>Rm*xfWt$1wd4 z@;v>3rY!35j+$4ulRNJgZ+p1!;C?7zjNkt=)8851x0~0ly)7 z$VEd!Ke^$dK+#1}>0Ej=6x>V#F2N_IpQLit0Uc5nrj(Rh;?U+yewy5+!yqDD&K9iq z``f(S=LO4qd~;*KeNjy;SqYwi{xRJ8+ak3r8g*SH$H@&&Q+UZy?a4kAx>5Kx9Fk(S)9R@ibo2weC@oz&jzBP2?xnUUJ7^d^=LXXLn zr`pa=r-?ovh65S|xf=w7gp6bC*Mi{QIE;%CI|xY9gV%aAOmm&5jGQjzwA;E^+k>@d9*nJ9@Z%YfC6rq&P|g691Dum6M;&Nw;3d+zfO0mi zBFaHqgSHB2*CsVm2ehlx25HbHyi@iKiE}&J87>j2Y4JI z?GoNeVe`~4`7)(`Cj>##mRZyrQHCf4+R6Jf7ggjUxp4t>C2q_>Y{}9x6Ws9r*r$N} zOq}F{eiF~1>SE9t$l~dSS5#46$a&I>L@DqfO8Y|2(sql;AT|BJp~{`RHMiuEva&zl6@q<5cDVTG@zJB5j{K>?g3E z{S?VHBpoD}rsYllIrao-Rb*y=$?4}417?uh@Z*SrR<#5mYyk=`-;rg4yje2@$}12l z!2wpwJ+!xcP#?@Rn5FnvKt2ONI8>+FXUeg5qD*zc>umL}%DzfeqCIk^#?T--^pvTQ z8i3_c924_%eQIVV0fhflTUVymfikr->$dWz&eX}ALvvaYk^tSu3i}mNrj^t_#*xa2 zI(1Vw(~ql}bE3kVTIL+rPZXdHpq0#hXick`eca5dC#tI4y9Sb4(;Co~X&q>L+5l}= zLEC25gj=+Bymq1tp@^Th;Qghng{v~O>~dDi>RAKoj3#hb;P(o#K6e1p?42c~#3R9+ zDWE=6j!}SmsGZDkTK6v<&@|Yi;UM5NWW9dCdkLd?ljGM4B$HqivT(rtMofnXX+IcI z_`XSxPE~*jdJhmD0ZRH8bAIl;6NiI665jkX?rQ-t6bpcBvU%34`;Z@C%fg|X?}%!K z--M1_zGMr!d>3V6U52)tlwk|sMwiiwWprj+F!y}${Q};FCP$&UhnxW zUdi=d%%uz8MqQy21uLs)6#5>HMKY8~`(60J2!^6B)506DO4#Z*z(n4MiSn}`x1{># z^pqK`sKsHj&`Q(}Z=xg-LXESr@i7c=%odLBsE*oD9lfE}R0B8*TcZZ=EEs{>;(5)m zbW9_T+JaPK!MpMgswdTYGP(NS!f+|*Fc3su{T+1v_fiu|P76v-$g+nvw3RPc;GViW zll3!7Tw^ld0rqz%m5l;AyOY(jJ4fv9q&<5%uR7h0;-is^ao>kpBS4MYX5#%&?57hV49APS~;lWmjQor_El)Jx~n$ArcSC z>qzhdve%GcqGART%;fAxNbsI76r9yw4CoE!^b^7`udv@D`Of0=$1sX9%@Go3#pmP2 z;?pXMPpv3Et%~+obKS4BwKcW9!B9e{kz47}2rh%vRw()#81U=(akUBeMf?*-DsRGo z)D3UGu)tpt^YG|53qMPvgrT^_LVgk#p-7{$RKJOzS{y$hTZ*Gz)oNNpZ(sipVZQCT literal 0 HcmV?d00001 diff --git a/vllm/__pycache__/prefix.cpython-310.pyc b/vllm/__pycache__/prefix.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a436e53a38200b9d63350d926194ff54167fa718 GIT binary patch literal 3451 zcmd58O>YxNbZ7R%&iW&tq=W+9ihfxI8-&yYiV!K3ey9Z0E%=X9`$L@^v#~-TRq#i3GUIv>6QEvF}^mK$*e;Zf_6z3D3(9`zp@TFlmt7{W8g- z-R*EE;Sti#ehwgaV8~wpaKk6e@F^p{$tbtDy#~^7n_0|mJIrCFL(6w~iCMhdrmW27 z4y^+Ma=}T!T~>j87qALa!b+9ZV5K_Vud{i;YGZ7HEdo|&OMG76Z?R>z0xJv51})F! 
z&LhFw@$>LLF^qem3cWC8o>ZaY(hH@`q8O&kdlsv%7kYht*vr~5%X)p~sce_0(rahJ z>kSeWgHc=$J~=j&*AdyEFPpAj-F&$H)h%zk8v_Lo)j)6$ru=!Ii@3*A6(*W}FWmEX zxF>i&38N|Vfy#QJiX(j!@)!#fg;6(7J6@c6s>{9h0Ph_$+z=gkYf7Mn4Pquy242YS z0gCz{X>nn5dVU8T8pwFa3j&RMh)*c^a~=)QKD~_bq@c{_rhIbu=!)DC93)FOBMjKd zhjR+!E)01efHFqpfQ*cQj3_{=%rR~$dpxs8#$#(lS0U7QseJ!C<0l9wIO{N^C@}*Z zYiW4reE@^TQw zX{>?(Z;^EXuI&&PeoF-Zhx3nggf%DUBGh8?;e2W=R0RwHSv??I#w)VH|O}UgOf?!kFE3^K5=qpg?{w$OSO5I0p+2Do_b+SMeUsP_Y@d z^$u4-I_L%3!14<0YTi{cJU8Zj{KVM;pmK~bdZ@*?XB>dq;OAoms1}CQ7V<$+I&k<$ zd!LNRW8+u)go50-+#r6na5>1f!$pJUw0S+Z_1Z}t-1P*C9|pzkPw`%4vSbXe&d5J| zmu#-v3LG9eVJKy<2w7E%9W^q)+aERf;(vzQ!H8v_FeX3<|*Ytc~?7m>n6r+LRk z-bErbkQ6m@$Z&CnX|a1IH*pzAiclb89_Od=fw0Fk3xcb7!UHf9=*5nX-3SGkd^pFU zO9MvPh`caff@%Sz`$RdKTT#~E)6@`*I`T14PWo_OPOYqyyai{pBkN?iI#&0@k;WpX zYZ~@=5aezUK#3b9IIjl5lR=n_cS=FPvM30I1y|-~oGP(`vodrDsOxGkw|25D5$92= zH3S%-;t~Q(Me!km5`rrTjuJ@EFnL880J>Fr0Lq3%XvLxQUux3V)N=o!wFOc)m&`i+ zR!x06f`i9CWN-(D^Z}gg!1xwuL-&O@faTk)#2s$wo?OCz*wH=MFHd(|Zb8SM<1Xwv ztiq~5S78<;WG#0Kb3Ov4!)=Jl2Vsh((1RK??lQV__I07-|C82OB;dQWI^88VloIg{ z^op4-J5lVGx8Y&_W3N2Y9UmL0G&XSj-kmX06$Q5fuG~C3XD|<9%ooUN0Pfp`w@UiXUFz|O=QK`r;3iS zchjHgny%+XB{4+}&KEjj+i93ZVq*)>-IPBY*Q^tw=F5Q+gA|)wFnxl23p85v6$0h- z@~r4@AI>XDCPHY@3SLRa-YGU*gBodV9>HW}ya2PlB>O}`b2%XR77;ZpOsEs+XCBaf zQ&}T(MBDU$>{}xXR_&)mT!A&1jjVOF1N^T6Rf85UmWCsmaD8C`%=^L&y0l{TY}iM( z5Lm@>r*NfTDgJe(J!zT6kg*^W0oMEC*aIMy7$GjL5(`FEKSRL17iVnr7FnYz(3j$4 z1lXtqwnXt60*n-01qE8GPy?%$E=b}A&Q2?CpTovY81e!DUEFF^7dQ9yg|cf=Lf<%~ z{zl-r&4!yhYOfFPU4gx!sHGTY1y5w>yZGp8>clGQsx~8%pT<8guNRg5Hg-^nXD=F6 I3ib5be^90ko&W#< literal 0 HcmV?d00001 diff --git a/vllm/__pycache__/sampling_params.cpython-310.pyc b/vllm/__pycache__/sampling_params.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..55a03dbfd2be98a5cbae22c7eb9ce86ee05a353a GIT binary patch literal 11436 zcmcgyU2x;pbp}9?1SyI7U1@jiwc)j6FKsQgvg5>&t+hAyuDg?6dB@r~Ei*0!5myu_ z5TF5Y*W$1(GJWoArmuZ#p4!17x4A(2M`jp zBGnaBuIEgpu9~@e-c;)aGY_c?;p9;2oBJ4;dcMm*AuI>5Q7wjAqv2pijYe2*G&-)? 
zvk9MUG~VbLc6_Js@~f9Gzw~Ee;nhnUFTHX-oQQrKn`jNyuV21&{ri$P4~7gretxCr zdn>lvH0%}I+Ft4I2fMEGRBiRbif;wteAj6187<+j+_LS?iXW@C!K(FZ-F;63t4Vx) zvJ_2HW%_6`w;$5TDrjCH;%6|qQH_+8Qm&-pljlzXSguqAUc@h9#gfKY34b~BNwX|e zb3){WdP*`UMZuiHw}S69zBBmF;{F_d=VO`$Q8Z8BdJ(o%6eY4Ca|!n}Q4(cQIwesH z+D@Vkycs`*KBw`$FXnN7{Cg(;eE|0!#8>A&6Wr$^Gq-cWJZnCDH(#GLpE4gYzjRlv zPXT+>d<<9x*g3NbY#P|(<`ckX#H{&{nA=gzW%K-9^^RPhH@|Gw%oW^Qz#T|yUNBd2 zb;8WSTAmIoLuMJW_RhvRsGy<5On>um6fh-1aP=c4L;-;Rt>Z(rrY#ajIMRw z6Fb5aPE)L;Ec}(GCyYQ;Ys~DbqZ_7a5p$N$?{&Mb7wFr<=;*#Myyh<2cdh2GZuz?7 z26{{(OnrM_kHoA+^StD>{59rEvv$_>jb3M4c)GizyS<>>LtpVmkL2<7z|}nw^gJgD z7f?sbx+NSP!JyL(BH4(`w(x_7yYtz+c8x$cJg|rY4n*4->T#SO@w^h>^o70^^WREE zTW0+SZdc!OMw)qmNMvEA@)5t3NvG$N>huVMPgiJJcPeccp0M$ZnGQ%N!mLF$WO zIDB%e>j|G0qd}Hq2m5RK72Aa-A#|W~o7KlkIRfVD?g_`&w+$bUD#uuD^hM6Zw74a&j z)2HTuUH=QrKQ@8PIAjiECJ_0bZaC&WS;T6F-x1bv4=Pzrtt!I}(&>p>k5Ne=I!Fl& z1SXMjhoKr!NkLC+>RrBpF$h83`2QMzvkW+S+j^OwPWm!7&!z%yfP_a>R1j^ z9jiIC>|Luxt~Ju_m{~B3NS2yj7oMwkgaI!_@Db*IDpS%OF|E-(ep=05r-$7HVVI)u zwy|y5RZGn&1fA-W4xK9qKtCovBqUP|M>$$|6^K#ZE$D|qUELq>hwAprBIG* zL~|e+wnb*(;Y5YGj9o6)DagoFj!|*6QSYo0zDYX|>}o`WlmJ_fz2ER*7hRl1*7O%gy>-}3))XZ= ztbi+nDY1!6ix zXazd*(Cl{Ia}u-DcPzx{Z4n?OWo0?*2y{G~nemABGNQq{o5Do&M3w++jsi5WC(d3;8#Kn6PRB*I#lXW_k*X0t=+^iG z4TZr*1$mMTB{e4Ed6S|%r>B(fM7is23sH-Nr>JDa84i!&M`NZesI|etvSFEiDqNgK z-J5I~eWY?8FWVux;^DB!*s3z@C=f-;i4r?fh2IrTi?h^hl4LneyJre^F95HBcp0fA z6CYwSR`;k@$Y~?;tOO_!)by94#1+fs>TL3<4y~4hEwUk8=b&Cl*_MdM&Xie zpJs14y<|yY1=#`PneLE%)BMx!aE?^JNvDqSX0(P7;JWc9Zvmrw5NE`Yd)O5zGmbjh zb51%P+ig+g2e0%{gtn3Gz{;G8Mwsz3rkuz=H{5_C59M^2iABf5j(}CtF)lu0a_|i~Y!+C5WgGqL+48O?gmMPxd7PQf$TXWaN zQ6pL{pAJBRMAF%%4EwgR?+;Idpw4j1N|IzFoN)1`hGkgRak4;17_r_l_9?pbC|Suc zN+f?`bxRkKz7Q0nX+Z~FWHBPuG8PNx(8h4O|C?otx<->t8Txxxw-NhTVpk)EP4~xq za4215R3ofVC=x!7gTZYfoX=v4qdj;MHN{(^9uASwO&lmzke zcVO+i-bjqw={Zd@Loy`TClYRJXJ3yxBnQZZ5g8zoty`Ac^RZtHeTlM_Gzxhol(9B` z0>FPU^8WMh9^miFyEACL8Olyrj5pPAX8dRy&L)SmVNw##9X|eq6C?S5n5QV-jehpR zJm-m_O1V@xJ(ekjm2AcnREmN-SWW|ASWaV6s0~droF22$aQ+Az z3(rLBe~h(+vxk;4EJQnB^JJ`V`XQVuqc7s?>nJD=5WUn+(9GP3>K@RQ8 zKn2VV@__1~0GJ;X0Sf~SusA3IYJ)OhX)pm;9!vsG45k1l2Nm#?yr)|duBM#npfacg z({~i_`9Y;cqfR;bU}jLc!}wsDsjo5hEZ>>tJF|mX(9R9!0OtqufD3~Kz!QTLfQy4g zz@@ib1Rf%AmcYXV9wG200*?}S z3;>qlQ}S(~bQ9*VA&07s& z7yiU|;Zbdcsq#c+!nfKv6|T7_d&_2VPxim!$g+eUu*h~U<;Rq2OnHeZzY$Zy8{6s# zCH%6TXG+O?jVZTcO89C!pHkwgV3r}FgYn?O?LvCbBre66%f2}U$?n8lDs(5M#Fb`F z?@9i57@OHsyuSrMcyC)vshD3Wmj4~*_fIiDczC;%QsS!2QRxSak$!&weux0=a)xSx zseXv|33CQiKLiybM0+AbHOW*zM*AdF{THYZLE4ik75O{+{Ne_0RsGYBKCA>|3$%rS zOlfslZ{hF|%C-KJ@jjIh9a}C&YD1lz_5KtmxpBw!R%}!MGDKeJSI76fggdtNZ=Zj1 zS%32Nh+F^hL!05Tv>dtmk00JDM|)7MCt?2R$QGIqA%vWFM2m|{kq{gl`)7EIO~@c= zR5yxgIChfT^jC;*FCj!(=UiO(s4i21eO%4`GrV0V9Y_aUJBd~BaJr)=G>3RRJltv% zlEd@pKR&h#e+F%Jieq6|evOLPmp#w*?s+%J#`N#Y-Wt@BK3n8C=RJ+Kf5g}SbAYlu zhd{;82oF@bd~i}B{P1sCQHT!r=i~*ZPiYq7d+Be)Lp$i-Hy-jvHk3nQkDgs~*t-P; z&O>2OIWFu4xjofef}7##90x+x%m=xX5?u5wS_^S&9-%4cJ%JW2ZYh|hTe9~xv{3LF z4MwyfPsGmo>}Pu4YV^FNhb^`>T6(DIFT)YN=OBOo{81);;kb^Gr9YOn^kawNe4&5t zkhyc&;;>ZJ+|4Uto(nYnFK2Vk#1JWUUyhE~>?@Bk`D2kMREv=bu47_R8uAQbI)R4> zaNco|u(2p{u?7`oQdSNWxG?npY5&n<^gou~9a@9LcXjZMeOEoocZX>&o!0uR<6f4r z`0+;+iEmYv5pz6hmR|$&|HPMHff!mHtn0YhjU}arW~YD`TiBC8Xm(y9klT2>1$Zt{ z5p$57wej8vt_z$BEaJMji4?1?A*#q5)l!tD)5{qi>3+nGrs?uw0(faa^5~@M9WkqL z${)V$5#W^+I^g;h0Ln0tXY#io_wOHD!Az&j%xG50y3t7>S~nc4il*!oP58-OZhJ|( zBmad=GalebXR6aE_g)gBi$Z62Ul(suU5KLLH9qv;7@qlcce@SG!83>CG=B^Q zS6U-JDA{a_ra=!P;12s-t{K)XN2=V0w`~QU;q5QOpYY_(xqqvQ4*uJWQxZlv{) zO(FE*`w*y0?Dn?#H3)i(<~(+1k}`Uw#`1kUcwTwx!ZTk@WZpPZahtRRbZWDprq_#5 z5!8corTjO_8&DC)LcGGHSY}lTmD+{6Vt2;^WLSRAw3-3c_b+}#?8rb0lA>q_g@g3X 
zsrbauv|UHkp5vE;{EM};D@IW}qT^O`e0&R4T9r^Uu|R~uVJJ4%7x)~XR%9A@_ro8T zD9@vOFcfA%Q`#?(F-MEi9s#m%9P)hT9T=}MJE;cd(vBY35{`_PJpY7 z0536)+#h(Eab)+vCm2T#4}6kwWa_}D7)L%1 zyu$btGl31gg;z4UGM8?|Ex0T`p0s$ zx)iEy*K+=!D-cHocvK+hsdA%nK+gB40EwtA5~TRgPPYqQ489~YEk=0 z)wB=QlJ}qtT0}H*bt$T&@{ukAT%bz9Nr=TaHm-1bb7f=W$`$VvS~oUs-rPW_p?3<0zupO4 zR2MyZA<>&6KwDN=Y&7ux8D5plSqR)z?+Eid^j=Y@Zo97S6^ZgG0(rVOJ+^0f&(bYA z$Z&$`QC8#8rteWM$ZtA&FA(-Jfo~8<^HRpB;={6$;Ql`V(3_IVpHMP-uOIcNNSi(_ zwrMXI^ z%{lKHNq-q2R0WI+EFeq3j_grHW;L>&S+#Xkt?|yD$7v#`B-cSy zFWkh7PdIMgpdPevM`41@0bahpAbsc}ze|=TGt9h8@x9gWpGqYDjfwa#jl>CD?%re~VJFH7 zJ82t@WH~8!qio=wYNXc9vYF&Fvyoo6%GP?OoLSG7v+KEXPV&=@{Q5|FMB-MXus&KI zm3XF6TpufsNj%#aU!N#XBoj`?$vT!Z=}fH{I}-dWPa`juQ{MJ;!p_nm_3fvI8r;M^n^W$^rV#Dh38ZDG}6;} zej8HT>=~qHkh&eI?e;8Evq-pK|HoSVPS#9{~ z=bMP2?8KSYYQ0u%oNrxpnvXYHwTsBK&MIf6eg(1OQgwZ!QE#rEt*Yv}i=2@&t#c>O zu`jMyV~g=$28k26+${u70@In`RF+eYc|Cn2QMQ76Cb(yVdoH-=gZoHuFF2#NVJGaA zQ(Q4^(@wu+VgkmZ+;L~Zwm6x1nz6I;bkep!upGJ`5Nye-dYf*0o4%J?E2V154W7;W zmMf2aOF7=AYSyqGqaWj3sZZc?XAx{Ayu?}(mw_vV%Um%pCzcX7lZ!XgY6K;&bPoJasdv z3aIpELT$tCjiU>ju3Eq(RT~S9`triYRqsNpd1(H?g9~oma~{~J)-G089e3e!qp`ja zjdA0uKT>xqEA?jGz2Mk8QI*ST4Z}?8KkLo*&Tewe$6i?MB8tH`aS2h*BBF_M(NH-% z`N`x~a!KbH$eFZ_SWXH#yX;gf$3)IuVHrE2pUjzyzUg|c4c}@w%~kJ$pRO*ql&8kf z*qk90DUa&WRFqS7TTLO44rWlupIUXiXuLt0JMqB14MCp6F64iNFrWW1!fgI^(==1& zEi>8P+e3x{70pfg*-E8ZU3V%KKVPY=x9rUZkj7G_RZy1tD&~@ zsl}%$*Hda2lUyCKfHaWR$U0rlcQ#=rvt56NF`YEBrKFUrp-9&+h13aL?ne-WjQl(# zFOLwlrm9NSFIO8CufFa)S~`EhDKS=h@q*JVc}NBC z>T-@fAHBFzSFTraYv2n^qW!KFyjXQh@=hswr?)bXqI76Q`ei9oVr8pN(@`8|ry6#> zQg320Ih*m(#0TPTI8Coq*Q1kyo%cgVAKHhI8hS8VD(Kh!LY$KR7*vZ_=?uu99z-vH zT<6BhjdqP;(GS=)An;sPkrCDH40bcvgTPO(G+I^9AFEWV&1Q=$ztwar6-5!O9$+wN zo!^TGM{v1Z?D>RgWOLbMLH?$bickAiH@W8oF8AvQ!qv^SZ5v$GJD_lENOn7ITQ8Yq z%g$hRXPu1XT4EoV*ifzYjc^9*wj1yUS7_<|x?FG6y{q9s;x+l>y}eeWnD50JwO9CKoYxb$T*}R@J|C9yvzo)|wx0>;Qlp>_e~j~zv5>8{5^IU;(8xEENyJTf}w7I3}svsK_NMrY~R+^`JSR(rUdP) zsKe7pc4R?iDHsFNS0N=VBk5yL&ob24DEJ9f_8RAPI@zA?>Q7Lh zuOHzec#V!r*pIMvLwx|bdJc~<*3)c}wFavVt=CXQTE|rG>T5k#jsysQjE9|2A4EbA zJZPGeq=q&ftVA@u!g41O3~0DhYUt?2F66S|JanjU=%Eko#3;ufE>5?O=4JSWWXG(O zmE2WqcqeGFuoG^U-LR88und^iG4SFKZs7-0P&!0r97vi8nd^=QtJ@6})qDmn!{qGl zo1B=!tJ{4&q<=&1dX3yg>9CggX2LeuOWaeq8@QXV(7Z`6c2~czR~j8}PwnrMkjLBG z_Lus_$niIlbLt=(9XL?P5#?+sn4c)&lC2FI_P0#KeA`TA-|CYmy7b)mAT_MW$bPEc z^wdLaaGt@V430B6!QgQQB!qg3!95I4GwASz#3(J)g9x6%CB`=mQDek1jJJ(c{;ip8 z(JZElsYzaGDM{Yx`cY<|z~!C=hs2e2&ae+qwc0dD)7reWVT?ju!_kKl&)UXHLDY4+ zoOAL};by3?;YE(v2gQx2Ivxw&<{PFxZcm6>o`EGj;QrlwBOZFYenL!R(wD;O~BDFnt zSI)p#57~D+WA;558_aF{-r&xXS$m&7_fon%i}!Zmz5DF@@!n2*Kl-1w=^H;_&m(6S zp3`JmupdO~HUSn6``M5OsxO1UOI}Mkt{4S2bkF{hc8~W9>aa|F07mCZORbA_*5gbM zvOrwtQl)Nd{eAjWsHc%DU`9vhYwpnHbo|}Qb_CA5ST$mbAdv1?+HzY zL1a~uTPDq$tu(@vXKtmg(cDRk>0^oMlaV-t@b%nQibjN&UmHQ00!3a0&M&grTmg{M-5^CErwfuDZ z4Hxnk(@GB_rKeRriH>0Pi9spk`ZzMm8RBi5jYe6ZE0DM1z;U};aT_hq^+&rFJ}6GI zs$WFlTeVH)wp2MUmWzrQT)xbGfOI`;twWG3@;<^KA zVffuJfZ<18Kw7%4&Y>#B1*U$E0c(aIBQz18rOMnM24PFNrLw7_Bqya{{SWbVB_q1g`9yKZkD*OdD5dPwgsb%Db zet?Yi1TzLT!3Y1qhbIxVw|6xWSf2Du-g`@rv$8=g@xeb1X(^QI@2e$}bzn<;@Xt|8 zGhJiqu>5-;(o@44;)Cx-4MkA+e4hi+M?W?$%Yw^_%YtIOYsQ8;P1CGB)Ovy$0-*U9 zR3m~sn{40SHQ?ToL%X7C97z!Yts?B46A8@kF&zBXFdRwq>Wfj*XtGs$BqB3$wN) zK(g*bN24#Gd6u&W>hZ0CZ0YLnZ$NXdAgud7MmS8lSG{Zb%PcR~GL>!UQ zhT3-aizNoFN!j1{Z$3PQAZEzdyp-^D#Vl!2< zM#?v9E$obCZO7SQOf1$=jjW@hawce{mxdOu5k0`}W*k3=6vaM0*5mD5=#U+6kM1u~&5Oo&kE>@;FAM{*XGvS%9hX`ml#|!5X^h_?Jg@;V+El+p*Ow=Dy{OqICuL%1mRgc)Sv)aT;umtn*u2K zS*W88M|oGpa#b{cv|8uPy%)H_W;rf-)nITp16flWj19#hK7^-NaJf{ff z7E1xuC{9fh&0u>qTwOno@dPfH2vUe<=#4eoCr~R4Q^mN 
z?S&6-de!9yoKhyS&jpd?wMx|zwuEZ+m3l4O+%B#1twlc<2$q(b%M6?^4}H9~TdPic2qf?6Qof6!ZE9$W7W4Echyx%**iUi`N~mH$6`5Nmj@|Gt z^)m4xQ&6JxmW8+_h)>4LUW36~$%%7o_MZ`<=%;_{QV-Wr^y3!KcVKHF`8DGb^)RK& zTvAllO~W_lAH<0U=Tc1k2uzSaX5$1C?5>T+c3@$;2Hu)ZwC`IAmZhxO2yK}^CdXsk z$L2e)K`jc2=J2MvkHP&6_9K|ftIN3gIa0R5h7?tVY#aL-oV(z;1PxwYWx27?l?q@h zk%4|%L@|Wx5x&a`)r$uBXha-S3vi{}xEK>uEBwp8X{CuUVIL)Oe!7R- zXJ?S=Pv8=&hp-OOjBCI=*08m#6?q!w0f9CX$A*;p6khNxoS(pax~TA7zwM96&f6ldwj zNE~1+HGZ;v@M#A}`-tU0sdJZ}?o6|GwFIlObWTIXrG|6aX;3i13vuV+<$W5JDlYHP ze*HY+rH{6cji@5?Xkr01ODWY_u(=$=F-mH+j0FYls|PG{oDG&;t~A zt`H89=q9f@qkVXZ!z8pGKM%ud-(jGO;Yi88H#N4XzKD450C%oeuP!@q5O@fT=X+wg>^&;sXs z^k4LhX1G=MIRH zR@{(=ORKk|ahxJS(R$HT)1Cb5CUHTqvU48Gd%b$4zP`CmR}Fx9mn~CLo6ROTn5VdC zuQVE9>fl6|a@MOj8q!>iy#7Xg#gR49V~RkzQO%UcVCTdg`*LO(Cv>sv;b-Y4Qos3A zjBPff^I^;!Bw4&Xl{`uGJ=`D@^rFD6!S1^|YNoCaVn%@vS1HQ4BwVs$$aaQEutIc;=&DFN^Z z!uG2j(?iiPQEdhii<})le~gBwm>kRDfUN{k5hH2c>NPz$U{W$>dPJ|!0cK#LqtQaS zMV`=dOBxqbCmUAv(XiF5SrlL%t@c;|VbP(>_BGZU7MMLX?7Ef6h6Fp(ze0Le;UL<9QXM;BfJWA*e0wTdVe?dF+|mHm z?s(UxhZ0bI4b===WJ8G6yTrbZD)bT?K&l4&O6fG4XdmkDXBV3eB9QuZ);@?ba`L6? z{flTzJ%;kVa**q&t7IRGMkXa8WwNF|QbraGQ6bsP`Zf0Es|&+bbekT1dV7u7fMR(rfuZE*AcYRS0*DGFORr4ddM46mTb zTwn&W3%&hMRc(7K>^_es0@054=k+9@#PE*aBONDYhg1BxbXfg1oBtgK-x|;nd|R^K zy6k*5>5P@;4Dk@9?)CKRQLzQ}hgpQR?ugcAKD*zCR} z$694kt6@UQ&mALZa%Dq3Mo36bcp9IXN%We9F=Rd_rovZo0yYKPMu~VB#HUmzUts+d zh1iqrajjyt_t7STH1^`vZd4tsE9JpJTLXI_9-YbSu6RATjL6;f(0)sCz_$3Cv%-~lUj4+T%5{W7zj*?sCsQN9& zewV>@1~(XVfOn2G#(g43xLnmHf)q*cF+?=x4#fO*V(NmH$53}GW4vx;^FJ|W#*;=G zAE%_plfcof$@Gj>G>s|qZH=+#-z z|0hm2aiBhY=Mz6G(hA4C>O1LBjp$ern7w4W8em0hrk?281@vE-C=PFIIQOnKwNRhMVX z^dY{eV?=DBn&GUuvEFj4<@g|klZ-u8GI0fv!+n+%_=A~7x5a$0!)9x7J9&p^cJA222-1O*~>nPm6oaxp~p zo!^nsblWc?2){+)Gmg5f5q)M)hr4@0u8^f7Ah}Q1pdz8ld&LUBgBLVb*zJQvEyXZH zcp2DCog6OqMbuKIYjI9QLMs4T2fYeF==TsGh!zmB-yuy$IKPS_Um<-CBO=0i8&PNT z$Kj@EJ`m4EY>HAmTUp~LR?hgbktiVOUnj? zqW5;*_m@)G*TNY~@=pI272p|uZWr{Ao}RWTt^JVPw!}97j#pj zyr)lfiiSt&bg?iM?h?gc>%2p$weP$D##-#+wX^%KROW>((zVLubcW%sV z5Q~B9_V2hTEkrC!K8K1i_V5Kpbc9O3xvy)?$J=AwA7F^N7ZO~X zd+OV$X`p*~8V~-79qm|m`1bJa&e6N5v-$*upKejX_PHD6YLT|&LY__alGhA3>lxSN zkmOnlz7kG8wRXUk#j(gWd^0U^oIhPlU*tZTc&3)0uem52G(`KBZqCri(b4isWy zgo^wUi&cb^*t4XfZ?%@!9BjRbl>77Uu5`o|$MiSs%uYl^T%jNQj>LFN3#$B2i}{(f zjJ;1NaW$R5r2LJFCS11sGZ zj8GmPeO_3zc5_v5Aa? zk)J)L|LwzLAX~ych`^;=bbE H!tDP6aM_h| literal 0 HcmV?d00001 diff --git a/vllm/__pycache__/test_utils.cpython-310.pyc b/vllm/__pycache__/test_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7c0512a45c3d1e12cbb4f1193129eec309ea485 GIT binary patch literal 1093 zcmZXTOK;RL5Xb#WHqAbG2#S#Cfg6!v1L9U8gj7x(DkL~XLLsc(ZB*xh?X=n|dn(tS zIUuC=$cc}Glq)B`0`$SyTdGxY>%0~+FGX5kWJ-0uFS=Vsl)6$^qD*DwPIr4bnaggs>U&@qB@dYf+}oQ(D=wIfv8XpyRH zwtwAB>h(C5)g+VKl{V`UNmeFJJ##1S?);~&5N5mL5m5vLC@U}Wm1lh`P4%n^z$Di$ z8B82bWy|^vsXh$9*W*U3#jdf;Kg9J6vX)QBPaelQH{wy9O?I-W(DAO6MQnsNX=8G! 
z$Mvkc?0*!}e?80Hg9uzwa(J1(B^L=9nTV)g%PR=-{3$i4!{pd);YFf!x- z&B+wasi7@8z%S9e!JM`9h_(301Ny-Em>y${J`Q1x-#Q<>o?XomaL1cb&{1IBF)LhA zNZ)x-`>E*stcJ_QZMBQah>pU2>jJ&q7+&p_k>{8ueRo!qDVTL^JEp{zUa+<}9aA;1 ziPwronmnylH4$2;S1#UN_{4z;e4j8pzzhc#%ej9Z(g9JHS=3h}7?Qq4Kfb7)g_VRZ z;_Bc|k=%5F#f=&-26rPmn#12f;U{sZUD}1lD{6yhMa3H_o?|QQ0hj;{$N+>9Td{LO U$QgL2gpm{Cg@4Em7p0@qzldNQ<^TWy literal 0 HcmV?d00001 diff --git a/vllm/__pycache__/utils.cpython-310.pyc b/vllm/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f6007237037a9a3f08d6a9ff6b45b9e245ebaeeb GIT binary patch literal 9700 zcma)CTW}m#TJHPYN2AfjmT#Fjb`mAFWGBweaW0OmB;HNrlqEZrW}z9+IU}{y(>*!e zBU!_UDxxgQ14*dbg$1$!DFmuqT%LHLD2jI;sNw~x`iWv+$TL;D5Q5G3pPm`bNbN|nLiQB+>X6LvI3VG2_{#ZKO;t;)J)Yx1qz zx_le9fw$(F4a>Gv8moJ$M%qp{GIpkswX?F#@N$j3oo^KEf~>1v(JspQBld`Vm+TVW zrZ?Is+hsY%!a8I2SYzBCZ%o(|vMuFJHjdawWI2ugqxR9pls(lrW*?Jn8Si-GgndF) z_yo&ttM*BLf}g_s5k9h7!Sg7d(|8_Zxt|#JnYR>{XN8XyR^ZC&*=^l^94$pQf|ikO z&3*#45*tNr6tyQ=nNQa>Hpa$3HteUmwFx$f+SBX}c7&fp+qt^Ijdr+%Us0j(zFp9 zD^;%^r|;Cgpc-9#J~m|`PS=|+I{dlDGc`KE;14R5DAx%TxIzkLYh2$kb`{&~zAbrp zEw--lb+^VBrc-g+afkqp6K5T#5wMm=<-Fs3*s6Nn9)tU>Mr>4DQDEzDy?4vj=ia+@ zPZ6|}1m8S2+X}_37u2fWtmodDZEi*@f&c8x#S62c8}akaYHh8$%){Ar&uh%KBG(IN znww%03p|D=v;fjdTFtAn?ZDwo^uJC6Uj*nKj^!8*$aH2v`b=hly9U#_4#`WgG^ER9 zd6r>Wlq_a|>xDQq7qt9{3!!42*bF1^GPVSdTEbtb5ea)gD$_HECnU4hQ9FtQ3$&=* zQy1<*4r<9#5E3KgUR^wbL3<<-Cs8HpO3pe@q4poNURG_Fx=76QI%|YY??;1lJaS{Q zm>p$R*}~=q-b{S~SUErj$I%wPritP-z_cdL;1#RxAt?C#Zsa)U(Ju*YsYTToLL(V4 zga)lVbbhvXer-Y0Pi}lql^gHh)f7DB(KA>?JPyz<^v*2XF^Sf{r-=$)5JPbUZ$YOO zQvk8<`jLPOQN%QXa|HU*mXwIT;uOG>c)~P5S{c`hdeM}v2Sg}^5wFuw1zYz;NR{HE zF&#o;FoT=S-S(ZafR<`7M2I5zMnIz+G*tujJS`2C6 z4Ld*3Qf6ZqQ(%wDwsBtM8JI2F$6{peO}H$=V{g?X>?mfGdaYP>iXB7CC_Bzhe5}}I zc7~l~r_eGcSJm0xsu`^M2&-UKn0iGin7Bwyi7?9Y0NW(Th^eH+p#hLqp$?-xMEd#= z>uY>dipIk+>Z#iVJMT%iF{IwuBD*z5nr z`Vf&~s@qV+gbZ4&d)${8h^G(PI^{U%cbwPyyOsuam0br;+qVPqw8{?jez(7&8tcnE z5|<9yE9W@qg5hmA&Kv#x#s~K6cgN;>)obyE8tv8lQJMHNhbJrnbmZP0ZC9f}sCQ&w z&}>Eb^Z|Nb#^C)#2R5(;2{!ZSmv*qEwg>U)4|q5YihVqG)%{om&4xzfmLM(vOlypM2#da&NYH{?nHsZeRe1!W)!q)E8oRf@hU*O*k^_8hzpF@ zt1T~bBr}G1L44@T0U1KlKazS;rgul3Qnqv|^|0&ci%LgB8{Seh1&P44F~B9_f;fa# zf;4jhPe>}Ag?8puLpL74Tjcu~Lv)4^B;@)Co<^mkC5T;-uB(rhvZL+7_U?&~o+t|m zSv=tgE{Pa;%yH|E&pG2vNEIE{VevA?-^L?-8DW?o#0~;^06X$Ew2QPk{rwX$J!A(& zLV@4K=a*=j1^^O_^+~YTKV&0_3VpSatO8p?lXfJEtg1|9+9&Gwv@LZDG}_U?%DzfX z>kDrp5;VMOh+r|1t5h>+lJttVu<~KVhzomU<6-nGge|#33l6@lLdPciI1ddRLYv4# zN0cPGfI`o-0cmZ`5R*VvC)N$OhiN)kpPUe|=P@_oDR}|$9Ri1#Lh&X#7#_*NlA2bJ zK4_onBYLPL4Dr;&++jL*Rbo#Oh^|{kM;GH=IvZrVI*0;$PD{on(ijtQr_FupVHZTp z-{bS4+CsBe=p1C8voflJz6?oY(0n+l34Nfp%Wb$YzfB!?tVA%%EprKpvSqF&Y~ z@s#xw`eeFnruD3K!pNrtjZMM@6o!6%u_k*KfsU&PTv~k>s>d+QqP7yJ)|304t*^Mv z0sqyr$_})8Jx``YE{W0`> z(C^t$gygLjk4K{FhxI@-fEH|FO9cMiICs6#3`8_9M1b3+cn>=bpeo~=46sawZ=yFO zc4@HUX+3}60^_y&;wG99I5b;Ba!Hnp9*H>#0a+?fX{cLpwH+0fO4%iwrIDP5W8Xm@ zW?TJ8jr5L25zw^JzH(DuqQXkF!YhsHUANI{RB+S9^|#W1X&0N7CKu=xyvizfa8+EZ zxPIl%X2io8c^iQ7%hp<~>d5P1`xF*&k_8+rP^ne@iXTLkJG|27BES_nMVHALXHF~u zVfzC7k~2I9n>yIkk-Ire-g%vB6bbiE3zt9=YgpPvypkn~!_)T@_xRyE$Rqp)7LiUf zZy1`U8V^c(S{3A`YIufzeVNRnIfC2Xg_#8n)%G)kG>n3%>Q!*j?vXF3OWOJY<9n1t zTES|DL@~R#?urPPcA^$ePNIz!1`e?&%tbm$t=Z}h{S=5uq0VaJ1C$S-+!qZJs0ac~ zW@;58&@-n-tcQ_^jaJKLNT~o`iuDzK_t4#Q5N;MU92)jREGVuLAm4Rhu zi90mKsJHyu%#e1_QiKVSA*4$xQ^}UpwI3*T;DpO#9vAdP5B_?-MRNSZ%0QzXC@VmT*v`SAgX0q7|eoHHE*q>Nw3ck+$C{Zmb%Y*Ay>LO_D zZ3p#}OH{*6zvc!NIkn;iL35_NG_TaUA`}g6Lm?OT2uQ=aY8qDSJehXTyb?j^d4z~U z!A>H|TUw<3ma+_o{H1zRabY~ZQa@GUn6*`%DIJ}upOCO?3&@+Kd#4w~c+%)D*0B^& zi8_*E2=2+M#v|mB!L{Bj%92&Zau{1{8}es!f=M 
zuh1(akhD4p-`zgl0Oyh+Ggo9PD|~fkcnU7IoG`K8Y$0RBQI%VG{e9M`Bsa_Mt@abz zh)6Y_Hca88K(qS4A+JFq2twjnB^jeAMrr3M0Aup%fs(O{jQA8h$|C3=nVT09U(%gr z^=2J5$AY(6R5nO!rnPpK`QglR5G;HAYw1h~6Ql&K5XPBDaq8-u&h>@)r8v8IWA3}o zwYP3weV5W+zRzn>oP!Oafs`YRQ#Tf+TZ+@&InN_QriKHGCy8Q_-Eo_LfqlX(y|CwwQmLatypuLu|dA>z@3c9I7ib<0F;vY9K`6TpDSNbf=^#)TT!ifd3N?2 z^|IU?ENWw<_%#B1Wq0oL} zm}8%fZ^5S@#xV^Z-@xCtu1fx0RKm&zJS_cvfaiVmce#4|Fs^Qna-f{?F_vt4)WvT(@Ok_=LOFR6PB1*VeCj}U=u zYrBXzRt=_Ik!JzBzfsTG<8!yJU3G3>U%dXtJM+%9`P zj8oi)lcY4h_y}$Lk8u=P2uFq>>u%^Gt>29q!hgVE=_HP+;s+=X$pc~A$M{F6knZjv zQbNQ8IS8L%&|$P86^|HDKLYP1PDJJoa?(3HrI;n#Qb`+0LzG_4B=cB$ zS)+XRC%8W%Cd=+B6p`)ZSZ-Hk`6EaJM0wo3^(kd1%L-^KPAS_eZb76PBk+@wP;cLO zcfLI`kgul$w==Wg^{l^MtFOl(_%g1^?g!(>NQp?leVE z<6#)*<>pS%ijXtFw}2)RMX?EqN7!Ov#c9*dNMTqB8r;s&L3)F&WRjihBImg5u01M8 zB;u3YCictM1gXh^QU8d&!ZQGLcQjxH4JeBBz%a51BMq&nYN)00o)fmG66`PcE~ z6>?)FxqfgbQ=9+yn~vwMaXZ!fc9jNS=`!Qk zC3p{TIC6TQv7o2FOOt<(0L5Z*EjybW(1{v@w=rq8r;yt8xu{0m>Bf=0wf+s$D=ELvFHf1%9apyahY{|d{T5y55i+!YGu8!p3E0|#>GtgpS zwR!B7?T*Jzz42JjL=UZZt7C3u{8WU`8GHR5d<+>=wz4dRQMqNMlVRykw4MB}a!lFE zbr8})VsghQEt#vETlr;Ws}L3FUU*IU11i&^Esyj)U>-iR(VHxh}51Brp{z zFqd&Z5fu3dk`FOXV1fo8?G0XA$4wV{)*HW!Ui|*`@4u#qNBjW*U2#Y*#UE13p8>QV zyB6@oR)?)7`QQq^pmZ0adn0tU(ms1&kjysk8#X;5H5p$;SrV4PmT0cBWsoM-VYAio+Xsl<1vo3eW{Tb86hZ&Q_$BT~p^wnF9$ None: + self.block_number = block_number + self.block_size = block_size + + self.token_ids = [_BLANK_TOKEN_ID] * block_size + self.num_tokens = 0 + + def is_empty(self) -> bool: + return self.num_tokens == 0 + + def get_num_empty_slots(self) -> int: + return self.block_size - self.num_tokens + + def is_full(self) -> bool: + return self.num_tokens == self.block_size + + def append_tokens(self, token_ids: List[int]) -> None: + assert len(token_ids) <= self.get_num_empty_slots() + curr_idx = self.num_tokens + self.token_ids[curr_idx:curr_idx + len(token_ids)] = token_ids + self.num_tokens += len(token_ids) + + def get_token_ids(self) -> List[int]: + return self.token_ids[:self.num_tokens] + + def get_last_token_id(self) -> int: + assert self.num_tokens > 0 + return self.token_ids[self.num_tokens - 1] + + +class PhysicalTokenBlock: + """Represents the state of a block in the KV cache.""" + + def __init__( + self, + device: Device, + block_number: int, + block_size: int, + ) -> None: + self.device = device + self.block_number = block_number + self.block_size = block_size + + self.ref_count = 0 + + def __repr__(self) -> str: + return (f'PhysicalTokenBlock(device={self.device}, ' + f'block_number={self.block_number}, ' + f'ref_count={self.ref_count})') + + +# Mapping: logical block number -> physical block. +BlockTable = List[PhysicalTokenBlock] diff --git a/vllm/config.py b/vllm/config.py new file mode 100644 index 0000000..9b63f55 --- /dev/null +++ b/vllm/config.py @@ -0,0 +1,689 @@ +from typing import Optional, Union, ClassVar +from dataclasses import dataclass +import os +from packaging.version import Version + +import torch +from transformers import PretrainedConfig + +from vllm.logger import init_logger +from vllm.transformers_utils.config import get_config +from vllm.utils import get_cpu_memory, is_hip, is_neuron, get_nvcc_cuda_version + +logger = init_logger(__name__) + +_GB = 1 << 30 + + +class ModelConfig: + """Configuration for the model. + + Args: + model: Name or path of the huggingface model to use. + tokenizer: Name or path of the huggingface tokenizer to use. + tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if + available, and "slow" will always use the slow tokenizer. 
+        trust_remote_code: Trust remote code (e.g., from HuggingFace) when
+            downloading the model and tokenizer.
+        download_dir: Directory to download and load the weights; defaults
+            to the default HuggingFace cache directory.
+        load_format: The format of the model weights to load:
+            "auto" will try to load the weights in the safetensors format and
+                fall back to the pytorch bin format if safetensors format is
+                not available.
+            "pt" will load the weights in the pytorch bin format.
+            "safetensors" will load the weights in the safetensors format.
+            "npcache" will load the weights in pytorch format and store
+                a numpy cache to speed up the loading.
+            "dummy" will initialize the weights with random values, which is
+                mainly for profiling.
+        dtype: Data type for model weights and activations. The "auto" option
+            will use FP16 precision for FP32 and FP16 models, and BF16
+            precision for BF16 models.
+        seed: Random seed for reproducibility.
+        revision: The specific model version to use. It can be a branch name,
+            a tag name, or a commit id. If unspecified, will use the default
+            version.
+        code_revision: The specific revision to use for the model code on
+            Hugging Face Hub. It can be a branch name, a tag name, or a
+            commit id. If unspecified, will use the default version.
+        tokenizer_revision: The specific tokenizer version to use. It can be a
+            branch name, a tag name, or a commit id. If unspecified, will use
+            the default version.
+        max_model_len: Maximum length of a sequence (including prompt and
+            output). If None, will be derived from the model.
+        quantization: Quantization method that was used to quantize the model
+            weights. If None, we assume the model weights are not quantized.
+        enforce_eager: Whether to enforce eager execution. If True, CUDA
+            graphs are disabled and the model always runs in eager mode.
+            If False, CUDA graphs and eager execution are used in hybrid mode.
+        max_context_len_to_capture: Maximum context length covered by CUDA
+            graphs. When a sequence has a context length larger than this,
+            we fall back to eager mode.
+    """
+
+    def __init__(
+        self,
+        model: str,
+        tokenizer: str,
+        tokenizer_mode: str,
+        trust_remote_code: bool,
+        download_dir: Optional[str],
+        load_format: str,
+        dtype: Union[str, torch.dtype],
+        seed: int,
+        revision: Optional[str] = None,
+        code_revision: Optional[str] = None,
+        tokenizer_revision: Optional[str] = None,
+        max_model_len: Optional[int] = None,
+        quantization: Optional[str] = None,
+        enforce_eager: bool = False,
+        max_context_len_to_capture: Optional[int] = None,
+    ) -> None:
+        self.model = model
+        self.tokenizer = tokenizer
+        self.tokenizer_mode = tokenizer_mode
+        self.trust_remote_code = trust_remote_code
+        self.download_dir = download_dir
+        self.load_format = load_format
+        self.seed = seed
+        self.revision = revision
+        self.code_revision = code_revision
+        self.tokenizer_revision = tokenizer_revision
+        self.quantization = quantization
+        # TODO: align with upstream. Using CUDA graphs currently causes a
+        # runtime error, so eager execution is always enforced for now
+        # (the `enforce_eager` argument is ignored).
+        self.enforce_eager = True
+        self.max_context_len_to_capture = max_context_len_to_capture
+
+        if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true":
+            # Download the model from the ModelScope hub.
+            # Lazy import so that modelscope is not required for normal use.
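+            # Illustrative usage (hypothetical model id, not part of this
+            # change): launching, e.g.,
+            #   VLLM_USE_MODELSCOPE=true python -m vllm.entrypoints.api_server \
+            #       --model qwen/Qwen-7B-Chat
+            # resolves the weights through ModelScope rather than the
+            # HuggingFace Hub.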
+            from modelscope.hub.snapshot_download import snapshot_download  # pylint: disable=C
+            if not os.path.exists(model):
+                model_path = snapshot_download(model_id=model,
+                                               cache_dir=download_dir,
+                                               revision=revision)
+            else:
+                model_path = model
+            self.model = model_path
+            self.download_dir = model_path
+            self.tokenizer = model_path
+
+        self.hf_config = get_config(self.model, trust_remote_code, revision,
+                                    code_revision)
+        self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
+        self.max_model_len = _get_and_verify_max_len(self.hf_config,
+                                                     max_model_len)
+        self._verify_load_format()
+        self._verify_tokenizer_mode()
+        self._verify_quantization()
+        self._verify_cuda_graph()
+
+    def _verify_load_format(self) -> None:
+        load_format = self.load_format.lower()
+        supported_load_format = [
+            "auto", "pt", "safetensors", "npcache", "dummy"
+        ]
+        rocm_not_supported_load_format = []
+        if load_format not in supported_load_format:
+            raise ValueError(
+                f"Unknown load format: {self.load_format}. Must be one of "
+                "'auto', 'pt', 'safetensors', 'npcache', or 'dummy'.")
+        if is_hip() and load_format in rocm_not_supported_load_format:
+            rocm_supported_load_format = [
+                f for f in supported_load_format
+                if (f not in rocm_not_supported_load_format)
+            ]
+            raise ValueError(
+                f"load format '{load_format}' is not supported in ROCm. "
+                f"Supported load formats are "
+                f"{rocm_supported_load_format}")
+
+        # TODO: Remove this check once HF updates the pt weights of Mixtral.
+        architectures = getattr(self.hf_config, "architectures", [])
+        if "MixtralForCausalLM" in architectures and load_format == "pt":
+            raise ValueError(
+                "Currently, the 'pt' format is not supported for Mixtral. "
+                "Please use the 'safetensors' format instead.")
+        self.load_format = load_format
+
+    def _verify_tokenizer_mode(self) -> None:
+        tokenizer_mode = self.tokenizer_mode.lower()
+        if tokenizer_mode not in ["auto", "slow"]:
+            raise ValueError(
+                f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
+                "either 'auto' or 'slow'.")
+        self.tokenizer_mode = tokenizer_mode
+
+    def _verify_quantization(self) -> None:
+        supported_quantization = [
+            "awq", "gptq", "squeezellm", "marlin", "smoothquant"
+        ]
+        rocm_not_supported_quantization = ["awq", "marlin"]
+        if self.quantization is not None:
+            self.quantization = self.quantization.lower()
+
+        # Parse quantization method from the HF model config, if available.
+        hf_quant_config = getattr(self.hf_config, "quantization_config", None)
+        if hf_quant_config is not None:
+            hf_quant_method = str(hf_quant_config["quant_method"]).lower()
+            # If the GPTQ model is serialized in marlin format, use marlin.
+            if (hf_quant_method == "gptq"
+                    and "is_marlin_format" in hf_quant_config
+                    and hf_quant_config["is_marlin_format"]):
+                hf_quant_method = "marlin"
+            if self.quantization is None:
+                self.quantization = hf_quant_method
+            elif self.quantization != hf_quant_method:
+                raise ValueError(
+                    "Quantization method specified in the model config "
+                    f"({hf_quant_method}) does not match the quantization "
+                    f"method specified in the `quantization` argument "
+                    f"({self.quantization}).")
+
+        if self.quantization is not None:
+            if self.quantization not in supported_quantization:
+                raise ValueError(
+                    f"Unknown quantization method: {self.quantization}. Must "
+                    f"be one of {supported_quantization}.")
+            if is_hip(
+            ) and self.quantization in rocm_not_supported_quantization:
+                raise ValueError(
+                    f"{self.quantization} quantization is currently not "
+                    f"supported in ROCm.")
+            if self.quantization != "marlin":
+                logger.warning(
+                    f"{self.quantization} quantization is not fully "
+                    "optimized yet. Inference may be slower than with "
+                    "non-quantized models.")
+
+    def _verify_cuda_graph(self) -> None:
+        if self.max_context_len_to_capture is None:
+            self.max_context_len_to_capture = self.max_model_len
+        self.max_context_len_to_capture = min(self.max_context_len_to_capture,
+                                              self.max_model_len)
+
+    def verify_with_parallel_config(
+        self,
+        parallel_config: "ParallelConfig",
+    ) -> None:
+        total_num_attention_heads = self.hf_config.num_attention_heads
+        tensor_parallel_size = parallel_config.tensor_parallel_size
+        if total_num_attention_heads % tensor_parallel_size != 0:
+            raise ValueError(
+                f"Total number of attention heads ({total_num_attention_heads})"
+                " must be divisible by tensor parallel size "
+                f"({tensor_parallel_size}).")
+
+        total_num_hidden_layers = self.hf_config.num_hidden_layers
+        pipeline_parallel_size = parallel_config.pipeline_parallel_size
+        if total_num_hidden_layers % pipeline_parallel_size != 0:
+            raise ValueError(
+                f"Total number of hidden layers ({total_num_hidden_layers}) "
+                "must be divisible by pipeline parallel size "
+                f"({pipeline_parallel_size}).")
+
+    def get_sliding_window(self) -> Optional[int]:
+        return getattr(self.hf_config, "sliding_window", None)
+
+    def get_vocab_size(self) -> int:
+        return self.hf_config.vocab_size
+
+    def get_hidden_size(self) -> int:
+        return self.hf_config.hidden_size
+
+    def get_head_size(self) -> int:
+        if hasattr(self.hf_config, "head_dim"):
+            return self.hf_config.head_dim
+        # FIXME(woosuk): This may not be true for all models.
+        return self.hf_config.hidden_size // self.hf_config.num_attention_heads
+
+    def get_total_num_kv_heads(self) -> int:
+        """Returns the total number of KV heads."""
+        # For GPTBigCode & Falcon:
+        # NOTE: for falcon, when new_decoder_architecture is True, the
+        # multi_query flag is ignored and we use n_head_kv for the number of
+        # KV heads.
+        falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"]
+        new_decoder_arch_falcon = (
+            self.hf_config.model_type in falcon_model_types
+            and getattr(self.hf_config, "new_decoder_architecture", False))
+        if not new_decoder_arch_falcon and getattr(self.hf_config,
+                                                   "multi_query", False):
+            # Multi-query attention, only one KV head.
+            # Currently, tensor parallelism is not supported in this case.
+            return 1
+
+        attributes = [
+            # For Falcon:
+            "n_head_kv",
+            "num_kv_heads",
+            # For LLaMA-2:
+            "num_key_value_heads",
+            # For ChatGLM:
+            "multi_query_group_num",
+        ]
+        for attr in attributes:
+            num_kv_heads = getattr(self.hf_config, attr, None)
+            if num_kv_heads is not None:
+                return num_kv_heads
+
+        # For non-grouped-query attention models, the number of KV heads is
+        # equal to the number of attention heads.
+        return self.hf_config.num_attention_heads
+
+    def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
+        """Returns the number of KV heads per GPU."""
+        total_num_kv_heads = self.get_total_num_kv_heads()
+        # If tensor parallelism is used, we divide the number of KV heads by
+        # the tensor parallel size. We will replicate the KV heads in the
+        # case where the number of KV heads is smaller than the tensor
+        # parallel size so that each GPU has at least one KV head.
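+        # Worked example: 8 total KV heads with tensor_parallel_size=2
+        # gives 8 // 2 = 4 KV heads per GPU; a multi-query model with a
+        # single KV head and tensor_parallel_size=8 gives max(1, 1 // 8)
+        # = 1, i.e. the one KV head is replicated on every GPU.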
+        return max(1,
+                   total_num_kv_heads // parallel_config.tensor_parallel_size)
+
+    def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
+        total_num_hidden_layers = self.hf_config.num_hidden_layers
+        return total_num_hidden_layers // parallel_config.pipeline_parallel_size
+
+
+class CacheConfig:
+    """Configuration for the KV cache.
+
+    Args:
+        block_size: Size of a cache block in number of tokens.
+        gpu_memory_utilization: Fraction of GPU memory to use for the
+            vLLM execution.
+        swap_space: Size of the CPU swap space per GPU (in GiB).
+        cache_dtype: Data type for kv cache storage.
+    """
+
+    def __init__(
+        self,
+        block_size: int,
+        gpu_memory_utilization: float,
+        swap_space: int,
+        cache_dtype: str,
+        sliding_window: Optional[int] = None,
+    ) -> None:
+        self.block_size = block_size
+        self.gpu_memory_utilization = gpu_memory_utilization
+        self.swap_space_bytes = swap_space * _GB
+        self.cache_dtype = cache_dtype
+        self.sliding_window = sliding_window
+        self._verify_args()
+        self._verify_cache_dtype()
+
+        # Will be set after profiling.
+        self.num_gpu_blocks = None
+        self.num_cpu_blocks = None
+
+    def metrics_info(self):
+        # Convert cache_config to a Dict[str, str] for Prometheus metrics
+        # info.
+        return {key: str(value) for key, value in self.__dict__.items()}
+
+    def _verify_args(self) -> None:
+        if self.gpu_memory_utilization > 1.0:
+            raise ValueError(
+                "GPU memory utilization must be less than 1.0. Got "
+                f"{self.gpu_memory_utilization}.")
+
+    def _verify_cache_dtype(self) -> None:
+        if self.cache_dtype == "auto":
+            pass
+        elif self.cache_dtype == "fp8_e5m2":
+            nvcc_cuda_version = get_nvcc_cuda_version()
+            if nvcc_cuda_version and nvcc_cuda_version < Version("11.8"):
+                raise ValueError(
+                    "FP8 is not supported when the CUDA version is lower "
+                    "than 11.8.")
+            device_name = torch.cuda.get_device_name()
+            if "AMD" in device_name:
+                raise NotImplementedError(
+                    "FP8_E5M2 KV Cache on AMD GPU has not been supported yet.")
+            logger.info(
+                "Using the fp8_e5m2 data type to store the KV cache. This "
+                "reduces the GPU memory footprint and boosts performance, "
+                "but may cause a slight accuracy drop. Currently only FP8 "
+                "without scaling factors is supported, with E5M2 as the "
+                "default format.")
+        else:
+            raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}")
+
+    def verify_with_parallel_config(
+        self,
+        parallel_config: "ParallelConfig",
+    ) -> None:
+        total_cpu_memory = get_cpu_memory()
+        # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
+        # group are in the same node. However, the GPUs may span multiple nodes.
+        num_gpus_per_node = parallel_config.tensor_parallel_size
+        cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node
+
+        msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of "
+               f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is "
+               "allocated for the swap space.")
+        if cpu_memory_usage > 0.7 * total_cpu_memory:
+            raise ValueError("Too large swap space. " + msg)
+        elif cpu_memory_usage > 0.4 * total_cpu_memory:
+            logger.warning("Possibly too large swap space. " + msg)
+
+
+class ParallelConfig:
+    """Configuration for the distributed execution.
+
+    Args:
+        pipeline_parallel_size: Number of pipeline parallel groups.
+        tensor_parallel_size: Number of tensor parallel groups.
+        worker_use_ray: Whether to use Ray for model workers. Will be set to
+            True if either pipeline_parallel_size or tensor_parallel_size is
+            greater than 1.
+        max_parallel_loading_workers: Maximum number of workers allowed to
+            load the model weights in parallel; loading proceeds sequentially
+            in batches of this size to avoid RAM OOM when using tensor
+            parallelism with large models.
+        disable_custom_all_reduce: Disable the custom all-reduce kernel and
+            fall back to NCCL.
+    """
+
+    def __init__(
+        self,
+        pipeline_parallel_size: int,
+        tensor_parallel_size: int,
+        worker_use_ray: bool,
+        max_parallel_loading_workers: Optional[int] = None,
+        disable_custom_all_reduce: bool = False,
+    ) -> None:
+        self.pipeline_parallel_size = pipeline_parallel_size
+        if is_neuron():
+            # For Neuron support, tensor parallelism is set to 1 here so
+            # that vLLM itself does not shard the model. transformers-neuronx
+            # reads the neuron_tp_degree attribute and distributes the
+            # workload across multiple NeuronCores.
+            self.tensor_parallel_size = 1
+            self.neuron_tp_degree = tensor_parallel_size
+        else:
+            self.tensor_parallel_size = tensor_parallel_size
+        self.worker_use_ray = worker_use_ray
+        self.max_parallel_loading_workers = max_parallel_loading_workers
+        self.disable_custom_all_reduce = disable_custom_all_reduce
+
+        self.world_size = pipeline_parallel_size * self.tensor_parallel_size
+        # Ray workers are not supported for the Neuron backend.
+        if self.world_size > 1 and not is_neuron():
+            self.worker_use_ray = True
+        self._verify_args()
+
+    def _verify_args(self) -> None:
+        if self.pipeline_parallel_size > 1:
+            raise NotImplementedError(
+                "Pipeline parallelism is not supported yet.")
+        if not self.disable_custom_all_reduce and self.world_size > 1:
+            if is_hip():
+                self.disable_custom_all_reduce = True
+                logger.info(
+                    "Disabled the custom all-reduce kernel because it is not "
+                    "supported on AMD GPUs.")
+            elif self.pipeline_parallel_size > 1:
+                self.disable_custom_all_reduce = True
+                logger.info(
+                    "Disabled the custom all-reduce kernel because it is not "
+                    "supported with pipeline parallelism.")
+
+        # FIXME(woosuk): Fix the stability issues and re-enable the custom
+        # all-reduce kernel.
+        if not self.disable_custom_all_reduce and self.world_size > 1:
+            self.disable_custom_all_reduce = True
+            logger.info(
+                "Custom all-reduce kernels are temporarily disabled due to "
+                "stability issues. We will re-enable them once the issues are "
+                "resolved.")
+
+
+class SchedulerConfig:
+    """Scheduler configuration.
+
+    Args:
+        max_num_batched_tokens: Maximum number of tokens to be processed in
+            a single iteration.
+        max_num_seqs: Maximum number of sequences to be processed in a single
+            iteration.
+        max_model_len: Maximum length of a sequence (including prompt
+            and generated text).
+        max_paddings: Maximum number of paddings to be added to a batch.
+    """
+
+    def __init__(
+        self,
+        max_num_batched_tokens: Optional[int],
+        max_num_seqs: int,
+        max_model_len: int,
+        max_paddings: int,
+    ) -> None:
+        if max_num_batched_tokens is not None:
+            self.max_num_batched_tokens = max_num_batched_tokens
+        else:
+            # If max_model_len is too short, use 2048 as the default value
+            # for higher throughput.
+            self.max_num_batched_tokens = max(max_model_len, 2048)
+        self.max_num_seqs = max_num_seqs
+        self.max_model_len = max_model_len
+        self.max_paddings = max_paddings
+        self._verify_args()
+
+    def _verify_args(self) -> None:
+        if self.max_num_batched_tokens < self.max_model_len:
+            raise ValueError(
+                f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
+                f"smaller than max_model_len ({self.max_model_len}). "
+                "This effectively limits the maximum sequence length to "
+                "max_num_batched_tokens and makes vLLM reject longer "
+                "sequences. Please increase max_num_batched_tokens or "
+                "decrease max_model_len.")
+        if self.max_num_batched_tokens < self.max_num_seqs:
+            raise ValueError(
+                f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
+                "be greater than or equal to max_num_seqs "
+                f"({self.max_num_seqs}).")
+
+
+class DeviceConfig:
+
+    def __init__(self, device: str = "auto") -> None:
+        if device == "auto":
+            # Automated device type detection.
+            if torch.cuda.is_available():
+                self.device_type = "cuda"
+            elif is_neuron():
+                self.device_type = "neuron"
+            else:
+                raise RuntimeError("No supported device detected.")
+        else:
+            # Device type is assigned explicitly.
+            self.device_type = device
+
+        # Some device types require processing inputs on CPU.
+        if self.device_type in ["neuron"]:
+            self.device = torch.device("cpu")
+        else:
+            # Set device with device type.
+            self.device = torch.device(self.device_type)
+
+    @property
+    def is_neuron(self):
+        return self.device_type == "neuron"
+
+
+@dataclass
+class LoRAConfig:
+    max_lora_rank: int
+    max_loras: int
+    max_cpu_loras: Optional[int] = None
+    lora_dtype: Optional[torch.dtype] = None
+    lora_extra_vocab_size: int = 256
+    # This is a constant.
+    lora_vocab_padding_size: ClassVar[int] = 256
+
+    def __post_init__(self):
+        # Keep this in sync with csrc/punica/bgmv/bgmv_config.h
+        possible_max_ranks = (8, 16, 32, 64)
+        possible_lora_extra_vocab_size = (0, 256, 512)
+        if self.max_lora_rank not in possible_max_ranks:
+            raise ValueError(
+                f"max_lora_rank ({self.max_lora_rank}) must be one of "
+                f"{possible_max_ranks}.")
+        if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size:
+            raise ValueError(
+                f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
+                f"must be one of {possible_lora_extra_vocab_size}.")
+        if self.max_loras < 1:
+            raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")
+        if self.max_cpu_loras is None:
+            self.max_cpu_loras = self.max_loras
+        elif self.max_cpu_loras < self.max_loras:
+            raise ValueError(
+                f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
+                f"max_loras ({self.max_loras})")
+
+    def verify_with_model_config(self, model_config: ModelConfig):
+        if self.lora_dtype in (None, "auto"):
+            self.lora_dtype = model_config.dtype
+        elif isinstance(self.lora_dtype, str):
+            self.lora_dtype = getattr(torch, self.lora_dtype)
+        if model_config.quantization is not None:
+            raise ValueError(
+                "LoRA is not supported with quantized models yet.")
+
+    def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
+        if scheduler_config.max_num_batched_tokens > 65528:
+            raise ValueError(
+                "Due to limitations of the custom LoRA CUDA kernel, "
+                "max_num_batched_tokens must be <= 65528 when "
+                "LoRA is enabled.")
+
+
+_STR_DTYPE_TO_TORCH_DTYPE = {
+    "half": torch.float16,
+    "float16": torch.float16,
+    "float": torch.float32,
+    "float32": torch.float32,
+    "bfloat16": torch.bfloat16,
+}
+
+_ROCM_NOT_SUPPORTED_DTYPE = ["float", "float32"]
+
+
+def _get_and_verify_dtype(
+    config: PretrainedConfig,
+    dtype: Union[str, torch.dtype],
+) -> torch.dtype:
+    # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
+    # because config.torch_dtype can be None.
+    config_dtype = getattr(config, "torch_dtype", None)
+    if config_dtype is None:
+        config_dtype = torch.float32
+
+    if isinstance(dtype, str):
+        dtype = dtype.lower()
+        if dtype == "auto":
+            if config_dtype == torch.float32:
+                # Following the common practice, we use float16 for float32
+                # models.
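+                # For example, a checkpoint whose config.json declares
+                # torch_dtype=float32 is served in float16 under "auto".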
+                torch_dtype = torch.float16
+            else:
+                torch_dtype = config_dtype
+        else:
+            if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
+                raise ValueError(f"Unknown dtype: {dtype}")
+            torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
+    elif isinstance(dtype, torch.dtype):
+        torch_dtype = dtype
+    else:
+        raise ValueError(f"Unknown dtype: {dtype}")
+
+    if is_hip() and torch_dtype == torch.float32:
+        rocm_supported_dtypes = [
+            k for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items()
+            if (k not in _ROCM_NOT_SUPPORTED_DTYPE)
+        ]
+        raise ValueError(f"dtype '{dtype}' is not supported in ROCm. "
+                         f"Supported dtypes are {rocm_supported_dtypes}")
+
+    # Verify the dtype.
+    if torch_dtype != config_dtype:
+        if torch_dtype == torch.float32:
+            # Upcasting to float32 is allowed.
+            pass
+        elif config_dtype == torch.float32:
+            # Downcasting from float32 to float16 or bfloat16 is allowed.
+            pass
+        else:
+            # Casting between float16 and bfloat16 is allowed with a warning.
+            logger.warning(f"Casting {config_dtype} to {torch_dtype}.")
+
+    return torch_dtype
+
+
+def _get_and_verify_max_len(
+    hf_config: PretrainedConfig,
+    max_model_len: Optional[int],
+) -> int:
+    """Get and verify the model's maximum length."""
+    derived_max_model_len = float("inf")
+    possible_keys = [
+        # OPT
+        "max_position_embeddings",
+        # GPT-2
+        "n_positions",
+        # MPT
+        "max_seq_len",
+        # ChatGLM2
+        "seq_length",
+        # Others
+        "model_max_length",
+        "max_sequence_length",
+        "max_seq_length",
+        "seq_len",
+    ]
+    for key in possible_keys:
+        max_len_key = getattr(hf_config, key, None)
+        if max_len_key is not None:
+            derived_max_model_len = min(derived_max_model_len, max_len_key)
+    if derived_max_model_len == float("inf"):
+        if max_model_len is not None:
+            # If max_model_len is specified, we use it.
+            return max_model_len
+
+        default_max_len = 2048
+        logger.warning(
+            "The model's config.json does not contain any of the following "
+            "keys to determine the original maximum length of the model: "
+            f"{possible_keys}. Assuming the model's maximum length is "
+            f"{default_max_len}.")
+        derived_max_model_len = default_max_len
+
+    rope_scaling = getattr(hf_config, "rope_scaling", None)
+    if rope_scaling is not None:
+        assert "factor" in rope_scaling
+        scaling_factor = rope_scaling["factor"]
+        if "type" in rope_scaling:
+            rope_type = rope_scaling["type"]
+        elif "rope_type" in rope_scaling:
+            rope_type = rope_scaling["rope_type"]
+        else:
+            raise ValueError(
+                "rope_scaling must have a 'type' or 'rope_type' key.")
+
+        if rope_type == "yarn":
+            derived_max_model_len = rope_scaling[
+                "original_max_position_embeddings"]
+        derived_max_model_len *= scaling_factor
+
+    if max_model_len is None:
+        max_model_len = derived_max_model_len
+    elif max_model_len > derived_max_model_len:
+        raise ValueError(
+            f"User-specified max_model_len ({max_model_len}) is greater than "
+            f"the max_model_len ({derived_max_model_len}) derived from the "
+            "model's config.json. This may lead to incorrect model "
+            "outputs or CUDA errors. Make sure the value is correct and "
+            "within the model context size.")
+    return int(max_model_len)
diff --git a/vllm/core/__init__.py b/vllm/core/__init__.py
new file mode 100644
index 0000000..e69de29
zS5!QrVj2;+MvfuhMzM}7pumxQGt{l`=D!;nzjYkrJ0s)$Pw8ywMCtW&nd4={#b@^C z4z|Cg+-a}0dfFaDyEK|@DY8^-Sme4#-}|fV z2uc{@pY;N_Vq?tzDMo3CwMbi=WXvdRcLNpd3-l`MCUB(GOyV6TckEtOA7QTn0?N@X S&DGCfXTL&$#w=Yb75@+9Kx|L| literal 0 HcmV?d00001 diff --git a/vllm/core/__pycache__/policy.cpython-310.pyc b/vllm/core/__pycache__/policy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..016e208bb2b17cf855163cf3aa1cd56538d12f85 GIT binary patch literal 1779 zcmah~OK;mo5Z+xbACe{4se{C6n-=M%V1Y<2dTd}AaAU)0p$3sJ;f0{ND}^>G%Da>U z2|CpVl1qE-wFP?dkLkIP*8QLV82;`M z@+T^bi-WQOq<4V{B4|zu8c~We%UR(>j+LF97jEQIvP}dR?ge3g4Uw?LG>lRBo$sGEUKMarvNZ0BK~BL{yN7 z2`U}IBo~f!2A++DN3MPANl)^71a|Bbc6||?@W>Yph@)w|CuN?cQ{#-v7p9@*VLU{? z##6GIs8I^|Tr89+&?b;ZkSg*6J*7Pef3;UuJHR9Q7`z~?jr#@*R!ol0kM-!v~$(${jqxG*RE5wMC>bk1IqmvjJl3U(3* zNAMq=V+uIAy%ZBaUnpb#t3bL1>=pfmydrb*E9(=5F@;pUS3hjtti}LMEdpv8BojQ! zlVVRK>uaFdELvn1o@Jp+RmFSLi%dI@EM9sYg!Yw@|YFSFZX%BaK_kWt7nt5P!qp&hGw4OuaBLK^%{gLdLOa#&J=KNse+W zjt?hEezxNd@-nFuLS|u6h`PFi4Ete$p>yQ|L)i4kzyiW>{lgsghVkZW=A-6;=JDm% z);0yROA9mJ0MefV`=3kVL3w?{w>G!>?;2GFaPV5cAh@>o5!N|$%RbqsYvjD-Ebc#)`~z5=dk+7V#Gc!+*5fZJ@gb}8H%Y-`jD8&{!5 zq!jj_SxWOnYpCq=QW!^9%J!UkfCe{J0EfwEKpKBC0pSjR%lH~q_#Pa=_sTq%X_b{D zeRf}~={Oq=?X8H~IP*F%wd_xzt~OT4*3N$9Ca*9{>btj&v&qh+7HQEITXt6Nyx;s6 DWoUCo literal 0 HcmV?d00001 diff --git a/vllm/core/__pycache__/scheduler.cpython-310.pyc b/vllm/core/__pycache__/scheduler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d79590b373cd80a0f159fa3a614496ab3246f0ba GIT binary patch literal 12463 zcma)CTWlQHdEUA1?Cgch6~&9J%j#lTE6bD>J56e+sufXoHAtUkVKR)bu5NXdjFqPU2YozB4n+ zU6OK4V*Wk%^Pm6x=f9oPD;Dhx{tkTaKbHULS|;-&W=8*XWS+(6|1W@>ag}zaqd1Df zwAxl3RsL#@hF`6%cMQkqn2y=WI@ylpSe=}c>)4Ln$vgQ@!70dodb`*uIi=2oGtntK zWm#slCp#6V(%Iwek$JN{)!FOpmFaAIx-;X<$h6g-?aVoIGM#Jh>pbE-(%J9q?;LOr zbPhTPJBOS@vd(T-JBOXaO2(V>j%+B-(dRR6-YvY7aSNWha%@9$9z{vfEuo~ep*qKr zn{dm>m63bQRj*}glRw6qGc_eLTyL%KE!M2aSYRsB7h26AvMvRlXxwNc|6UA6K$hh8LMX~X67x_rL<^?q^%3o+SZ+aJc-KExYWWU(EdOk@{Ttm6rZ+jvxsHsu@ ze7oIiUJDvQ-;XB0g7mFxtBt1jQlr~g_5^S$Ug@=4&AZPvnn6$8MM0h;UL|^dz-+7A z3hM3NGMgKbb&WOMrWX~`)IuX@M8!1syy*2;qrGF$RJt(&(Rx%(M9XDu9=96gLAtQ5?oqPanaoyE zwSzK$-nOgwJ2PLscBiqr+UhPpBU9C0Kd1&bz3L6p$glR6WNtMc(Q~VQvTnZ`^r{!G zyj<;g9S}^8*65-h!?8=JdZW?2Rc&>vcW!!JImlMmhVZJw^ZOmootHhoFkijuHG7@a zKDG#Z{7iMBX2JvDPK6~m@)j)WduDiS)Jzoyi_3l(x1&|6rE>epe zYt`%BM#rnyBfDM)DZyq)7wYx3exse#Wb1Xe*F>@Ib^D!2Z}fv+q+fgW{FNwo_2Pxg zFI{>0`o#}2;t2Lrf}fu|-S@?5ki5}8-EQ4Dy?Qse+3P+vf9A>4ek<^v0I%Kxqxz?B zx7(f5&7Sa1`ze#puih1N7^#BKw*ah+u2@z zdj;f+UWxtPvOD>X4rwys<{%#`QF(ZBFZToRf*(z#%XI5tpZYRSl^@yhw$w$x8&yE! 
z`i(|Fa@2#~Ew76c9&<0Ub@F4qg=X7|z~@nEtE|~uy&F|p%UzImY${WDL0@zio1|>| z$23pn1$_Q%0HG3Q>X}dlR6-3<4Rt^*GywI`1T;cj%!Y=TeIfI<5@u0i27oKp2K(la z%DKudU7QLtoA&(-rS3KPeV{H1oZw6?FCN8@I8N{w!3lyI!AXM00V1vKb)#atCmMAT z3oqZvkwh#4W$J!U1oaztWxn2WeeksId+nvqG58k4ildwhXF*EE`eEe!Spd^EmGr-$ zno2>nm4c!_SfAUzv-4@+CfAn!G0jgBwu%JEk||_*6Zln=p@64RO@ehsx)i8}wJAV+LlK$JV{dja**x#v*pbKpv5{mhO9jP8&> zLb9|sW|C)jHzJ+JpbPju;^<{{{orShUMq`zXeb^b*iT>(>;;JQ8?YZD4a_0Ppu!~h z0>KvvxE+xN*}3Y8;I1I|i%9~aFtYR`%pN5;25<wq&U0tRj0h(P`{2hupLH{2$_wkLn)i9UXedpk8rx*I2S# z)6GI(K)*m2<)AB|VWh6GUE9mcHoCMxbq9Ke8pWH4wMsIA?#3FYob;Ev#+gK}g3&7O z9*n*RxhdqP+`Y){MQ+-g@n+{TER`B++MPkooIC67bLa38H`CqcK7!Im-2F+3?jCRt zqGZ4OlzYgnB6VN{%fs#wlpK^LN4-N{1#?sq{TYvU%zYFssyV}bTDCeaTLDk3~fV zi&1XKmQmSHZI600Hopr^+Tv+07Ey7PAm=PL$&TqC)8tHfp5H=Zr}hk>Lm?EQLjjEq zN>pge6^keTp468C#ip{7m#`pVQNj`=Fm>-lP>x$j{W=*`HXsD*dy?O&N>?v<4qclUb{|JJLQaQ>$<>B((GlFZKG6FY`7zKrC^KS1X!Qh!fU+6mr*;05_RbC z*!(Rrpp6nJ8>+BEWl0$*ff_0s$^eIDg~x}7$5_%9GOwQ*_9x^>WFhbR@Dw(x2 zp|N6y>TT$SP~Xgk2A3A9n-9N1LkF@dZTae*R&Z0gU(2oA@DndB zB%UeUt?GrF9$m=T1x>xjhxL;l8hi1t<->FHe6JhO18sEgrjD!+|I(|%b?Ko5pTD)}as}~pQ=f86K>h+5YZ)`2=GNu!ETE2Ix>h?Ix zO&stM@}3vV{xf6qZCR7gRIdZqOA9Cb+K_a~N|FtL``_T6Zq2t?n~3a|-|7N4Mn*J0 z5CC2vcu2%m0-9uktSw#zh~Qn3+>u4m2Fmz|v-tIt7-9x(obvW1!tRudk1W!L-D*~R z@HnuO-&%$5+uMPwIE#ZsnV z*jo5>%~ou+pr~(I^4C&Slq3I%`b+&MBi_aQw()jYyF(YKf|)3JRxLn9LK?496l&Ya zx|=-QbF<;s`v{T6>?LXZeGK{$1zbg0Kd>c!c4@f-u~Z~V?ktZW@2frF_j{z zi+N1+c`!Xd7N8;=$2Qrk_c4kT9(A9t>+@N%&%WjBTV7eOJJ>-&_F2ZsH3%zndb|W*Od1Ae* zh$??wgjNRmFu#%u?3KK0+*1aHpb!=|6xY0^`Y*hYakCo=!U6@h+$;i@Vn}nQgn&Rl znAj}4R#^N&#$}kG^n=X%x|G!g$YFC(!kCj`sa{!YhQ{WeFfZ#c>*f^02y0KG&)&^x zmJcSviC|`B_LeH%epgwm%G?}se={%pOmw~!BjXU>yD!CV_7#j$i=u}SM<=@JQkM6DFyBI z=nyk~NJ-4|r407n9F~B#{?i!U+B_cGatsq(b1F12@$)PW4LHVzfVI~m;5sgPMTpE#`!*fxtri2?&V2hi5b70 z)ZdZykE4DTYd96ohU$G4x_ky0&l0~bHS?)3KTWG}(W8WW~V4 zeYzml67+<+@;1R0IU30xypG-H`UL)tFC=H+?9`j#e}StT#Ve@7^fs7 zf=Z`BB}Gzclwv8baKE-rB4x_Iac$t2IqnB2H8D<&K?Qt+UTzuuE_q}bS1T!x)FyWi zaQF~rAJMe`CfMlpECU>ynfsYnc@&$?N)C# zVCeO_Z5W4LFNyRIo6fJ7E_D%tPRmcMpTvy^qsui~mqD*rZTE0~7PRHD}>$U9X2*>UqF10g>E0zdoB#32#hY|5Ks3dNo#g38E zZ4&L_NS)6b=?q7ryR~B^Xj+Mrv>4=u+wq163eI}wo|0_5^b|x7tru&WW6=6>1%}z=;5xoq6&@V@}(##YRKEIe1h-11+wbfxr@LdjOF z|1&EUjgC~j!oqNc3?{|vF_me~fYD8fBl>boiJJ-BVw^ZK9WLv?3RTOHbkg-0vjmzy z({t;4uBGA2`61QE)tB3}87sl%Ao6R6f;@T^FlrGk^pz6I z^_2;HWqgweds?txwarSH<4kKsSdj|vi*WfN#U^}U>Hg;6K*#(Z$wMO_98CC)&T1RN zpuXA=jgB7`Vto@ZPUg#75QG$EbTBfc%Us(dzJ-+w8L4^{DLe*g+`U0}JAi{OeE91L z8FB3jF)N0GQ>>vmoOB#1Tv6#nKuSJmf+BQeVnHJyjs_fO>h}nEBP0HR;13C21*qAv z40(qI8vxD>2bW%ay^}nusJHQEB(`IIm+gL^KpypPvqxoTkL_pBN*8pC+Jr*i(!rGS z5b-cpa1d-jm>XbK95?yN2NxqPI|C)y@#_y zpBo+qeCvl_^U>t-c@rkhi(G02`v`|_vnp{7_YJgrs2TAd5s&R$Gh-^H2kT$lk`)sD z(LhQPn%Kqc1%bHy*vQx|EHnGC!M>Z4Zl5x8Pdfjw=3_g@T}yXPhNmE8hB0f3t3ZQ; z5sM^qz#Kxsb#tvq;_{UNtuI)flF6*0g|c<7`&<(bx3q0i4wTXE21~cG!T@D|mQu8U zgG1djCVhu<&?Xo=6Im1t&jh)USGUq+jPXAV0gN#BCW6`sYd{3MnZ6=Ug!*ghz<>b8 z$`IH-3TN9uK*flI*K2U%m+4#?5V%=+uKpB5*UW9QE0$LI5#uHnL)rMq$psT*d3YiX zFGOI)h-NI>@cad`QA)Y(1o6MS%&|Uuse8MDJ1<7hlLu$=b=f?ah(@V=fFuqxRRxIlrj{e$pA4UB zMdi_A+DB38yzk?R8cV$>L{Fr5QIhb|hWsAK+{SKiq3rKc>ROYet_ky@1hy+H*7)61 zg7Db!A^nClq+vbEay%{}R_}sh{)BwOqe8P65Iam5q?ci{7i`Kk*(@Un5ut7^Q)O2Rm?$8m!w!gug)9zmNz8BuY9K zj{lNU*47VgNwQIT54U0L_)&}uX+hCS3n=9UtWDX3a7{TT2}je0Bzyrs%O_KTXMjP4}MlI18>I6{wj=@5z8jZ@)6$^ICoo8&=?U4baJ9>2Z95p)~@^W zHd}DHv@r_?%?T+LM=wT`=^wt%7sl4<+kxsrO(J~EwZklgMC96~Y*fg`MqsYsb*MB( z&4G@bg|+9__H*rE&Gm1b=T#1#ce;ICHudCvQ=9KM5zOEtPoh-y`C@jydZq0(d=K|( zO}w5&LSAYO>)E?;y9c3w>n6YI@%+xmn#GREEB#-&M!vx-TbsZxWm`w_xDy3X~AtpgxnK0*8S7DsB)o3CcIZ~@>FKK~s6KJKAc$;Uk= 
z9`~q*{R{PIS*d6T_0qADR?;%Rv+u>}YDKfPimn=LTapif>`&B+{t5H;Xx{_zX!bAF zVp^x_Xlsq-p!wo{s$Tj;VpKa3<(s{B+k*?;>-w=F#m92e6^YE?t~^{k^zrH?Nlj+u zc1ZF`g7b(udA^vPPu`-)O_Te@J#=Q0I^oU7UP<~|f None: + self.device = device + self.block_size = block_size + self.num_blocks = num_blocks + + # Initialize the free blocks. + self.free_blocks: BlockTable = [] + for i in range(num_blocks): + block = PhysicalTokenBlock(device=device, + block_number=i, + block_size=block_size) + self.free_blocks.append(block) + + def allocate(self) -> PhysicalTokenBlock: + if not self.free_blocks: + raise ValueError("Out of memory! No free blocks are available.") + block = self.free_blocks.pop() + block.ref_count = 1 + return block + + def free(self, block: PhysicalTokenBlock) -> None: + if block.ref_count == 0: + raise ValueError(f"Double free! {block} is already freed.") + block.ref_count -= 1 + if block.ref_count == 0: + self.free_blocks.append(block) + + def get_num_free_blocks(self) -> int: + return len(self.free_blocks) + + +class AllocStatus(enum.Enum): + """Result for BlockSpaceManager.can_allocate + + 1. Ok: seq_group can be allocated now. + 2. Later: seq_group cannot be allocated. + The capacity of allocator is larger than seq_group required. + 3. Never: seq_group can never be allocated. + The seq_group is too large to allocated in GPU. + """ + OK = enum.auto() + LATER = enum.auto() + NEVER = enum.auto() + + +class BlockSpaceManager: + """Manages the mapping between logical and physical token blocks.""" + + def __init__( + self, + block_size: int, + num_gpu_blocks: int, + num_cpu_blocks: int, + watermark: float = 0.01, + sliding_window: Optional[int] = None, + ) -> None: + self.block_size = block_size + self.num_total_gpu_blocks = num_gpu_blocks + self.num_total_cpu_blocks = num_cpu_blocks + + self.block_sliding_window = None + if sliding_window is not None: + assert sliding_window % block_size == 0, (sliding_window, + block_size) + self.block_sliding_window = sliding_window // block_size + + self.watermark = watermark + assert watermark >= 0.0 + + self.watermark_blocks = int(watermark * num_gpu_blocks) + self.gpu_allocator = BlockAllocator(Device.GPU, block_size, + num_gpu_blocks) + self.cpu_allocator = BlockAllocator(Device.CPU, block_size, + num_cpu_blocks) + # Mapping: seq_id -> BlockTable. + self.block_tables: Dict[int, BlockTable] = {} + + def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: + # FIXME(woosuk): Here we assume that all sequences in the group share + # the same prompt. This may not be true for preempted sequences. + seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] + num_required_blocks = len(seq.logical_token_blocks) + + if seq_group.prefix is not None and seq_group.prefix.allocated: + num_required_blocks -= seq_group.prefix.get_num_blocks() + + if self.block_sliding_window is not None: + num_required_blocks = min(num_required_blocks, + self.block_sliding_window) + num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks() + + # Use watermark to avoid frequent cache eviction. + if (self.num_total_gpu_blocks - num_required_blocks < + self.watermark_blocks): + return AllocStatus.NEVER + if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks: + return AllocStatus.OK + else: + return AllocStatus.LATER + + def allocate(self, seq_group: SequenceGroup) -> None: + # NOTE: Here we assume that all sequences in the group have the same + # prompt. 
+        seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0]
+
+        # Allocate new physical token blocks that will store the prompt tokens.
+        num_prompt_blocks = len(seq.logical_token_blocks)
+
+        block_table: BlockTable = []
+        prefix_block_table: BlockTable = []
+        num_prefix_blocks = 0
+
+        prefix = seq_group.prefix
+        if prefix is not None and prefix.allocated:
+            # Prefix has already been allocated. Use the existing block table.
+            num_prompt_blocks -= prefix.get_num_blocks()
+            for block in prefix.block_table:
+                block.ref_count += seq_group.num_seqs()
+                block_table.append(block)
+
+        for logical_idx in range(num_prompt_blocks):
+            if (self.block_sliding_window is not None
+                    and logical_idx >= self.block_sliding_window):
+                block = block_table[logical_idx % self.block_sliding_window]
+            else:
+                block = self.gpu_allocator.allocate()
+            # Set the reference counts of the token blocks.
+            block.ref_count = seq_group.num_seqs()
+            block_table.append(block)
+
+        if prefix is not None and not prefix.allocated:
+            # Allocate blocks for the prefix; we will compute the prefix's
+            # KV cache in this run.
+            num_prefix_blocks = prefix.get_num_blocks()
+            prefix_block_table = block_table[:num_prefix_blocks]
+            for block in prefix_block_table:
+                block.ref_count += 1
+            prefix.set_block_table(prefix_block_table)
+
+        # Assign the block table for each sequence.
+        for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
+            self.block_tables[seq.seq_id] = block_table.copy()
+
+    def can_append_slot(self, seq_group: SequenceGroup) -> bool:
+        # Simple heuristic: If there is at least one free block
+        # for each sequence, we can append.
+        num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
+        num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING)
+        return num_seqs <= num_free_gpu_blocks
+
+    def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]:
+        """Allocate a physical slot for a new token."""
+        logical_blocks = seq.logical_token_blocks
+        block_table = self.block_tables[seq.seq_id]
+
+        if len(block_table) < len(logical_blocks):
+            if (self.block_sliding_window
+                    and len(block_table) >= self.block_sliding_window):
+                # Reuse a block.
+                block_table.append(block_table[len(block_table) %
+                                               self.block_sliding_window])
+            else:
+                # The sequence has a new logical block.
+                # Allocate a new physical block.
+                block = self.gpu_allocator.allocate()
+                block_table.append(block)
+                return None
+
+        # We want to append the token to the last physical block.
+        last_block = block_table[-1]
+        assert last_block.device == Device.GPU
+        if last_block.ref_count == 1:
+            # Not shared with other sequences. Appendable.
+            return None
+        else:
+            # The last block is shared with other sequences.
+            # Copy on Write: Allocate a new block and copy the tokens.
+            new_block = self.gpu_allocator.allocate()
+            block_table[-1] = new_block
+            self.gpu_allocator.free(last_block)
+            return last_block.block_number, new_block.block_number
+
+    def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
+        # NOTE: fork does not allocate a new physical block.
+        # Thus, it is always safe from OOM.
+        src_block_table = self.block_tables[parent_seq.seq_id]
+        self.block_tables[child_seq.seq_id] = src_block_table.copy()
+        for block in src_block_table:
+            block.ref_count += 1
+
+    def _get_physical_blocks(
+            self, seq_group: SequenceGroup) -> List[PhysicalTokenBlock]:
+        # NOTE: Here, we assume that the physical blocks are only shared by
+        # the sequences in the same group.
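+        # A set is used so that a block shared by several sequences in the
+        # group (e.g. forked beam candidates) is counted only once.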
+        blocks: Set[PhysicalTokenBlock] = set()
+        for seq in seq_group.get_seqs():
+            if seq.is_finished():
+                continue
+            blocks.update(self.block_tables[seq.seq_id])
+        return list(blocks)
+
+    def can_swap_in(self, seq_group: SequenceGroup) -> bool:
+        blocks = self._get_physical_blocks(seq_group)
+        num_swapped_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED)
+        num_free_blocks = self.gpu_allocator.get_num_free_blocks()
+        # NOTE: Conservatively, we assume that every sequence will allocate
+        # at least one free block right after the swap-in.
+        # NOTE: This should match the logic in can_append_slot().
+        num_required_blocks = len(blocks) + num_swapped_seqs
+        return num_free_blocks - num_required_blocks >= self.watermark_blocks
+
+    def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]:
+        # CPU block -> GPU block.
+        if seq_group.prefix is not None:
+            # Make sure to swap in the prefix first.
+            assert seq_group.prefix.allocated and seq_group.prefix.computed
+
+        mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
+        for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
+            new_block_table: BlockTable = []
+            block_table = self.block_tables[seq.seq_id]
+            if seq_group.prefix is not None:
+                for block in seq_group.prefix.block_table:
+                    new_block_table.append(block)
+                    block.ref_count += 1
+
+            for cpu_block in block_table:
+                if cpu_block in mapping:
+                    gpu_block = mapping[cpu_block]
+                    gpu_block.ref_count += 1
+                else:
+                    gpu_block = self.gpu_allocator.allocate()
+                    mapping[cpu_block] = gpu_block
+                new_block_table.append(gpu_block)
+                # Free the CPU block swapped in to GPU.
+                self.cpu_allocator.free(cpu_block)
+            self.block_tables[seq.seq_id] = new_block_table
+
+        block_number_mapping = {
+            cpu_block.block_number: gpu_block.block_number
+            for cpu_block, gpu_block in mapping.items()
+        }
+        return block_number_mapping
+
+    def can_swap_out(self, seq_group: SequenceGroup) -> bool:
+        blocks = self._get_physical_blocks(seq_group)
+        return len(blocks) <= self.cpu_allocator.get_num_free_blocks()
+
+    def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]:
+        # GPU block -> CPU block.
+        mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
+        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
+            new_block_table: BlockTable = []
+            block_table = self.block_tables[seq.seq_id]
+
+            for gpu_block in block_table:
+                if (seq_group.prefix is not None
+                        and gpu_block in seq_group.prefix.block_table):
+                    # NOTE: We do not swap out the prefix blocks for now.
+                    self.gpu_allocator.free(gpu_block)
+                    continue
+
+                if gpu_block in mapping:
+                    cpu_block = mapping[gpu_block]
+                    cpu_block.ref_count += 1
+                else:
+                    cpu_block = self.cpu_allocator.allocate()
+                    mapping[gpu_block] = cpu_block
+                new_block_table.append(cpu_block)
+                # Free the GPU block swapped out to CPU.
+                self.gpu_allocator.free(gpu_block)
+            self.block_tables[seq.seq_id] = new_block_table
+
+        block_number_mapping = {
+            gpu_block.block_number: cpu_block.block_number
+            for gpu_block, cpu_block in mapping.items()
+        }
+        return block_number_mapping
+
+    def _free_block_table(self, block_table: BlockTable) -> None:
+        for block in set(block_table):
+            if block.device == Device.GPU:
+                self.gpu_allocator.free(block)
+            else:
+                self.cpu_allocator.free(block)
+
+    def free(self, seq: Sequence) -> None:
+        if seq.seq_id not in self.block_tables:
+            # Already freed or haven't been scheduled yet.
+            return
+        block_table = self.block_tables[seq.seq_id]
+        self._free_block_table(block_table)
+        del self.block_tables[seq.seq_id]
+
+    def reset(self) -> None:
+        for block_table in self.block_tables.values():
+            self._free_block_table(block_table)
+        self.block_tables.clear()
+
+    def get_block_table(self, seq: Sequence) -> List[int]:
+        block_table = self.block_tables[seq.seq_id]
+        return [block.block_number for block in block_table]
+
+    def get_num_free_gpu_blocks(self) -> int:
+        return self.gpu_allocator.get_num_free_blocks()
+
+    def get_num_free_cpu_blocks(self) -> int:
+        return self.cpu_allocator.get_num_free_blocks()
diff --git a/vllm/core/policy.py b/vllm/core/policy.py
new file mode 100644
index 0000000..2e9ebbd
--- /dev/null
+++ b/vllm/core/policy.py
@@ -0,0 +1,47 @@
+from collections import deque
+from typing import Deque
+
+from vllm.sequence import SequenceGroup
+
+
+class Policy:
+
+    def get_priority(
+        self,
+        now: float,
+        seq_group: SequenceGroup,
+    ) -> float:
+        raise NotImplementedError
+
+    def sort_by_priority(
+        self,
+        now: float,
+        seq_groups: Deque[SequenceGroup],
+    ) -> Deque[SequenceGroup]:
+        return deque(
+            sorted(
+                seq_groups,
+                key=lambda seq_group: self.get_priority(now, seq_group),
+                reverse=True,
+            ))
+
+
+class FCFS(Policy):
+
+    def get_priority(
+        self,
+        now: float,
+        seq_group: SequenceGroup,
+    ) -> float:
+        return now - seq_group.metrics.arrival_time
+
+
+class PolicyFactory:
+
+    _POLICY_REGISTRY = {
+        'fcfs': FCFS,
+    }
+
+    @classmethod
+    def get_policy(cls, policy_name: str, **kwargs) -> Policy:
+        return cls._POLICY_REGISTRY[policy_name](**kwargs)
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
new file mode 100644
index 0000000..5e7cc30
--- /dev/null
+++ b/vllm/core/scheduler.py
@@ -0,0 +1,498 @@
+from collections import deque
+import enum
+import time
+from typing import Deque, Dict, Iterable, List, Optional, Tuple, Union, Set
+
+from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
+from vllm.core.block_manager import AllocStatus, BlockSpaceManager
+from vllm.core.policy import PolicyFactory
+from vllm.lora.request import LoRARequest
+from vllm.logger import init_logger
+from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
+                           SequenceGroupMetadata, SequenceStatus)
+from vllm.prefix import PrefixPool
+
+logger = init_logger(__name__)
+
+
+class PreemptionMode(enum.Enum):
+    """Preemption modes.
+
+    1. Swapping: Swap out the blocks of the preempted sequences to CPU memory
+    and swap them back in when the sequences are resumed.
+    2. Recomputation: Discard the blocks of the preempted sequences and
+    recompute them when the sequences are resumed, treating the sequences as
+    new prompts.
+    """
+    SWAP = enum.auto()
+    RECOMPUTE = enum.auto()
+
+
+class SchedulerOutputs:
+
+    def __init__(
+        self,
+        scheduled_seq_groups: Iterable[SequenceGroup],
+        prompt_run: bool,
+        num_batched_tokens: int,
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+        ignored_seq_groups: List[SequenceGroup],
+    ) -> None:
+        self.scheduled_seq_groups = scheduled_seq_groups
+        self.prompt_run = prompt_run
+        self.num_batched_tokens = num_batched_tokens
+        self.blocks_to_swap_in = blocks_to_swap_in
+        self.blocks_to_swap_out = blocks_to_swap_out
+        self.blocks_to_copy = blocks_to_copy
+        # Swap in and swap out should never happen at the same time.
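+        # A single scheduler step either frees GPU blocks (swap out) or
+        # consumes them (swap in), never both, so the two mappings are
+        # mutually exclusive.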
+        assert not (blocks_to_swap_in and blocks_to_swap_out)
+        self.ignored_seq_groups = ignored_seq_groups
+
+        self.num_loras = len(self.lora_requests)
+        if self.num_loras > 0:
+            self._sort_by_lora_ids()
+
+    def is_empty(self) -> bool:
+        # NOTE: We do not consider the ignored sequence groups.
+        return (not self.scheduled_seq_groups and not self.blocks_to_swap_in
+                and not self.blocks_to_swap_out and not self.blocks_to_copy)
+
+    def _sort_by_lora_ids(self) -> None:
+        self.scheduled_seq_groups = sorted(
+            self.scheduled_seq_groups,
+            key=lambda g: (g.lora_request.lora_int_id
+                           if g.lora_request else 0, g.request_id))
+
+    @property
+    def lora_requests(self) -> Set[LoRARequest]:
+        return {g.lora_request for g in self.scheduled_seq_groups}
+
+
+class Scheduler:
+
+    def __init__(
+        self,
+        scheduler_config: SchedulerConfig,
+        cache_config: CacheConfig,
+        lora_config: Optional[LoRAConfig],
+    ) -> None:
+        self.scheduler_config = scheduler_config
+        self.cache_config = cache_config
+        # Note for LoRA scheduling: the current policy is extremely
+        # simple and NOT fair. It can lead to starvation of some
+        # LoRAs. This should be improved in the future.
+        self.lora_config = lora_config
+
+        self.prompt_limit = min(self.scheduler_config.max_model_len,
+                                self.scheduler_config.max_num_batched_tokens)
+
+        # Instantiate the scheduling policy.
+        self.policy = PolicyFactory.get_policy(policy_name="fcfs")
+        # Create the block space manager.
+        self.block_manager = BlockSpaceManager(
+            block_size=self.cache_config.block_size,
+            num_gpu_blocks=self.cache_config.num_gpu_blocks,
+            num_cpu_blocks=self.cache_config.num_cpu_blocks,
+            sliding_window=self.cache_config.sliding_window)
+
+        # Create the prefix pool to cache the prefixes.
+        self.prefix_pool = PrefixPool(self.cache_config.block_size)
+
+        # Sequence groups in the WAITING state.
+        self.waiting: Deque[SequenceGroup] = deque()
+        # Sequence groups in the RUNNING state.
+        self.running: Deque[SequenceGroup] = deque()
+        # Sequence groups in the SWAPPED state.
+        self.swapped: Deque[SequenceGroup] = deque()
+
+    @property
+    def lora_enabled(self) -> bool:
+        return bool(self.lora_config)
+
+    def add_seq_group(self, seq_group: SequenceGroup) -> None:
+        # Add sequence groups to the waiting queue.
+        self.waiting.append(seq_group)
+
+    def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
+        """Aborts a sequence group with the given ID.
+
+        Check if the sequence group with the given ID
+        is present in any of the state queues.
+        If present, remove the sequence group from the state queue.
+        Also, if any of the sequences in the sequence group is not finished,
+        free the sequence with status `FINISHED_ABORTED`.
+        Otherwise, do nothing.
+
+        Args:
+            request_id: The ID(s) of the sequence group to abort.
+        """
+        if isinstance(request_id, str):
+            request_id = (request_id, )
+        request_ids = set(request_id)
+        for state_queue in [self.waiting, self.running, self.swapped]:
+            aborted_groups: List[SequenceGroup] = []
+            for seq_group in state_queue:
+                if not request_ids:
+                    # Using 'break' here may add two extra iterations,
+                    # but is acceptable to reduce complexity.
+                    break
+                if seq_group.request_id in request_ids:
+                    # Append the aborted group to the pending list.
+                    aborted_groups.append(seq_group)
+                    request_ids.remove(seq_group.request_id)
+            for aborted_group in aborted_groups:
+                # Remove the sequence group from the state queue.
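+                # deque.remove() is O(n), which is acceptable here since
+                # aborts are rare relative to scheduling iterations.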
+                state_queue.remove(aborted_group)
+                for seq in aborted_group.get_seqs():
+                    if seq.is_finished():
+                        continue
+                    seq.status = SequenceStatus.FINISHED_ABORTED
+                    self.free_seq(seq)
+
+    def has_unfinished_seqs(self) -> bool:
+        return bool(self.waiting or self.running or self.swapped)
+
+    def get_num_unfinished_seq_groups(self) -> int:
+        return len(self.waiting) + len(self.running) + len(self.swapped)
+
+    def _schedule(self) -> SchedulerOutputs:
+        # Blocks that need to be swapped or copied before model execution.
+        blocks_to_swap_in: Dict[int, int] = {}
+        blocks_to_swap_out: Dict[int, int] = {}
+        blocks_to_copy: Dict[int, List[int]] = {}
+
+        # Fix the current time.
+        now = time.monotonic()
+
+        # Join waiting sequences if possible.
+        if not self.swapped:
+            ignored_seq_groups: List[SequenceGroup] = []
+            scheduled: List[SequenceGroup] = []
+            # The total number of sequences on the fly, including the
+            # requests in the generation phase.
+            num_curr_seqs = sum(seq_group.get_max_num_running_seqs()
+                                for seq_group in self.running)
+            curr_loras = set(
+                seq_group.lora_int_id
+                for seq_group in self.running) if self.lora_enabled else None
+            seq_lens: List[int] = []
+
+            # Optimization: We do not sort the waiting queue since the preempted
+            # sequence groups are added to the front and the new sequence groups
+            # are added to the back.
+            leftover_waiting_sequences: Deque[SequenceGroup] = deque()
+            while self.waiting:
+                seq_group = self.waiting[0]
+                waiting_seqs = seq_group.get_seqs(
+                    status=SequenceStatus.WAITING)
+                assert len(waiting_seqs) == 1, (
+                    "Waiting sequence group should have only one prompt "
+                    "sequence.")
+                num_prompt_tokens = waiting_seqs[0].get_len()
+                if num_prompt_tokens > self.prompt_limit:
+                    logger.warning(
+                        f"Input prompt ({num_prompt_tokens} tokens) is too long"
+                        f" and exceeds limit of {self.prompt_limit}")
+                    for seq in waiting_seqs:
+                        seq.status = SequenceStatus.FINISHED_IGNORED
+                    ignored_seq_groups.append(seq_group)
+                    self.waiting.popleft()
+                    continue
+
+                # If the sequence group cannot be allocated, stop.
+                can_allocate = self.block_manager.can_allocate(seq_group)
+                if can_allocate == AllocStatus.LATER:
+                    break
+                elif can_allocate == AllocStatus.NEVER:
+                    logger.warning(
+                        f"Input prompt ({num_prompt_tokens} tokens) is too long"
+                        f" and exceeds the capacity of block_manager")
+                    for seq in waiting_seqs:
+                        seq.status = SequenceStatus.FINISHED_IGNORED
+                    ignored_seq_groups.append(seq_group)
+                    self.waiting.popleft()
+                    continue
+
+                lora_int_id = 0
+                if self.lora_enabled:
+                    lora_int_id = seq_group.lora_int_id
+                    if lora_int_id > 0 and lora_int_id not in curr_loras and len(
+                            curr_loras) >= self.lora_config.max_loras:
+                        # We don't have a space for another LoRA, so
+                        # we ignore this request for now.
+                        leftover_waiting_sequences.appendleft(seq_group)
+                        self.waiting.popleft()
+                        continue
+
+                # If the number of batched tokens exceeds the limit, stop.
+                new_seq_lens = seq_lens + [num_prompt_tokens]
+                num_batched_tokens = len(new_seq_lens) * max(new_seq_lens)
+                if (num_batched_tokens >
+                        self.scheduler_config.max_num_batched_tokens):
+                    break
+
+                # The total number of sequences in the RUNNING state should not
+                # exceed the maximum number of sequences.
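+                # NOTE: get_max_num_running_seqs() accounts for best_of (e.g.
+                # beam search), so a single waiting prompt may reserve several
+                # sequence slots here.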
+ num_new_seqs = seq_group.get_max_num_running_seqs() + if (num_curr_seqs + num_new_seqs > + self.scheduler_config.max_num_seqs): + break + + num_paddings = num_batched_tokens - sum(new_seq_lens) + if num_paddings > self.scheduler_config.max_paddings: + break + seq_lens = new_seq_lens + + if lora_int_id > 0: + curr_loras.add(lora_int_id) + self.waiting.popleft() + self._allocate(seq_group) + self.running.append(seq_group) + num_curr_seqs += num_new_seqs + scheduled.append(seq_group) + + self.waiting.extendleft(leftover_waiting_sequences) + + if scheduled or ignored_seq_groups: + scheduler_outputs = SchedulerOutputs( + scheduled_seq_groups=scheduled, + prompt_run=True, + num_batched_tokens=len(seq_lens) * + max(seq_lens) if seq_lens else 0, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ignored_seq_groups=ignored_seq_groups, + ) + return scheduler_outputs + + # NOTE(woosuk): Preemption happens only when there is no available slot + # to keep all the sequence groups in the RUNNING state. + # In this case, the policy is responsible for deciding which sequence + # groups to preempt. + self.running = self.policy.sort_by_priority(now, self.running) + + # Reserve new token slots for the running sequence groups. + running: Deque[SequenceGroup] = deque() + preempted: List[SequenceGroup] = [] + while self.running: + seq_group = self.running.popleft() + while not self.block_manager.can_append_slot(seq_group): + if self.running: + # Preempt the lowest-priority sequence groups. + victim_seq_group = self.running.pop() + self._preempt(victim_seq_group, blocks_to_swap_out) + preempted.append(victim_seq_group) + else: + # No other sequence groups can be preempted. + # Preempt the current sequence group. + self._preempt(seq_group, blocks_to_swap_out) + preempted.append(seq_group) + break + else: + # Append new slots to the sequence group. + self._append_slot(seq_group, blocks_to_copy) + running.append(seq_group) + self.running = running + + # Swap in the sequence groups in the SWAPPED state if possible. + self.swapped = self.policy.sort_by_priority(now, self.swapped) + if not preempted: + num_curr_seqs = sum(seq_group.get_max_num_running_seqs() + for seq_group in self.running) + curr_loras = set( + seq_group.lora_int_id + for seq_group in self.running) if self.lora_enabled else None + + leftover_swapped = deque() + + while self.swapped: + seq_group = self.swapped[0] + lora_int_id = 0 + if self.lora_enabled: + lora_int_id = seq_group.lora_int_id + if lora_int_id > 0 and lora_int_id not in curr_loras and len( + curr_loras) >= self.lora_config.max_loras: + # We don't have a space for another LoRA, so + # we ignore this request for now. + leftover_swapped.appendleft(seq_group) + self.swapped.popleft() + continue + + # If the sequence group cannot be swapped in, stop. + if not self.block_manager.can_swap_in(seq_group): + break + + # The total number of sequences in the RUNNING state should not + # exceed the maximum number of sequences. + num_new_seqs = seq_group.get_max_num_running_seqs() + if (num_curr_seqs + num_new_seqs > + self.scheduler_config.max_num_seqs): + break + + if lora_int_id > 0: + curr_loras.add(lora_int_id) + self.swapped.popleft() + self._swap_in(seq_group, blocks_to_swap_in) + self._append_slot(seq_group, blocks_to_copy) + num_curr_seqs += num_new_seqs + self.running.append(seq_group) + + self.swapped.extendleft(leftover_swapped) + + # Each sequence in the generation phase only takes one token slot. 
+ # Therefore, the number of batched tokens is equal to the number of + # sequences in the RUNNING state. + num_batched_tokens = sum( + seq_group.num_seqs(status=SequenceStatus.RUNNING) + for seq_group in self.running) + + scheduler_outputs = SchedulerOutputs( + scheduled_seq_groups=self.running, + prompt_run=False, + num_batched_tokens=num_batched_tokens, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ignored_seq_groups=[], + ) + return scheduler_outputs + + def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: + # Schedule sequence groups. + # This function call changes the internal states of the scheduler + # such as self.running, self.swapped, and self.waiting. + scheduler_outputs = self._schedule() + now = time.time() + + # Create input data structures. + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + for seq_group in scheduler_outputs.scheduled_seq_groups: + seq_group.maybe_set_first_scheduled_time(now) + + seq_data: Dict[int, SequenceData] = {} + block_tables: Dict[int, List[int]] = {} + for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): + seq_id = seq.seq_id + seq_data[seq_id] = seq.data + block_tables[seq_id] = self.block_manager.get_block_table(seq) + + seq_group_metadata = SequenceGroupMetadata( + request_id=seq_group.request_id, + is_prompt=scheduler_outputs.prompt_run, + seq_data=seq_data, + sampling_params=seq_group.sampling_params, + block_tables=block_tables, + lora_request=seq_group.lora_request, + prefix=seq_group.prefix, + state=seq_group.state, + ) + seq_group_metadata_list.append(seq_group_metadata) + return seq_group_metadata_list, scheduler_outputs + + def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None: + self.block_manager.fork(parent_seq, child_seq) + + def free_seq(self, seq: Sequence) -> None: + self.block_manager.free(seq) + + def free_finished_seq_groups(self) -> None: + self.running = deque(seq_group for seq_group in self.running + if not seq_group.is_finished()) + + def _allocate(self, seq_group: SequenceGroup) -> None: + self.block_manager.allocate(seq_group) + for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): + seq.status = SequenceStatus.RUNNING + + def _append_slot( + self, + seq_group: SequenceGroup, + blocks_to_copy: Dict[int, List[int]], + ) -> None: + for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): + ret = self.block_manager.append_slot(seq) + if ret is not None: + src_block, dst_block = ret + if src_block in blocks_to_copy: + blocks_to_copy[src_block].append(dst_block) + else: + blocks_to_copy[src_block] = [dst_block] + + def _preempt( + self, + seq_group: SequenceGroup, + blocks_to_swap_out: Dict[int, int], + preemption_mode: Optional[PreemptionMode] = None, + ) -> None: + # If preemption mode is not specified, we determine the mode as follows: + # We use recomputation by default since it incurs lower overhead than + # swapping. However, when the sequence group has multiple sequences + # (e.g., beam search), recomputation is not currently supported. In + # such a case, we use swapping instead. + # FIXME(woosuk): This makes our scheduling policy a bit bizarre. + # As swapped sequences are prioritized over waiting sequences, + # sequence groups with multiple sequences are implicitly prioritized + # over sequence groups with a single sequence. + # TODO(woosuk): Support recomputation for sequence groups with multiple + # sequences. This may require a more sophisticated CUDA kernel. 
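+        # Illustrative example (not exhaustive): a greedy request with a
+        # single running sequence is preempted by recomputation, while a
+        # best_of=4 request is preempted by swapping, since recomputation
+        # cannot yet reproduce multiple divergent sequences.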
+        if preemption_mode is None:
+            if seq_group.get_max_num_running_seqs() == 1:
+                preemption_mode = PreemptionMode.RECOMPUTE
+            else:
+                preemption_mode = PreemptionMode.SWAP
+        if preemption_mode == PreemptionMode.RECOMPUTE:
+            self._preempt_by_recompute(seq_group)
+        elif preemption_mode == PreemptionMode.SWAP:
+            self._preempt_by_swap(seq_group, blocks_to_swap_out)
+        else:
+            raise AssertionError("Invalid preemption mode.")
+
+    def _preempt_by_recompute(
+        self,
+        seq_group: SequenceGroup,
+    ) -> None:
+        seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
+        assert len(seqs) == 1
+        for seq in seqs:
+            seq.status = SequenceStatus.WAITING
+            self.block_manager.free(seq)
+        # NOTE: For FCFS, we insert the preempted sequence group to the front
+        # of the waiting queue.
+        self.waiting.appendleft(seq_group)
+
+    def _preempt_by_swap(
+        self,
+        seq_group: SequenceGroup,
+        blocks_to_swap_out: Dict[int, int],
+    ) -> None:
+        self._swap_out(seq_group, blocks_to_swap_out)
+        self.swapped.append(seq_group)
+
+    def _swap_in(
+        self,
+        seq_group: SequenceGroup,
+        blocks_to_swap_in: Dict[int, int],
+    ) -> None:
+        mapping = self.block_manager.swap_in(seq_group)
+        blocks_to_swap_in.update(mapping)
+        for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
+            seq.status = SequenceStatus.RUNNING
+
+    def _swap_out(
+        self,
+        seq_group: SequenceGroup,
+        blocks_to_swap_out: Dict[int, int],
+    ) -> None:
+        if not self.block_manager.can_swap_out(seq_group):
+            # FIXME(woosuk): Abort the sequence group instead of aborting the
+            # entire engine.
+            raise RuntimeError(
+                "Aborted due to the lack of CPU swap space. Please increase "
+                "the swap space to avoid this error.")
+        mapping = self.block_manager.swap_out(seq_group)
+        blocks_to_swap_out.update(mapping)
+        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
+            seq.status = SequenceStatus.SWAPPED
diff --git a/vllm/engine/__init__.py b/vllm/engine/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/vllm/engine/__pycache__/__init__.cpython-310.pyc b/vllm/engine/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c07424a8e9b174200c9a17fdde0d914900e9dc6
Binary files /dev/null and b/vllm/engine/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/engine/__pycache__/arg_utils.cpython-310.pyc b/vllm/engine/__pycache__/arg_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b12918e7f26ca54b4edc5bf7820d0086ac317089
Binary files /dev/null and b/vllm/engine/__pycache__/arg_utils.cpython-310.pyc differ
diff --git a/vllm/engine/__pycache__/llm_engine.cpython-310.pyc b/vllm/engine/__pycache__/llm_engine.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c7abf8bd2d1421c106cfc814aa53fbf2bfae603b
Binary files /dev/null and b/vllm/engine/__pycache__/llm_engine.cpython-310.pyc differ
diff --git a/vllm/engine/__pycache__/metrics.cpython-310.pyc b/vllm/engine/__pycache__/metrics.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..94664be17a3c420f6f46b9a02f2d472975d680b5
Binary files /dev/null and b/vllm/engine/__pycache__/metrics.cpython-310.pyc differ
diff --git a/vllm/engine/__pycache__/ray_utils.cpython-310.pyc b/vllm/engine/__pycache__/ray_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2b209583078201704cfabdb5d39097a574aac29
Binary files /dev/null and b/vllm/engine/__pycache__/ray_utils.cpython-310.pyc differ
zWIaz^(wrpKm|3e8#9*4MS0k zpV+*?~L0)p2)_!8z(08SalVkcDwiND+6%D=x~MAQc!%GvxCp z%kB@u;cOKCdcUK}pD5^xfrztPG8qkb@}*%}50M9>Aj(@ss`5vv@UbedqwxdIrASkS zsO;b%UBY1cBb+JZDs#SpCuKGS47V~3TA&Mk0pxc!;Iw1;mzJrZHmm4@}9k2CvcjNmU+BD!lTB zc3OFOm-tU;KEr%n&0EC(yoqo*DenF#<`L{Gi0NDh!I5wW$PEBDh~4Wh z)^x_dB@2WA`>-HDJKGeKNLQb9h2VuYvIJ3WjxsnxoTO|E43yb$XDrUMPiSVBJvxE5xTdz@o>yweDdF@^hjfC38yHv^uC6^}ZEnQfhTT%r4 zq}`CJe0dd8$m`Vn42`!`R0;k8Xejb~SH4S#N+Cpw`VO_-v||?^UcD5NOv_uH8X0aK zekq}M`GQ=gk`>N-=jMKrMDi6{O2W_KI;%=@=k4MW{SNtl`X(Aj wyQ(+z23uo}t^=ACy|uBzOklZRlUyX6vi}0DRaH#Q*>R literal 0 HcmV?d00001 diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py new file mode 100644 index 0000000..a5c824d --- /dev/null +++ b/vllm/engine/arg_utils.py @@ -0,0 +1,341 @@ +import argparse +import dataclasses +from dataclasses import dataclass +from typing import Optional, Tuple + +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) + + +@dataclass +class EngineArgs: + """Arguments for vLLM engine.""" + model: str + tokenizer: Optional[str] = None + tokenizer_mode: str = 'auto' + trust_remote_code: bool = False + download_dir: Optional[str] = None + load_format: str = 'auto' + dtype: str = 'auto' + kv_cache_dtype: str = 'auto' + seed: int = 0 + max_model_len: Optional[int] = None + worker_use_ray: bool = False + pipeline_parallel_size: int = 1 + tensor_parallel_size: int = 1 + max_parallel_loading_workers: Optional[int] = None + block_size: int = 16 + swap_space: int = 4 # GiB + gpu_memory_utilization: float = 0.90 + max_num_batched_tokens: Optional[int] = None + max_num_seqs: int = 256 + max_paddings: int = 256 + disable_log_stats: bool = False + revision: Optional[str] = None + code_revision: Optional[str] = None + tokenizer_revision: Optional[str] = None + quantization: Optional[str] = None + enforce_eager: bool = False + max_context_len_to_capture: int = 8192 + disable_custom_all_reduce: bool = False + enable_lora: bool = False + max_loras: int = 1 + max_lora_rank: int = 16 + lora_extra_vocab_size: int = 256 + lora_dtype = 'auto' + max_cpu_loras: Optional[int] = None + device: str = 'auto' + + def __post_init__(self): + if self.tokenizer is None: + self.tokenizer = self.model + + @staticmethod + def add_cli_args( + parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + """Shared CLI arguments for vLLM engine.""" + + # NOTE: If you update any of the arguments below, please also + # make sure to update docs/source/models/engine_args.rst + + # Model arguments + parser.add_argument( + '--model', + type=str, + default='facebook/opt-125m', + help='name or path of the huggingface model to use') + parser.add_argument( + '--tokenizer', + type=str, + default=EngineArgs.tokenizer, + help='name or path of the huggingface tokenizer to use') + parser.add_argument( + '--revision', + type=str, + default=None, + help='the specific model version to use. It can be a branch ' + 'name, a tag name, or a commit id. If unspecified, will use ' + 'the default version.') + parser.add_argument( + '--code-revision', + type=str, + default=None, + help='the specific revision to use for the model code on ' + 'Hugging Face Hub. It can be a branch name, a tag name, or a ' + 'commit id. If unspecified, will use the default version.') + parser.add_argument( + '--tokenizer-revision', + type=str, + default=None, + help='the specific tokenizer version to use. It can be a branch ' + 'name, a tag name, or a commit id. 
If unspecified, will use ' + 'the default version.') + parser.add_argument('--tokenizer-mode', + type=str, + default=EngineArgs.tokenizer_mode, + choices=['auto', 'slow'], + help='tokenizer mode. "auto" will use the fast ' + 'tokenizer if available, and "slow" will ' + 'always use the slow tokenizer.') + parser.add_argument('--trust-remote-code', + action='store_true', + help='trust remote code from huggingface') + parser.add_argument('--download-dir', + type=str, + default=EngineArgs.download_dir, + help='directory to download and load the weights, ' + 'default to the default cache dir of ' + 'huggingface') + parser.add_argument( + '--load-format', + type=str, + default=EngineArgs.load_format, + choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'], + help='The format of the model weights to load. ' + '"auto" will try to load the weights in the safetensors format ' + 'and fall back to the pytorch bin format if safetensors format ' + 'is not available. ' + '"pt" will load the weights in the pytorch bin format. ' + '"safetensors" will load the weights in the safetensors format. ' + '"npcache" will load the weights in pytorch format and store ' + 'a numpy cache to speed up the loading. ' + '"dummy" will initialize the weights with random values, ' + 'which is mainly for profiling.') + parser.add_argument( + '--dtype', + type=str, + default=EngineArgs.dtype, + choices=[ + 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32' + ], + help='data type for model weights and activations. ' + 'The "auto" option will use FP16 precision ' + 'for FP32 and FP16 models, and BF16 precision ' + 'for BF16 models.') + parser.add_argument( + '--kv-cache-dtype', + type=str, + choices=['auto', 'fp8_e5m2'], + default=EngineArgs.kv_cache_dtype, + help='Data type for kv cache storage. If "auto", will use model ' + 'data type. Note FP8 is not supported when cuda version is ' + 'lower than 11.8.') + parser.add_argument('--max-model-len', + type=int, + default=EngineArgs.max_model_len, + help='model context length. If unspecified, ' + 'will be automatically derived from the model.') + # Parallel arguments + parser.add_argument('--worker-use-ray', + action='store_true', + help='use Ray for distributed serving, will be ' + 'automatically set when using more than 1 GPU') + parser.add_argument('--pipeline-parallel-size', + '-pp', + type=int, + default=EngineArgs.pipeline_parallel_size, + help='number of pipeline stages') + parser.add_argument('--tensor-parallel-size', + '-tp', + type=int, + default=EngineArgs.tensor_parallel_size, + help='number of tensor parallel replicas') + parser.add_argument( + '--max-parallel-loading-workers', + type=int, + default=EngineArgs.max_parallel_loading_workers, + help='load model sequentially in multiple batches, ' + 'to avoid RAM OOM when using tensor ' + 'parallel and large models') + # KV cache arguments + parser.add_argument('--block-size', + type=int, + default=EngineArgs.block_size, + choices=[16], + help='token block size') + parser.add_argument('--seed', + type=int, + default=EngineArgs.seed, + help='random seed') + parser.add_argument('--swap-space', + type=int, + default=EngineArgs.swap_space, + help='CPU swap space size (GiB) per GPU') + parser.add_argument( + '--gpu-memory-utilization', + type=float, + default=EngineArgs.gpu_memory_utilization, + help='the fraction of GPU memory to be used for ' + 'the model executor, which can range from 0 to 1.' 
+            'If unspecified, will use the default value of 0.9.')
+        parser.add_argument('--max-num-batched-tokens',
+                            type=int,
+                            default=EngineArgs.max_num_batched_tokens,
+                            help='maximum number of batched tokens per '
+                            'iteration')
+        parser.add_argument('--max-num-seqs',
+                            type=int,
+                            default=EngineArgs.max_num_seqs,
+                            help='maximum number of sequences per iteration')
+        parser.add_argument('--max-paddings',
+                            type=int,
+                            default=EngineArgs.max_paddings,
+                            help='maximum number of paddings in a batch')
+        parser.add_argument('--disable-log-stats',
+                            action='store_true',
+                            help='disable logging statistics')
+        # Quantization settings.
+        parser.add_argument('--quantization',
+                            '-q',
+                            type=str,
+                            choices=['awq', 'gptq', 'squeezellm',
+                                     'smoothquant', None],
+                            default=EngineArgs.quantization,
+                            help='Method used to quantize the weights. If '
+                            'None, we first check the `quantization_config` '
+                            'attribute in the model config file. If that is '
+                            'None, we assume the model weights are not '
+                            'quantized and use `dtype` to determine the data '
+                            'type of the weights.')
+        parser.add_argument('--enforce-eager',
+                            action='store_true',
+                            help='Always use eager-mode PyTorch. If False, '
+                            'will use eager mode and CUDA graph in hybrid '
+                            'for maximal performance and flexibility.')
+        parser.add_argument('--max-context-len-to-capture',
+                            type=int,
+                            default=EngineArgs.max_context_len_to_capture,
+                            help='maximum context length covered by CUDA '
+                            'graphs. When a sequence has context length '
+                            'larger than this, we fall back to eager mode.')
+        parser.add_argument('--disable-custom-all-reduce',
+                            action='store_true',
+                            default=EngineArgs.disable_custom_all_reduce,
+                            help='See ParallelConfig')
+        # LoRA related configs
+        parser.add_argument('--enable-lora',
+                            action='store_true',
+                            help='If True, enable handling of LoRA adapters.')
+        parser.add_argument('--max-loras',
+                            type=int,
+                            default=EngineArgs.max_loras,
+                            help='Max number of LoRAs in a single batch.')
+        parser.add_argument('--max-lora-rank',
+                            type=int,
+                            default=EngineArgs.max_lora_rank,
+                            help='Max LoRA rank.')
+        parser.add_argument(
+            '--lora-extra-vocab-size',
+            type=int,
+            default=EngineArgs.lora_extra_vocab_size,
+            help=('Maximum size of extra vocabulary that can be '
+                  'present in a LoRA adapter (added to the base '
+                  'model vocabulary).'))
+        parser.add_argument(
+            '--lora-dtype',
+            type=str,
+            default=EngineArgs.lora_dtype,
+            choices=['auto', 'float16', 'bfloat16', 'float32'],
+            help=('Data type for LoRA. If auto, will default to '
+                  'base model dtype.'))
+        parser.add_argument(
+            '--max-cpu-loras',
+            type=int,
+            default=EngineArgs.max_cpu_loras,
+            help=('Maximum number of LoRAs to store in CPU memory. '
+                  'Must be >= max_num_seqs. '
+                  'Defaults to max_num_seqs.'))
+        parser.add_argument("--device",
+                            type=str,
+                            default=EngineArgs.device,
+                            choices=["auto", "cuda", "neuron"],
+                            help='Device type for vLLM execution.')
+        return parser
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
+        # Get the list of attributes of this dataclass.
+        attrs = [attr.name for attr in dataclasses.fields(cls)]
+        # Set the attributes from the parsed arguments.
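+        # Typical usage (a sketch): build a parser with
+        # `EngineArgs.add_cli_args(argparse.ArgumentParser())`, then call
+        # `EngineArgs.from_cli_args(parser.parse_args())`.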
+ engine_args = cls(**{attr: getattr(args, attr) for attr in attrs}) + return engine_args + + def create_engine_configs( + self, + ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig, + DeviceConfig, Optional[LoRAConfig]]: + device_config = DeviceConfig(self.device) + model_config = ModelConfig( + self.model, self.tokenizer, self.tokenizer_mode, + self.trust_remote_code, self.download_dir, self.load_format, + self.dtype, self.seed, self.revision, self.code_revision, + self.tokenizer_revision, self.max_model_len, self.quantization, + self.enforce_eager, self.max_context_len_to_capture) + cache_config = CacheConfig(self.block_size, + self.gpu_memory_utilization, + self.swap_space, self.kv_cache_dtype, + model_config.get_sliding_window()) + parallel_config = ParallelConfig(self.pipeline_parallel_size, + self.tensor_parallel_size, + self.worker_use_ray, + self.max_parallel_loading_workers, + self.disable_custom_all_reduce) + scheduler_config = SchedulerConfig(self.max_num_batched_tokens, + self.max_num_seqs, + model_config.max_model_len, + self.max_paddings) + lora_config = LoRAConfig( + max_lora_rank=self.max_lora_rank, + max_loras=self.max_loras, + lora_extra_vocab_size=self.lora_extra_vocab_size, + lora_dtype=self.lora_dtype, + max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras + and self.max_cpu_loras > 0 else None) if self.enable_lora else None + return (model_config, cache_config, parallel_config, scheduler_config, + device_config, lora_config) + + +@dataclass +class AsyncEngineArgs(EngineArgs): + """Arguments for asynchronous vLLM engine.""" + engine_use_ray: bool = False + disable_log_requests: bool = False + max_log_len: Optional[int] = None + + @staticmethod + def add_cli_args( + parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + parser = EngineArgs.add_cli_args(parser) + parser.add_argument('--engine-use-ray', + action='store_true', + help='use Ray to start the LLM engine in a ' + 'separate process as the server process.') + parser.add_argument('--disable-log-requests', + action='store_true', + help='disable logging requests') + parser.add_argument('--max-log-len', + type=int, + default=None, + help='max number of prompt characters or prompt ' + 'ID numbers being printed in log. ' + 'Default: unlimited.') + return parser diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py new file mode 100644 index 0000000..530314a --- /dev/null +++ b/vllm/engine/async_llm_engine.py @@ -0,0 +1,689 @@ +import asyncio +import time +from functools import partial +from typing import (Any, Dict, Iterable, List, Optional, Set, Tuple, Type, + Union, AsyncIterator) + +from vllm.lora.request import LoRARequest +from vllm.config import ModelConfig +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.llm_engine import LLMEngine +from vllm.engine.ray_utils import initialize_cluster, ray +from vllm.logger import init_logger +from vllm.outputs import RequestOutput +from vllm.sampling_params import SamplingParams + +logger = init_logger(__name__) + + +class AsyncEngineDeadError(RuntimeError): + pass + + +def _raise_exception_on_finish(task: asyncio.Task, + request_tracker: "RequestTracker") -> None: + msg = ("Task finished unexpectedly. This should never happen! 
" + "Please open an issue on Github.") + try: + try: + task.result() + except asyncio.CancelledError: + return + except Exception as exc: + raise AsyncEngineDeadError( + msg + " See stack trace above for the actual cause.") from exc + raise AsyncEngineDeadError(msg) + except Exception as exc: + request_tracker.propagate_exception(exc) + raise exc + + +class AsyncStream: + """A stream of RequestOutputs for a request that can be + iterated over asynchronously.""" + + def __init__(self, request_id: str) -> None: + self.request_id = request_id + self._queue = asyncio.Queue() + self._finished = False + + def put(self, item: RequestOutput) -> None: + if self._finished: + return + self._queue.put_nowait(item) + + def finish(self) -> None: + self._queue.put_nowait(StopAsyncIteration()) + self._finished = True + + @property + def finished(self) -> bool: + return self._finished + + def __aiter__(self): + return self + + async def __anext__(self) -> RequestOutput: + result = await self._queue.get() + if isinstance(result, Exception): + raise result + return result + + +class RequestTracker: + """Synchronous abstraction for tracking requests.""" + + def __init__(self) -> None: + self._request_streams: Dict[str, AsyncStream] = {} + self._finished_requests: asyncio.Queue[str] = asyncio.Queue() + self._new_requests: asyncio.Queue[Tuple[AsyncStream, + dict]] = asyncio.Queue() + self.new_requests_event = None + + def __contains__(self, item): + return item in self._request_streams + + def init_event(self): + self.new_requests_event = asyncio.Event() + + def propagate_exception(self, + exc: Exception, + request_id: Optional[str] = None) -> None: + """Propagate an exception to request streams + (all if request_id is None).""" + if request_id is not None: + self._request_streams[request_id].put(exc) + else: + for stream in self._request_streams.values(): + stream.put(exc) + + def process_request_output(self, + request_output: RequestOutput, + *, + verbose: bool = False) -> None: + """Process a request output from the engine.""" + request_id = request_output.request_id + + self._request_streams[request_id].put(request_output) + if request_output.finished: + if verbose: + logger.info(f"Finished request {request_id}.") + self.abort_request(request_id) + + def add_request(self, request_id: str, + **engine_add_request_kwargs) -> AsyncStream: + """Add a request to be sent to the engine on the next background + loop iteration.""" + if request_id in self._request_streams: + raise KeyError(f"Request {request_id} already exists.") + + stream = AsyncStream(request_id) + self._new_requests.put_nowait((stream, { + "request_id": request_id, + **engine_add_request_kwargs + })) + + self.new_requests_event.set() + + return stream + + def abort_request(self, request_id: str, *, verbose: bool = False) -> None: + """Abort a request during next background loop iteration.""" + if verbose: + logger.info(f"Aborted request {request_id}.") + + self._finished_requests.put_nowait(request_id) + + if request_id not in self._request_streams or self._request_streams[ + request_id].finished: + # The request has already finished or been aborted. 
+ return + + self._request_streams[request_id].finish() + + def get_new_and_finished_requests(self) -> Tuple[List[Dict], Set[str]]: + """Get the new requests and finished requests to be + sent to the engine.""" + new_requests: List[Dict] = [] + finished_requests: Set[str] = set() + + while not self._finished_requests.empty(): + request_id = self._finished_requests.get_nowait() + finished_requests.add(request_id) + self._request_streams.pop(request_id, None) + + while not self._new_requests.empty(): + stream, new_request = self._new_requests.get_nowait() + if stream.request_id in finished_requests: + # The request has already been aborted. + stream.finish() + continue + self._request_streams[stream.request_id] = stream + new_requests.append(new_request) + + self.new_requests_event.clear() + + return new_requests, finished_requests + + async def wait_for_new_requests(self): + await self.new_requests_event.wait() + + +class _AsyncLLMEngine(LLMEngine): + """Extension of LLMEngine to add async methods.""" + + async def step_async(self) -> List[RequestOutput]: + """Performs one decoding iteration and returns newly generated results. + The workers are ran asynchronously if possible. + + This function performs one decoding iteration of the engine. It first + schedules the sequences to be executed in the next iteration and the + token blocks to be swapped in/out/copy. Then, it executes the model + and updates the scheduler with the model outputs. Finally, it decodes + the sequences and returns the newly generated results. + """ + seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule() + + # Execute the model. + output = (await self._run_workers_async( + "execute_model", + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in, + blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out, + blocks_to_copy=scheduler_outputs.blocks_to_copy, + )) if not scheduler_outputs.is_empty() else [] + + return self._process_model_outputs(output, scheduler_outputs) + + # TODO align + """ + seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule() + + if not scheduler_outputs.is_empty(): + # Execute the model. + all_outputs = await self._run_workers_async( + "execute_model", + driver_kwargs={ + "seq_group_metadata_list": seq_group_metadata_list, + "blocks_to_swap_in": scheduler_outputs.blocks_to_swap_in, + "blocks_to_swap_out": scheduler_outputs.blocks_to_swap_out, + "blocks_to_copy": scheduler_outputs.blocks_to_copy, + }) + + # Only the driver worker returns the sampling results. 
+ output = all_outputs[0] + else: + output = [] + + return self._process_model_outputs(output, scheduler_outputs) + """ + + async def encode_request_async( + self, + request_id: str, # pylint: disable=unused-argument + prompt: Optional[str], + prompt_token_ids: Optional[List[int]] = None, + lora_request: Optional[LoRARequest] = None, + ): + if prompt_token_ids is None: + assert prompt is not None + prompt_token_ids = await self.tokenizer.encode_async( + request_id=request_id, + prompt=prompt, + lora_request=lora_request) + return prompt_token_ids + + async def add_request_async( + self, + request_id: str, + prompt: Optional[str], + sampling_params: SamplingParams, + prompt_token_ids: Optional[List[int]] = None, + arrival_time: Optional[float] = None, + lora_request: Optional[LoRARequest] = None, + prefix_pos: Optional[int] = None, + ) -> None: + if lora_request is not None and not self.lora_config: + raise ValueError(f"Got lora_request {lora_request} but LoRA is " + "not enabled!") + if arrival_time is None: + arrival_time = time.time() + prompt_token_ids = await self.encode_request_async( + request_id=request_id, + prompt=prompt, + prompt_token_ids=prompt_token_ids, + lora_request=lora_request) + + return self.add_request( + request_id, + prompt=prompt, + prompt_token_ids=prompt_token_ids, + sampling_params=sampling_params, + arrival_time=arrival_time, + lora_request=lora_request, + prefix_pos=prefix_pos, + ) + + async def _run_workers_async( + self, + method: str, + *args, + get_all_outputs: bool = False, + **kwargs, + ) -> Any: + """Runs the given method on all workers.""" + coros = [] + for worker in self.workers: + if self.parallel_config.worker_use_ray: + coros.append( + worker.execute_method.remote(method, *args, **kwargs)) + else: + executor = getattr(worker, method) + coros.append(asyncio.get_event_loop().run_in_executor( + None, partial(executor, *args, **kwargs))) + + all_outputs = await asyncio.gather(*coros) + + if get_all_outputs: + return all_outputs + + # Make sure all workers have the same results. + output = all_outputs[0] + for other_output in all_outputs[1:]: + assert output == other_output + return output + + # TODO align + """ + async def _run_workers_async( + self, + method: str, + *args, + driver_args: Optional[List[Any]] = None, + driver_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> Any: + coros = [] + + if driver_args is None: + driver_args = args + if driver_kwargs is None: + driver_kwargs = kwargs + + # Run the driver worker asynchronously. + driver_executor = getattr(self.driver_worker, method) + coros.append(asyncio.get_event_loop().run_in_executor( + None, partial(driver_executor, *driver_args, **driver_kwargs))) + + # Run the ray workers asynchronously. + for worker in self.workers: + coros.append(worker.execute_method.remote(method, *args, **kwargs)) + + all_outputs = await asyncio.gather(*coros) + return all_outputs + """ + + +class AsyncLLMEngine: + """An asynchronous wrapper for LLMEngine. + + This class is used to wrap the LLMEngine class to make it asynchronous. It + uses asyncio to create a background loop that keeps processing incoming + requests. The LLMEngine is kicked by the generate method when there + are requests in the waiting queue. The generate method yields the outputs + from the LLMEngine to the caller. + + NOTE: For the comprehensive list of arguments, see `LLMEngine`. + + Args: + worker_use_ray: Whether to use Ray for model workers. Required for + distributed execution. 
Should be the same as + `parallel_config.worker_use_ray`. + engine_use_ray: Whether to make LLMEngine a Ray actor. If so, the + async frontend will be executed in a separate process as the + model workers. + log_requests: Whether to log the requests. + max_log_len: Maximum number of prompt characters or prompt ID numbers + being printed in log. + start_engine_loop: If True, the background task to run the engine + will be automatically started in the generate call. + *args: Arguments for LLMEngine. + *kwargs: Arguments for LLMEngine. + """ + + _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine + + def __init__(self, + worker_use_ray: bool, + engine_use_ray: bool, + *args, + log_requests: bool = True, + max_log_len: Optional[int] = None, + start_engine_loop: bool = True, + **kwargs) -> None: + self.worker_use_ray = worker_use_ray + self.engine_use_ray = engine_use_ray + self.log_requests = log_requests + self.max_log_len = max_log_len + self.engine = self._init_engine(*args, **kwargs) + + self.background_loop = None + # We need to keep a reference to unshielded + # task as well to prevent it from being garbage + # collected + self._background_loop_unshielded = None + self.start_engine_loop = start_engine_loop + self._request_tracker = RequestTracker() + + @property + def is_running(self) -> bool: + return (self.background_loop is not None + and not self.background_loop.done()) + + def get_tokenizer(self): + return self.engine.tokenizer.tokenizer + + def start_background_loop(self) -> None: + """Start the background loop.""" + if self.is_running: + raise RuntimeError("Background loop is already running.") + self._request_tracker.init_event() + + self._background_loop_unshielded = asyncio.get_event_loop( + ).create_task(self.run_engine_loop()) + self._background_loop_unshielded.add_done_callback( + partial(_raise_exception_on_finish, + request_tracker=self._request_tracker)) + self.background_loop = asyncio.shield(self._background_loop_unshielded) + + def _init_engine(self, *args, + **kwargs) -> Union[_AsyncLLMEngine, "ray.ObjectRef"]: + if not self.engine_use_ray: + engine_class = self._engine_class + elif self.worker_use_ray: + engine_class = ray.remote(num_cpus=0)(self._engine_class).remote + else: + # FIXME(woosuk): This is a bit hacky. Be careful when changing the + # order of the arguments. + cache_config = args[1] + parallel_config = args[2] + if parallel_config.tensor_parallel_size == 1: + num_gpus = cache_config.gpu_memory_utilization + else: + num_gpus = 1 + engine_class = ray.remote(num_gpus=num_gpus)( + self._engine_class).remote + return engine_class(*args, **kwargs) + + async def engine_step(self) -> bool: + """Kick the engine to process the waiting requests. + + Returns True if there are in-progress requests.""" + + new_requests, finished_requests = ( + self._request_tracker.get_new_and_finished_requests()) + + for new_request in new_requests: + # Add the request into the vLLM engine's waiting queue. + # TODO: Maybe add add_request_batch to reduce Ray overhead + if self.engine_use_ray: + await self.engine.add_request.remote(**new_request) + else: + await self.engine.add_request_async(**new_request) + + if finished_requests: + await self._engine_abort(finished_requests) + + if self.engine_use_ray: + request_outputs = await self.engine.step.remote() + else: + request_outputs = await self.engine.step_async() + + # Put the outputs into the corresponding streams. 
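+        # Each output is pushed to the AsyncStream registered for its
+        # request_id; when the output is marked finished, process_request_output
+        # also calls abort_request so the stream is cleaned up.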
+ for request_output in request_outputs: + self._request_tracker.process_request_output( + request_output, verbose=self.log_requests) + + return len(request_outputs) > 0 + + async def _engine_abort(self, request_ids: Iterable[str]): + if self.engine_use_ray: + await self.engine.abort_request.remote(request_ids) + else: + self.engine.abort_request(request_ids) + + async def run_engine_loop(self): + # Initialize the RequestTracker here so it uses the right event loop. + has_requests_in_progress = False + while True: + if not has_requests_in_progress: + await self._request_tracker.wait_for_new_requests() + has_requests_in_progress = await self.engine_step() + await asyncio.sleep(0) + + async def add_request( + self, + request_id: str, + prompt: Optional[str], + sampling_params: SamplingParams, + prompt_token_ids: Optional[List[int]] = None, + arrival_time: Optional[float] = None, + lora_request: Optional[LoRARequest] = None, + prefix_pos: Optional[int] = None, + ) -> AsyncStream: + if self.log_requests: + shortened_prompt = prompt + shortened_token_ids = prompt_token_ids + if self.max_log_len is not None: + if shortened_prompt is not None: + shortened_prompt = shortened_prompt[:self.max_log_len] + if shortened_token_ids is not None: + shortened_token_ids = shortened_token_ids[:self. + max_log_len] + logger.info(f"Received request {request_id}: " + f"prompt: {shortened_prompt!r}, " + f"prefix_pos: {prefix_pos}," + f"sampling_params: {sampling_params}, " + f"prompt_token_ids: {shortened_token_ids}, " + f"lora_request: {lora_request}.") + + if not self.is_running: + if self.start_engine_loop: + self.start_background_loop() + else: + raise AsyncEngineDeadError( + "Background loop is not running. If it was running, " + "inspect the output to find the stacktrace of the " + "error that caused the background loop to stop " + "(AsyncEngineDeadError).") + + if arrival_time is None: + arrival_time = time.time() + + if self.engine_use_ray: + prompt_token_ids = await self.engine.encode_request_async.remote( + request_id=request_id, + prompt=prompt, + prompt_token_ids=prompt_token_ids, + lora_request=lora_request) + else: + prompt_token_ids = await self.engine.encode_request_async( + request_id=request_id, + prompt=prompt, + prompt_token_ids=prompt_token_ids, + lora_request=lora_request) + + stream = self._request_tracker.add_request( + request_id, + prompt=prompt, + sampling_params=sampling_params, + prompt_token_ids=prompt_token_ids, + arrival_time=arrival_time, + lora_request=lora_request, + prefix_pos=prefix_pos) + + return stream + + async def generate( + self, + prompt: Optional[str], + sampling_params: SamplingParams, + request_id: str, + prompt_token_ids: Optional[List[int]] = None, + lora_request: Optional[LoRARequest] = None, + prefix_pos: Optional[int] = None, + ) -> AsyncIterator[RequestOutput]: + """Generate outputs for a request. + + Generate outputs for a request. This method is a coroutine. It adds the + request into the waiting queue of the LLMEngine and streams the outputs + from the LLMEngine to the caller. + + Args: + prompt: The prompt string. Can be None if prompt_token_ids is + provided. + sampling_params: The sampling parameters of the request. + request_id: The unique id of the request. + prompt_token_ids: The token IDs of the prompt. If None, we + use the tokenizer to convert the prompts to token IDs. + lora_request: LoRA request to use for generation, if any. + prefix_pos: If not None, we use the given position as the prefix + position for each prompt. 
We will cache the prefix's KV + cache and reuse it for the next request with the same prefix. + This is an experimental feature, and may be replaced with + automatic prefix caching in the future. + + Yields: + The output `RequestOutput` objects from the LLMEngine for the + request. + + Details: + - If the engine is not running, start the background loop, + which iteratively invokes + :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step` + to process the waiting requests. + - Add the request to the engine's `RequestTracker`. + On the next background loop, this request will be sent to + the underlying engine. + Also, a corresponding `AsyncStream` will be created. + - Wait for the request outputs from `AsyncStream` and yield them. + + Example: + >>> # Please refer to entrypoints/api_server.py for + >>> # the complete example. + >>> + >>> # initialize the engine and the example input + >>> engine = AsyncLLMEngine.from_engine_args(engine_args) + >>> example_input = { + >>> "prompt": "What is LLM?", + >>> "stream": False, # assume the non-streaming case + >>> "temperature": 0.0, + >>> "request_id": 0, + >>> } + >>> + >>> # start the generation + >>> results_generator = engine.generate( + >>> example_input["prompt"], + >>> SamplingParams(temperature=example_input["temperature"]), + >>> example_input["request_id"]) + >>> + >>> # get the results + >>> final_output = None + >>> async for request_output in results_generator: + >>> if await request.is_disconnected(): + >>> # Abort the request if the client disconnects. + >>> await engine.abort(request_id) + >>> # Return or raise an error + >>> ... + >>> final_output = request_output + >>> + >>> # Process and return the final output + >>> ... + """ + # Preprocess the request. + # This should not be used for logging, as it is monotonic time. + arrival_time = time.monotonic() + + try: + stream = await self.add_request( + request_id, + prompt, + sampling_params, + prompt_token_ids=prompt_token_ids, + arrival_time=arrival_time, + lora_request=lora_request, + prefix_pos=prefix_pos, + ) + + async for request_output in stream: + yield request_output + except (Exception, asyncio.CancelledError) as e: + # If there is an exception or coroutine is cancelled, abort the + # request. + self._abort(request_id) + raise e + + async def abort(self, request_id: str) -> None: + """Abort a request. + + Abort a submitted request. If the request is finished or not found, + this method will be a no-op. + + Args: + request_id: The unique id of the request. + """ + if not self.is_running: + raise AsyncEngineDeadError( + "Background loop is not running. If it was running, " + "inspect the output to find the stacktrace of the " + "error that caused the background loop to stop " + "(AsyncEngineDeadError).") + + return self._abort(request_id) + + def _abort(self, request_id: str) -> None: + """Abort a request. + + Abort a submitted request. If the request is finished or not found, + this method will be a no-op. + + Args: + request_id: The unique id of the request. 
+ """ + self._request_tracker.abort_request(request_id, + verbose=self.log_requests) + + async def get_model_config(self) -> ModelConfig: + """Get the model configuration of the vLLM engine.""" + if self.engine_use_ray: + return await self.engine.get_model_config.remote() + else: + return self.engine.get_model_config() + + @classmethod + def from_engine_args(cls, + engine_args: AsyncEngineArgs, + start_engine_loop: bool = True) -> "AsyncLLMEngine": + """Creates an async LLM engine from the engine arguments.""" + # Create the engine configs. + engine_configs = engine_args.create_engine_configs() + parallel_config = engine_configs[2] + # Initialize the cluster. + placement_group = initialize_cluster(parallel_config, + engine_args.engine_use_ray) + # Create the async LLM engine. + engine = cls(parallel_config.worker_use_ray, + engine_args.engine_use_ray, + *engine_configs, + placement_group, + log_requests=not engine_args.disable_log_requests, + log_stats=not engine_args.disable_log_stats, + max_log_len=engine_args.max_log_len, + start_engine_loop=start_engine_loop) + return engine + + async def do_log_stats(self) -> None: + if self.engine_use_ray: + await self.engine.do_log_stats.remote() + else: + self.engine.do_log_stats() \ No newline at end of file diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py new file mode 100644 index 0000000..8a5d447 --- /dev/null +++ b/vllm/engine/llm_engine.py @@ -0,0 +1,1209 @@ +import copy +from collections import defaultdict +from functools import partial +import os +import time +import pickle +import importlib +from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, + Union) + +from vllm.lora.request import LoRARequest +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) +from vllm.core.scheduler import Scheduler, SchedulerOutputs +from vllm.engine.arg_utils import EngineArgs +from vllm.engine.metrics import StatLogger, Stats +from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray +from vllm.logger import init_logger +from vllm.outputs import RequestOutput +from vllm.sampling_params import SamplingParams +from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup, + SequenceGroupOutput, SequenceOutput, SequenceStatus) +from vllm.transformers_utils.tokenizer import (detokenize_incrementally, + TokenizerGroup) +from vllm.utils import (Counter, set_cuda_visible_devices, get_ip, + get_open_port, get_distributed_init_method) + +if ray: + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + +logger = init_logger(__name__) +_LOCAL_LOGGING_INTERVAL_SEC = 5 + +# A map between the device type (in device config) to its worker module. +DEVICE_TO_WORKER_MODULE_MAP = { + "cuda": "vllm.worker.worker", + "neuron": "vllm.worker.neuron_worker", +} + +# If the env var is set, it uses the Ray's compiled DAG API +# which optimizes the control plane overhead. +# Run VLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. +USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)) + + +class LLMEngine: + """An LLM engine that receives requests and generates texts. + + This is the main class for the vLLM engine. It receives requests + from clients and generates texts from the LLM. It includes a tokenizer, a + language model (possibly distributed across multiple GPUs), and GPU memory + space allocated for intermediate states (aka KV cache). 
This class utilizes + iteration-level scheduling and efficient memory management to maximize the + serving throughput. + + The `LLM` class wraps this class for offline batched inference and the + `AsyncLLMEngine` class wraps this class for online serving. + + NOTE: The config arguments are derived from the `EngineArgs` class. For the + comprehensive list of arguments, see `EngineArgs`. + + Args: + model_config: The configuration related to the LLM model. + cache_config: The configuration related to the KV cache memory + management. + parallel_config: The configuration related to distributed execution. + scheduler_config: The configuration related to the request scheduler. + device_config: The configuration related to the device. + placement_group: Ray placement group for distributed execution. + Required for distributed execution. + log_stats: Whether to log statistics. + """ + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + placement_group: Optional["PlacementGroup"], + log_stats: bool, + ) -> None: + logger.info( + "Initializing an LLM engine with config: " + f"model={model_config.model!r}, " + f"tokenizer={model_config.tokenizer!r}, " + f"tokenizer_mode={model_config.tokenizer_mode}, " + f"revision={model_config.revision}, " + f"tokenizer_revision={model_config.tokenizer_revision}, " + f"trust_remote_code={model_config.trust_remote_code}, " + f"dtype={model_config.dtype}, " + f"max_seq_len={model_config.max_model_len}, " + f"download_dir={model_config.download_dir!r}, " + f"load_format={model_config.load_format}, " + f"tensor_parallel_size={parallel_config.tensor_parallel_size}, " + f"disable_custom_all_reduce={parallel_config.disable_custom_all_reduce}, " + f"quantization={model_config.quantization}, " + f"enforce_eager={model_config.enforce_eager}, " + f"kv_cache_dtype={cache_config.cache_dtype}, " + f"device_config={device_config.device}, " + f"seed={model_config.seed})") + # TODO(woosuk): Print more configs in debug mode. + + self.model_config = model_config + self.cache_config = cache_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.log_stats = log_stats + self._verify_args() + + self._init_tokenizer() + self.seq_counter = Counter() + + # Create the parallel GPU workers. + if self.parallel_config.worker_use_ray: + # Disable Ray usage stats collection. + ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") + if ray_usage != "1": + os.environ["RAY_USAGE_STATS_ENABLED"] = "0" + self._init_workers_ray(placement_group) + else: + self._init_workers() + + # Profile the memory usage and initialize the cache. + self._init_cache() + + # Create the scheduler. + self.scheduler = Scheduler(scheduler_config, cache_config, lora_config) + + # Metric Logging. 
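+        # StatLogger records the Prometheus metrics defined in
+        # vllm/engine/metrics.py; _LOCAL_LOGGING_INTERVAL_SEC controls how
+        # often it also emits a local summary.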
+ if self.log_stats: + self.stat_logger = StatLogger( + local_interval=_LOCAL_LOGGING_INTERVAL_SEC, + labels=dict(model_name=model_config.model)) + self.stat_logger.info("cache_config", self.cache_config) + + self.forward_dag = None + if USE_RAY_COMPILED_DAG: + self.forward_dag = self._compiled_ray_dag() + + def get_tokenizer_for_seq(self, sequence: Sequence): + return self.tokenizer.get_lora_tokenizer(sequence.lora_request) + + def _dispatch_worker(self): + worker_module = DEVICE_TO_WORKER_MODULE_MAP[ + self.device_config.device_type] + imported_worker = importlib.import_module(worker_module) + Worker = imported_worker.Worker + return Worker + + def _init_workers(self): + # Lazy import the Worker to avoid importing torch.cuda/xformers + # before CUDA_VISIBLE_DEVICES is set in the Worker + Worker = self._dispatch_worker() + + assert self.parallel_config.world_size == 1, ( + "Ray is required if parallel_config.world_size > 1.") + + self.workers: List[Worker] = [] + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + worker = Worker( + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=True, + ) + self.workers.append(worker) + + self._run_workers( + "init_model", + get_all_outputs=True, + ) + self._run_workers( + "load_model", + get_all_outputs=True, + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers, + ) + # TODO align + """ + self._run_workers("init_model") + self._run_workers("load_model") + """ + + def _init_tokenizer(self, **tokenizer_init_kwargs): + init_kwargs = dict( + enable_lora=bool(self.lora_config), + max_num_seqs=self.scheduler_config.max_num_seqs, + max_input_length=None, + tokenizer_mode=self.model_config.tokenizer_mode, + trust_remote_code=self.model_config.trust_remote_code, + revision=self.model_config.tokenizer_revision) + init_kwargs.update(tokenizer_init_kwargs) + self.tokenizer: TokenizerGroup = TokenizerGroup( + self.model_config.tokenizer, **init_kwargs) + + def _init_workers_ray(self, placement_group: "PlacementGroup", + **ray_remote_kwargs): + if self.parallel_config.tensor_parallel_size == 1: + num_gpus = self.cache_config.gpu_memory_utilization + else: + num_gpus = 1 + + self.driver_dummy_worker: RayWorkerVllm = None + self.workers: List[RayWorkerVllm] = [] + + driver_ip = get_ip() + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if not bundle.get("GPU", 0): + continue + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + worker = ray.remote( + num_cpus=0, + num_gpus=num_gpus, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerVllm).remote(self.model_config.trust_remote_code) + + worker_ip = ray.get(worker.get_node_ip.remote()) + if worker_ip == driver_ip and self.driver_dummy_worker is None: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. + self.driver_dummy_worker = worker + # TODO align + self.workers.append(worker) + else: + self.workers.append(worker) + + if self.driver_dummy_worker is None: + raise ValueError( + "Ray does not allocate any GPUs on the driver node. 
Consider " + "adjusting the Ray placement group or running the driver on a " + "GPU node.") + + driver_node_id, driver_gpu_ids = ray.get( + self.driver_dummy_worker.get_node_and_gpu_ids.remote()) + worker_node_and_gpu_ids = ray.get( + [worker.get_node_and_gpu_ids.remote() for worker in self.workers]) + + node_workers = defaultdict(list) + node_gpus = defaultdict(list) + + for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): + node_workers[node_id].append(i) + node_gpus[node_id].extend(gpu_ids) + for node_id, gpu_ids in node_gpus.items(): + node_gpus[node_id] = sorted(gpu_ids) + # TODO align + """ + node_workers[driver_node_id].append(0) + node_gpus[driver_node_id].extend(driver_gpu_ids) + for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids, + start=1): + node_workers[node_id].append(i) + node_gpus[node_id].extend(gpu_ids) + for node_id, gpu_ids in node_gpus.items(): + node_gpus[node_id] = sorted(gpu_ids) + """ + + # Set CUDA_VISIBLE_DEVICES for the driver. + set_cuda_visible_devices(node_gpus[driver_node_id]) + for worker, (node_id, _) in zip(self.workers, worker_node_and_gpu_ids): + worker.set_cuda_visible_devices.remote(node_gpus[node_id]) + + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + # Lazy import the Worker to avoid importing torch.cuda/xformers + # before CUDA_VISIBLE_DEVICES is set in the Worker + Worker = self._dispatch_worker() + + # Initialize torch distributed process group for the workers. + model_config = copy.deepcopy(self.model_config) + parallel_config = copy.deepcopy(self.parallel_config) + scheduler_config = copy.deepcopy(self.scheduler_config) + device_config = copy.deepcopy(self.device_config) + + + for rank, (worker, (node_id, + _)) in enumerate(zip(self.workers, + worker_node_and_gpu_ids)): + local_rank = node_workers[node_id].index(rank) + worker.init_worker.remote( + lambda rank=rank, local_rank=local_rank: Worker( + model_config, + parallel_config, + scheduler_config, + device_config, + local_rank, + rank, + distributed_init_method, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + )) + self._run_workers( + "init_model", + get_all_outputs=True, + ) + self._run_workers( + "load_model", + get_all_outputs=True, + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers, + ) + # TODO align + """ + for rank, (worker, (node_id, + _)) in enumerate(zip(self.workers, + worker_node_and_gpu_ids), + start=1): + local_rank = node_workers[node_id].index(rank) + worker.init_worker.remote( + lambda rank=rank, local_rank=local_rank: Worker( + model_config, + parallel_config, + scheduler_config, + device_config, + local_rank, + rank, + distributed_init_method, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + )) + driver_rank = 0 + driver_local_rank = node_workers[driver_node_id].index(driver_rank) + self.driver_worker = Worker( + model_config, + parallel_config, + scheduler_config, + device_config, + driver_local_rank, + driver_rank, + distributed_init_method, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=True, + ) + # don't use cupy for eager mode + self._run_workers("init_model", + cupy_port=get_open_port() + if not model_config.enforce_eager else None) + self._run_workers( + "load_model", + max_concurrent_workers=self.parallel_config. 
+ max_parallel_loading_workers, + ) + """ + + def _verify_args(self) -> None: + self.model_config.verify_with_parallel_config(self.parallel_config) + self.cache_config.verify_with_parallel_config(self.parallel_config) + if self.lora_config: + self.lora_config.verify_with_model_config(self.model_config) + self.lora_config.verify_with_scheduler_config( + self.scheduler_config) + + def _init_cache(self) -> None: + """Profiles the memory usage and initializes the KV cache. + + The engine will first conduct a profiling of the existing memory usage. + Then, it calculate the maximum possible number of GPU and CPU blocks + that can be allocated with the remaining free memory. + More details can be found in the + :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method + from class :class:`~vllm.worker.Worker`. + + Afterwards, as there may be multiple workers, + we take the minimum number of blocks across all workers + to ensure this can be applied to all of them. + + Finally, the engine will initialize the KV cache + with the calculated number of blocks. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameters. + """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_blocks = self._run_workers( + "profile_num_available_blocks", + get_all_outputs=True, + block_size=self.cache_config.block_size, + gpu_memory_utilization=self.cache_config.gpu_memory_utilization, + cpu_swap_space=self.cache_config.swap_space_bytes, + cache_dtype=self.cache_config.cache_dtype, + ) + # TODO align + """ + num_blocks = self._run_workers( + "profile_num_available_blocks", + block_size=self.cache_config.block_size, + gpu_memory_utilization=self.cache_config.gpu_memory_utilization, + cpu_swap_space=self.cache_config.swap_space_bytes, + cache_dtype=self.cache_config.cache_dtype, + ) + """ + + # Since we use a shared centralized controller, we take the minimum + # number of blocks across all workers to make sure all the memory + # operators can be applied to all workers. + num_gpu_blocks = min(b[0] for b in num_blocks) + num_cpu_blocks = min(b[1] for b in num_blocks) + # FIXME(woosuk): Change to debug log. + logger.info(f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + + if num_gpu_blocks <= 0: + raise ValueError("No available memory for the cache blocks. " + "Try increasing `gpu_memory_utilization` when " + "initializing the engine.") + max_seq_len = self.cache_config.block_size * num_gpu_blocks + if self.model_config.max_model_len > max_seq_len: + raise ValueError( + f"The model's max seq len ({self.model_config.max_model_len}) " + "is larger than the maximum number of tokens that can be " + f"stored in KV cache ({max_seq_len}). Try increasing " + "`gpu_memory_utilization` or decreasing `max_model_len` when " + "initializing the engine.") + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + # Initialize the cache. + self._run_workers("init_cache_engine", cache_config=self.cache_config) + # Warm up the model. This includes capturing the model into CUDA graph + # if enforce_eager is False. + self._run_workers("warm_up_model") + + @classmethod + def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine": + """Creates an LLM engine from the engine arguments.""" + # Create the engine configs. + engine_configs = engine_args.create_engine_configs() + parallel_config = engine_configs[2] + # Initialize the cluster. 
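+        # The returned placement group (if any) is passed to the engine
+        # constructor, where _init_workers_ray uses its GPU bundles to place
+        # the Ray workers.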
+ placement_group = initialize_cluster(parallel_config) + # Create the LLM engine. + engine = cls(*engine_configs, + placement_group, + log_stats=not engine_args.disable_log_stats) + return engine + + def encode_request( + self, + request_id: str, # pylint: disable=unused-argument + prompt: Optional[str], + prompt_token_ids: Optional[List[int]] = None, + lora_request: Optional[LoRARequest] = None, + ): + if prompt_token_ids is None: + assert prompt is not None + prompt_token_ids = self.tokenizer.encode(request_id=request_id, + prompt=prompt, + lora_request=lora_request) + return prompt_token_ids + + def add_request( + self, + request_id: str, + prompt: Optional[str], + sampling_params: SamplingParams, + prompt_token_ids: Optional[List[int]] = None, + arrival_time: Optional[float] = None, + lora_request: Optional[LoRARequest] = None, + prefix_pos: Optional[int] = None, + ) -> None: + """Add a request to the engine's request pool. + + The request is added to the request pool and will be processed by the + scheduler as `engine.step()` is called. The exact scheduling policy is + determined by the scheduler. + + Args: + request_id: The unique ID of the request. + prompt: The prompt string. Can be None if prompt_token_ids is + provided. + sampling_params: The sampling parameters for text generation. + prompt_token_ids: The token IDs of the prompt. If None, we + use the tokenizer to convert the prompts to token IDs. + arrival_time: The arrival time of the request. If None, we use + the current monotonic time. + prefix_pos: If not None, we use the given position as the prefix + position for each prompt. We will cache the prefix's KV + cache and reuse it for the next request with the same prefix. + This is an experimental feature, and may be replaced with + automatic prefix caching in the future. + + Details: + - Set arrival_time to the current time if it is None. + - Set prompt_token_ids to the encoded prompt if it is None. + - Create `best_of` number of :class:`~vllm.Sequence` objects. + - Create a :class:`~vllm.SequenceGroup` object + from the list of :class:`~vllm.Sequence`. + - Add the :class:`~vllm.SequenceGroup` object to the scheduler. + + Example: + >>> # initialize engine + >>> engine = LLMEngine.from_engine_args(engine_args) + >>> # set request arguments + >>> example_prompt = "Who is the president of the United States?" + >>> sampling_params = SamplingParams(temperature=0.0) + >>> request_id = 0 + >>> + >>> # add the request to the engine + >>> engine.add_request( + >>> str(request_id), + >>> example_prompt, + >>> SamplingParams(temperature=0.0)) + >>> # continue the request processing + >>> ... + """ + if lora_request is not None and not self.lora_config: + raise ValueError(f"Got lora_request {lora_request} but LoRA is " + "not enabled!") + if arrival_time is None: + arrival_time = time.monotonic() + prompt_token_ids = self.encode_request( + request_id=request_id, + prompt=prompt, + prompt_token_ids=prompt_token_ids, + lora_request=lora_request) + + # Create the sequences. 
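+        # Note: a single Sequence is created here; when best_of > 1 the extra
+        # candidate sequences are forked from it later in
+        # _process_sequence_group_outputs.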
+ block_size = self.cache_config.block_size + seq_id = next(self.seq_counter) + seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, + lora_request) + + # Check whether the input specifies prefix + prefix = self.scheduler.prefix_pool.add_or_get_prefix( + prompt_token_ids[:prefix_pos], lora_request.lora_int_id + if lora_request else 0) if prefix_pos is not None else None + + # Defensive copy of SamplingParams, which are used by the sampler, + # this doesn't deep-copy LogitsProcessor objects + sampling_params = sampling_params.clone() + + # Create the sequence group. + seq_group = SequenceGroup(request_id, [seq], sampling_params, + arrival_time, lora_request, prefix) + + # Add the sequence group to the scheduler. + self.scheduler.add_seq_group(seq_group) + + def abort_request(self, request_id: Union[str, Iterable[str]]) -> None: + """Aborts a request(s) with the given ID. + + Args: + request_id: The ID(s) of the request to abort. + + Details: + - Refer to the + :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group` + from class :class:`~vllm.core.scheduler.Scheduler`. + + Example: + >>> # initialize engine and add a request with request_id + >>> request_id = str(0) + >>> # abort the request + >>> engine.abort_request(request_id) + """ + self.scheduler.abort_seq_group(request_id) + + def get_model_config(self) -> ModelConfig: + """Gets the model configuration.""" + return self.model_config + + def get_num_unfinished_requests(self) -> int: + """Gets the number of unfinished requests.""" + return self.scheduler.get_num_unfinished_seq_groups() + + def has_unfinished_requests(self) -> bool: + """Returns True if there are unfinished requests.""" + return self.scheduler.has_unfinished_seqs() + + def _check_beam_search_early_stopping( + self, + early_stopping: Union[bool, str], + sampling_params: SamplingParams, + best_running_seq: Sequence, + current_worst_seq: Sequence, + ) -> bool: + assert sampling_params.use_beam_search + length_penalty = sampling_params.length_penalty + if early_stopping is True: + return True + + current_worst_score = (current_worst_seq.get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=self.get_tokenizer_for_seq( + current_worst_seq).eos_token_id)) + if early_stopping is False: + highest_attainable_score = (best_running_seq.get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=self.get_tokenizer_for_seq( + best_running_seq).eos_token_id)) + else: + assert early_stopping == "never" + if length_penalty > 0.0: + # If length_penalty > 0.0, beam search will prefer longer + # sequences. The highest attainable score calculation is + # based on the longest possible sequence length in this case. + max_possible_length = max( + best_running_seq.get_prompt_len() + + sampling_params.max_tokens, + self.scheduler_config.max_model_len) + highest_attainable_score = ( + best_running_seq.get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=self.get_tokenizer_for_seq( + best_running_seq).eos_token_id, + seq_len=max_possible_length)) + else: + # Otherwise, beam search will prefer shorter sequences. The + # highest attainable score calculation is based on the current + # sequence length. 
+ highest_attainable_score = ( + best_running_seq.get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=self.get_tokenizer_for_seq( + best_running_seq).eos_token_id)) + return current_worst_score >= highest_attainable_score + + def _process_sequence_group_outputs(self, seq_group: SequenceGroup, + outputs: SequenceGroupOutput) -> None: + + # Process prompt logprobs + prompt_logprobs = outputs.prompt_logprobs + if prompt_logprobs is not None: + seq_group.prompt_logprobs = prompt_logprobs + + # Process samples + samples = outputs.samples + parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) + existing_finished_seqs = seq_group.get_finished_seqs() + parent_child_dict = { + parent_seq.seq_id: [] + for parent_seq in parent_seqs + } + for sample in samples: + parent_child_dict[sample.parent_seq_id].append(sample) + # List of (child, parent) + child_seqs: List[Tuple[Sequence, Sequence]] = [] + + # Process the child samples for each parent sequence + for parent in parent_seqs: + child_samples: List[SequenceOutput] = parent_child_dict[ + parent.seq_id] + if len(child_samples) == 0: + # This parent sequence has no children samples. Remove + # the parent sequence from the sequence group since it will + # not be used in the future iterations. + parent.status = SequenceStatus.FINISHED_ABORTED + seq_group.remove(parent.seq_id) + self.scheduler.free_seq(parent) + continue + # Fork the parent sequence if there are multiple child samples. + for child_sample in child_samples[:-1]: + new_child_seq_id = next(self.seq_counter) + child = parent.fork(new_child_seq_id) + child.append_token_id(child_sample.output_token, + child_sample.logprobs) + child_seqs.append((child, parent)) + # Continue the parent sequence for the last child sample. + # We reuse the parent sequence here to reduce redundant memory + # copies, especially when using non-beam search sampling methods. + last_child_sample = child_samples[-1] + parent.append_token_id(last_child_sample.output_token, + last_child_sample.logprobs) + child_seqs.append((parent, parent)) + + for seq, _ in child_seqs: + self._decode_sequence(seq, seq_group.sampling_params) + self._check_stop(seq, seq_group.sampling_params) + + # Non-beam search case + if not seq_group.sampling_params.use_beam_search: + # For newly created child sequences, add them to the sequence group + # and fork them in block manager if they are not finished. + for seq, parent in child_seqs: + if seq is not parent: + seq_group.add(seq) + if not seq.is_finished(): + self.scheduler.fork_seq(parent, seq) + + # Free the finished and selected parent sequences' memory in block + # manager. Keep them in the sequence group as candidate output. + # NOTE: we need to fork the new sequences before freeing the + # old sequences. + for seq, parent in child_seqs: + if seq is parent and seq.is_finished(): + self.scheduler.free_seq(seq) + return + + # Beam search case + # Select the child sequences to keep in the sequence group. + selected_child_seqs = [] + unselected_child_seqs = [] + beam_width = seq_group.sampling_params.best_of + length_penalty = seq_group.sampling_params.length_penalty + + # Select the newly finished sequences with the highest scores + # to replace existing finished sequences. 
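+        # Both pre-existing and newly finished candidates are ranked by their
+        # cumulative beam-search score, and only the top `beam_width`
+        # (best_of) finished sequences stay in the group.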
+ # Tuple of (seq, parent, is_new) + existing_finished_seqs = [(seq, None, False) + for seq in existing_finished_seqs] + new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs + if seq.is_finished()] + all_finished_seqs = existing_finished_seqs + new_finished_seqs + # Sort the finished sequences by their scores. + all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id), + reverse=True) + for seq, parent, is_new in all_finished_seqs[:beam_width]: + if is_new: + # A newly generated child sequence finishes and has a high + # score, so we will add it into the sequence group. + selected_child_seqs.append((seq, parent)) + for seq, parent, is_new in all_finished_seqs[beam_width:]: + if is_new: + # A newly generated child sequence finishes but has a low + # score, so we will not add it into the sequence group. + # Additionally, if this sequence is a continuation of a + # parent sequence, we will need remove the parent sequence + # from the sequence group. + unselected_child_seqs.append((seq, parent)) + else: + # An existing finished sequence has a low score, so we will + # remove it from the sequence group. + seq_group.remove(seq.seq_id) + + # select the top beam_width sequences from the running + # sequences for the next iteration to continue the beam + # search. + running_child_seqs = [(seq, parent) for seq, parent in child_seqs + if not seq.is_finished()] + # Sort the running sequences by their scores. + running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id), + reverse=True) + + # Check if we can stop the beam search. + if len(running_child_seqs) == 0: + # No running sequences, stop the beam search. + stop_beam_search = True + elif len(all_finished_seqs) < beam_width: + # Not enough finished sequences, continue the beam search. + stop_beam_search = False + else: + # Check the early stopping criteria + best_running_seq = running_child_seqs[0][0] + current_worst_seq = all_finished_seqs[beam_width - 1][0] + stop_beam_search = self._check_beam_search_early_stopping( + seq_group.sampling_params.early_stopping, + seq_group.sampling_params, best_running_seq, current_worst_seq) + + if stop_beam_search: + # Stop the beam search and remove all the running sequences from + # the sequence group. + unselected_child_seqs.extend(running_child_seqs) + else: + # Continue the beam search and select the top beam_width sequences + # to continue the beam search. + selected_child_seqs.extend(running_child_seqs[:beam_width]) + # The remaining running sequences will not be used in the next + # iteration. Again, if these sequences are continuations of + # parent sequences, we will need to remove the parent sequences + # from the sequence group. + unselected_child_seqs.extend(running_child_seqs[beam_width:]) + + # For newly created child sequences, add them to the sequence group + # and fork them in block manager if they are not finished. + for seq, parent in selected_child_seqs: + if seq is not parent: + seq_group.add(seq) + if not seq.is_finished(): + self.scheduler.fork_seq(parent, seq) + + # Free the finished and selected parent sequences' memory in block + # manager. Keep them in the sequence group as candidate output. 
+ for seq, parent in selected_child_seqs: + if seq is parent and seq.is_finished(): + self.scheduler.free_seq(seq) + + # Remove the unselected parent sequences from the sequence group and + # free their memory in block manager. + for seq, parent in unselected_child_seqs: + if seq is parent: + # Remove the parent sequence if it is not selected for next + # iteration + seq_group.remove(seq.seq_id) + self.scheduler.free_seq(seq) + + def _process_model_outputs( + self, output: SamplerOutput, + scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]: + now = time.time() + # Update the scheduled sequence groups with the model outputs. + scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups + for seq_group, outputs in zip(scheduled_seq_groups, output): + self._process_sequence_group_outputs(seq_group, outputs) + + # Free the finished sequence groups. + self.scheduler.free_finished_seq_groups() + + # Create the outputs. + request_outputs: List[RequestOutput] = [] + for seq_group in scheduled_seq_groups: + seq_group.maybe_set_first_token_time(now) + request_output = RequestOutput.from_seq_group(seq_group) + request_outputs.append(request_output) + for seq_group in scheduler_outputs.ignored_seq_groups: + request_output = RequestOutput.from_seq_group(seq_group) + request_outputs.append(request_output) + + # Update prefix state, now all the uncomputed prefixes are computed. + for seq_group in scheduled_seq_groups: + if (seq_group.prefix is not None and seq_group.prefix.allocated + and not seq_group.prefix.computed): + seq_group.prefix.computed = True + + # Log stats. + if self.log_stats: + self.stat_logger.log(self._get_stats(scheduler_outputs)) + + return request_outputs + + def step(self) -> List[RequestOutput]: + """Performs one decoding iteration and returns newly generated results. + + .. figure:: https://i.imgur.com/sv2HssD.png + :alt: Overview of the step function + :align: center + + Overview of the step function. + + Details: + - Step 1: Schedules the sequences to be executed in the next + iteration and the token blocks to be swapped in/out/copy. + + - Depending on the scheduling policy, + sequences may be `preempted/reordered`. + - A Sequence Group (SG) refer to a group of sequences + that are generated from the same prompt. + + - Step 2: Calls the workers to execute the model. + - Step 3: Processes the model output. This mainly includes: + + - Decodes the relevant outputs. + - Updates the scheduled sequence groups with model outputs + based on its `sampling parameters` (`use_beam_search` or not). + - Frees the finished sequence groups. + + - Finally, it creates and returns the newly generated results. + + Example: + >>> # Please see the example/ folder for more detailed examples. 
+ >>> + >>> # initialize engine and request arguments + >>> engine = LLMEngine.from_engine_args(engine_args) + >>> example_inputs = [(0, "What is LLM?", + >>> SamplingParams(temperature=0.0))] + >>> + >>> # Start the engine with an event loop + >>> while True: + >>> if example_inputs: + >>> req_id, prompt, sampling_params = example_inputs.pop(0) + >>> engine.add_request(str(req_id), prompt, sampling_params) + >>> + >>> # continue the request processing + >>> request_outputs = engine.step() + >>> for request_output in request_outputs: + >>> if request_output.finished: + >>> # return or show the request output + >>> + >>> if not (engine.has_unfinished_requests() or example_inputs): + >>> break + """ + seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule() + + output = self._run_workers( + "execute_model", + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in, + blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out, + blocks_to_copy=scheduler_outputs.blocks_to_copy, + ) if not scheduler_outputs.is_empty() else [] + + return self._process_model_outputs(output, scheduler_outputs) + # TODO align + """ + if not scheduler_outputs.is_empty(): + # Execute the model. + all_outputs = self._run_workers( + "execute_model", + driver_kwargs={ + "seq_group_metadata_list": seq_group_metadata_list, + "blocks_to_swap_in": scheduler_outputs.blocks_to_swap_in, + "blocks_to_swap_out": scheduler_outputs.blocks_to_swap_out, + "blocks_to_copy": scheduler_outputs.blocks_to_copy, + }, + use_ray_compiled_dag=USE_RAY_COMPILED_DAG) + + # Only the driver worker returns the sampling results. + output = all_outputs[0] + else: + output = [] + + return self._process_model_outputs(output, scheduler_outputs) + """ + + def do_log_stats(self) -> None: + """Forced log when no requests active.""" + if self.log_stats: + self.stat_logger.log(self._get_stats(scheduler_outputs=None)) + + def _get_stats(self, + scheduler_outputs: Optional[SchedulerOutputs]) -> Stats: + """Get Stats to be Logged to Prometheus.""" + now = time.monotonic() + + # KV Cache Usage in %. + num_total_gpu = self.cache_config.num_gpu_blocks + num_free_gpu = self.scheduler.block_manager.get_num_free_gpu_blocks() + gpu_cache_usage = 1.0 - (num_free_gpu / num_total_gpu) + + num_total_cpu = self.cache_config.num_cpu_blocks + cpu_cache_usage = 0. + if num_total_cpu > 0: + num_free_cpu = self.scheduler.block_manager.get_num_free_cpu_blocks( + ) + cpu_cache_usage = 1.0 - (num_free_cpu / num_total_cpu) + + # Scheduler State + num_running = len(self.scheduler.running) + num_swapped = len(self.scheduler.swapped) + num_waiting = len(self.scheduler.waiting) + + # Iteration stats if we have scheduler output. + num_prompt_tokens = 0 + num_generation_tokens = 0 + time_to_first_tokens = [] + time_per_output_tokens = [] + time_e2e_requests = [] + if scheduler_outputs is not None: + prompt_run = scheduler_outputs.prompt_run + + # Number of Tokens. + if prompt_run: + num_prompt_tokens = sum( + len(seq_group.prompt_token_ids) + for seq_group in scheduler_outputs.scheduled_seq_groups) + num_generation_tokens = sum( + seq_group.num_seqs() + for seq_group in scheduler_outputs.scheduled_seq_groups) + else: + num_generation_tokens = scheduler_outputs.num_batched_tokens + + # Latency Timings. + time_last_iters = [] + for seq_group in scheduler_outputs.scheduled_seq_groups: + # Time since last token. (n.b. 
updates seq_group.metrics.last_token_time) + time_last_iters.append(seq_group.get_last_latency(now)) + # Time since arrival for all finished requests. + if seq_group.is_finished(): + time_e2e_requests.append(now - + seq_group.metrics.arrival_time) + + time_to_first_tokens = time_last_iters if prompt_run else [] + time_per_output_tokens = [] if prompt_run else time_last_iters + + return Stats( + now=now, + num_running=num_running, + num_swapped=num_swapped, + num_waiting=num_waiting, + gpu_cache_usage=gpu_cache_usage, + cpu_cache_usage=cpu_cache_usage, + num_prompt_tokens=num_prompt_tokens, + num_generation_tokens=num_generation_tokens, + time_to_first_tokens=time_to_first_tokens, + time_per_output_tokens=time_per_output_tokens, + time_e2e_requests=time_e2e_requests, + ) + + def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None: + """Decodes the new token for a sequence.""" + (new_tokens, new_output_text, prefix_offset, + read_offset) = detokenize_incrementally( + self.get_tokenizer_for_seq(seq), + all_input_ids=seq.get_token_ids(), + prev_tokens=seq.tokens, + prefix_offset=seq.prefix_offset, + read_offset=seq.read_offset, + skip_special_tokens=prms.skip_special_tokens, + spaces_between_special_tokens=prms.spaces_between_special_tokens, + ) + if seq.tokens is None: + seq.tokens = new_tokens + else: + seq.tokens.extend(new_tokens) + seq.prefix_offset = prefix_offset + seq.read_offset = read_offset + seq.output_text += new_output_text + + def _check_stop(self, seq: Sequence, + sampling_params: SamplingParams) -> None: + """Stop the finished sequences.""" + for stop_str in sampling_params.stop: + if seq.output_text.endswith(stop_str): + self._finalize_sequence(seq, sampling_params, stop_str) + seq.status = SequenceStatus.FINISHED_STOPPED + return + if seq.get_last_token_id() in sampling_params.stop_token_ids: + stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( + seq.get_last_token_id()) + self._finalize_sequence(seq, sampling_params, stop_str) + seq.status = SequenceStatus.FINISHED_STOPPED + return + + # Check if the sequence has reached max_model_len. + if seq.get_len() > self.scheduler_config.max_model_len: + seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED + return + + # Check if the sequence has reached max_tokens. + if seq.get_output_len() == sampling_params.max_tokens: + seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED + return + + # Check if the sequence has generated the EOS token. + if ((not sampling_params.ignore_eos) and seq.get_last_token_id() + == self.get_tokenizer_for_seq(seq).eos_token_id): + seq.status = SequenceStatus.FINISHED_STOPPED + return + + def _finalize_sequence(self, seq: Sequence, + sampling_params: SamplingParams, + stop_string: str) -> None: + if sampling_params.include_stop_str_in_output: + return + + if stop_string and seq.output_text.endswith(stop_string): + # Truncate the output text so that the stop string is + # not included in the output. + seq.output_text = seq.output_text[:-len(stop_string)] + + def add_lora(self, lora_request: LoRARequest) -> bool: + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." + return self._run_workers( + "add_lora", + lora_request=lora_request, + ) + + def remove_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." 
+ return self._run_workers( + "remove_lora", + lora_id=lora_id, + ) + + def list_loras(self) -> List[int]: + return self._run_workers("list_loras") + + def _run_workers_in_batch( + self, + workers, + method: str, + *args, + **kwargs, + ): + all_outputs = [] + for worker in workers: + if self.parallel_config.worker_use_ray: + executor = partial(worker.execute_method.remote, method) + else: + executor = getattr(worker, method) + + output = executor(*args, **kwargs) + all_outputs.append(output) + if self.parallel_config.worker_use_ray: + all_outputs = ray.get(all_outputs) + return all_outputs + + def _run_workers( + self, + method: str, + *args, + get_all_outputs: bool = False, + max_concurrent_workers: Optional[int] = None, + **kwargs, + ) -> Any: + """Runs the given method on all workers.""" + all_outputs = [] + if max_concurrent_workers: + work_groups = [ + self.workers[i:i + max_concurrent_workers] + for i in range(0, len(self.workers), max_concurrent_workers) + ] + else: + work_groups = [self.workers] + + for workers in work_groups: + all_outputs.extend( + self._run_workers_in_batch(workers, method, *args, **kwargs)) + + if get_all_outputs: + return all_outputs + + # Make sure all workers have the same results. + output = all_outputs[0] + for other_output in all_outputs[1:]: + assert output == other_output + return output + + # TODO align + """ + def _run_workers( + self, + method: str, + *args, + driver_args: Optional[List[Any]] = None, + driver_kwargs: Optional[Dict[str, Any]] = None, + max_concurrent_workers: Optional[int] = None, + use_ray_compiled_dag: bool = False, + **kwargs, + ) -> Any: + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + if use_ray_compiled_dag: + # Right now, compiled DAG can only accept a single + # input. TODO(sang): Fix it. + output_channels = self.forward_dag.execute(1) + else: + # Start the ray workers first. + ray_worker_outputs = [ + worker.execute_method.remote(method, *args, **kwargs) + for worker in self.workers + ] + + if driver_args is None: + driver_args = args + if driver_kwargs is None: + driver_kwargs = kwargs + + # Start the driver worker after all the ray workers. + driver_worker_output = getattr(self.driver_worker, + method)(*driver_args, **driver_kwargs) + + # Get the results of the ray workers. + if self.workers: + if use_ray_compiled_dag: + try: + ray_worker_outputs = [ + pickle.loads(chan.begin_read()) + for chan in output_channels + ] + finally: + # Has to call end_read in order to reuse the DAG. + for chan in output_channels: + chan.end_read() + else: + ray_worker_outputs = ray.get(ray_worker_outputs) + + return [driver_worker_output] + ray_worker_outputs + + def _compiled_ray_dag(self): + import pkg_resources + required_version = "2.9" + current_version = pkg_resources.get_distribution("ray").version + if current_version < required_version: + raise ValueError(f"Ray version {required_version} or greater is " + f"required, but found {current_version}") + + from ray.dag import MultiOutputNode, InputNode + assert self.parallel_config.worker_use_ray + + # Right now, compiled DAG requires at least 1 arg. We send + # a dummy value for now. It will be fixed soon. 
+        with InputNode() as input_data:
+            forward_dag = MultiOutputNode([
+                worker.execute_model_compiled_dag_remote.bind(input_data)
+                for worker in self.workers
+            ])
+            return forward_dag.experimental_compile()
+    """
\ No newline at end of file
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
new file mode 100644
index 0000000..54b09c3
--- /dev/null
+++ b/vllm/engine/metrics.py
@@ -0,0 +1,225 @@
+from vllm.logger import init_logger
+from prometheus_client import Counter, Gauge, Histogram, Info, REGISTRY, disable_created_metrics
+
+import time
+import numpy as np
+from typing import Dict, List
+from dataclasses import dataclass
+
+logger = init_logger(__name__)
+
+disable_created_metrics()
+
+# The begin-* and end-* markers here are used by the documentation
+# generator to extract the metrics definitions.
+
+
+# begin-metrics-definitions
+class Metrics:
+
+    def __init__(self, labelnames: List[str]):
+        # Unregister any existing vLLM collectors
+        for collector in list(REGISTRY._collector_to_names):
+            if hasattr(collector, "_name") and "vllm" in collector._name:
+                REGISTRY.unregister(collector)
+
+        self.info_cache_config = Info(
+            name='vllm:cache_config',
+            documentation='information of cache_config')
+
+        # System stats
+        self.gauge_scheduler_running = Gauge(
+            name="vllm:num_requests_running",
+            documentation="Number of requests currently running on GPU.",
+            labelnames=labelnames)
+        self.gauge_scheduler_swapped = Gauge(
+            name="vllm:num_requests_swapped",
+            documentation="Number of requests swapped to CPU.",
+            labelnames=labelnames)
+        self.gauge_scheduler_waiting = Gauge(
+            name="vllm:num_requests_waiting",
+            documentation="Number of requests waiting to be processed.",
+            labelnames=labelnames)
+        self.gauge_gpu_cache_usage = Gauge(
+            name="vllm:gpu_cache_usage_perc",
+            documentation="GPU KV-cache usage. 1 means 100 percent usage.",
+            labelnames=labelnames)
+        self.gauge_cpu_cache_usage = Gauge(
+            name="vllm:cpu_cache_usage_perc",
+            documentation="CPU KV-cache usage. 1 means 100 percent usage.",
+            labelnames=labelnames)
+
+        # Raw stats from last model iteration
+        self.counter_prompt_tokens = Counter(
+            name="vllm:prompt_tokens_total",
+            documentation="Number of prefill tokens processed.",
+            labelnames=labelnames)
+        self.counter_generation_tokens = Counter(
+            name="vllm:generation_tokens_total",
+            documentation="Number of generation tokens processed.",
+            labelnames=labelnames)
+        self.histogram_time_to_first_token = Histogram(
+            name="vllm:time_to_first_token_seconds",
+            documentation="Histogram of time to first token in seconds.",
+            labelnames=labelnames,
+            buckets=[
+                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
+                0.75, 1.0, 2.5, 5.0, 7.5, 10.0
+            ])
+        self.histogram_time_per_output_token = Histogram(
+            name="vllm:time_per_output_token_seconds",
+            documentation="Histogram of time per output token in seconds.",
+            labelnames=labelnames,
+            buckets=[
+                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
+                1.0, 2.5
+            ])
+        self.histogram_e2e_request_latency = Histogram(
+            name="vllm:e2e_request_latency_seconds",
+            documentation="Histogram of end to end request latency in seconds.",
+            labelnames=labelnames,
+            buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
+
+        # Legacy metrics
+        self.gauge_avg_prompt_throughput = Gauge(
+            name="vllm:avg_prompt_throughput_toks_per_s",
+            documentation="Average prefill throughput in tokens/s.",
+            labelnames=labelnames,
+        )
+        self.gauge_avg_generation_throughput = Gauge(
+            name="vllm:avg_generation_throughput_toks_per_s",
+            documentation="Average generation throughput in tokens/s.",
+            labelnames=labelnames,
+        )
+
+
+# end-metrics-definitions
+
+
+@dataclass
+class Stats:
+    """Created by LLMEngine for use by StatLogger."""
+    now: float
+
+    # System stats.
+    num_running: int
+    num_waiting: int
+    num_swapped: int
+    gpu_cache_usage: float
+    cpu_cache_usage: float
+
+    # Raw stats from last model iteration.
+    num_prompt_tokens: int
+    num_generation_tokens: int
+    time_to_first_tokens: List[float]
+    time_per_output_tokens: List[float]
+    time_e2e_requests: List[float]
+
+
+class StatLogger:
+    """StatLogger is used by LLMEngine to log to Prometheus and stdout."""
+
+    def __init__(self, local_interval: float, labels: Dict[str, str]) -> None:
+        # Metadata for logging locally.
+        self.last_local_log = time.monotonic()
+        self.local_interval = local_interval
+
+        # Tracked stats over current local logging interval.
+        self.num_prompt_tokens: List[int] = []
+        self.num_generation_tokens: List[int] = []
+
+        # Prometheus metrics
+        self.labels = labels
+        self.metrics = Metrics(labelnames=list(labels.keys()))
+
+    def info(self, type: str, obj: object) -> None:
+        if type == "cache_config":
+            self.metrics.info_cache_config.info(obj.metrics_info())
+
+    def _get_throughput(self, tracked_stats: List[int], now: float) -> float:
+        return float(np.sum(tracked_stats) / (now - self.last_local_log))
+
+    def _local_interval_elapsed(self, now: float) -> bool:
+        elapsed_time = now - self.last_local_log
+        return elapsed_time > self.local_interval
+
+    def _log_prometheus(self, stats: Stats) -> None:
+        # Set system stat gauges.
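+        # .labels(**self.labels) selects the child series for this engine's
+        # label set; prometheus_client creates a child lazily the first time
+        # a given label combination is used.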
+        self.metrics.gauge_scheduler_running.labels(**self.labels).set(
+            stats.num_running)
+        self.metrics.gauge_scheduler_swapped.labels(**self.labels).set(
+            stats.num_swapped)
+        self.metrics.gauge_scheduler_waiting.labels(**self.labels).set(
+            stats.num_waiting)
+        self.metrics.gauge_gpu_cache_usage.labels(**self.labels).set(
+            stats.gpu_cache_usage)
+        self.metrics.gauge_cpu_cache_usage.labels(**self.labels).set(
+            stats.cpu_cache_usage)
+
+        # Add to token counters.
+        self.metrics.counter_prompt_tokens.labels(**self.labels).inc(
+            stats.num_prompt_tokens)
+        self.metrics.counter_generation_tokens.labels(**self.labels).inc(
+            stats.num_generation_tokens)
+
+        # Observe request level latencies in histograms.
+        for ttft in stats.time_to_first_tokens:
+            self.metrics.histogram_time_to_first_token.labels(
+                **self.labels).observe(ttft)
+        for tpot in stats.time_per_output_tokens:
+            self.metrics.histogram_time_per_output_token.labels(
+                **self.labels).observe(tpot)
+        for e2e in stats.time_e2e_requests:
+            self.metrics.histogram_e2e_request_latency.labels(
+                **self.labels).observe(e2e)
+
+    def _log_prometheus_interval(self, prompt_throughput: float,
+                                 generation_throughput: float) -> None:
+        # Logs metrics to Prometheus that are computed every logging_interval.
+        # These legacy gauges compute throughput on the vLLM side. Moving
+        # forward, prefer the raw counters (counter_prompt_tokens,
+        # counter_generation_tokens), which log raw data, and derive rates
+        # with rate() on the Grafana/Prometheus side.
+        # See https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666
+        self.metrics.gauge_avg_prompt_throughput.labels(
+            **self.labels).set(prompt_throughput)
+        self.metrics.gauge_avg_generation_throughput.labels(
+            **self.labels).set(generation_throughput)
+
+    def log(self, stats: Stats) -> None:
+        """Called by LLMEngine.
+        Logs to Prometheus and tracked stats every iteration.
+        Logs to stdout every self.local_interval seconds."""
+
+        # Log to Prometheus.
+        self._log_prometheus(stats)
+
+        # Save tracked stats for token counters.
+        self.num_prompt_tokens.append(stats.num_prompt_tokens)
+        self.num_generation_tokens.append(stats.num_generation_tokens)
+
+        # Log locally every local_interval seconds.
+        if self._local_interval_elapsed(stats.now):
+
+            # Compute summary metrics for tracked stats (and log them to
+            # Prometheus if applicable).
+            prompt_throughput = self._get_throughput(self.num_prompt_tokens,
+                                                     now=stats.now)
+            generation_throughput = self._get_throughput(
+                self.num_generation_tokens, now=stats.now)
+            self._log_prometheus_interval(
+                prompt_throughput=prompt_throughput,
+                generation_throughput=generation_throughput)
+
+            # Log to stdout.
+            logger.info(
+                f"Avg prompt throughput: {prompt_throughput:.1f} tokens/s, "
+                f"Avg generation throughput: {generation_throughput:.1f} tokens/s, "
+                f"Running: {stats.num_running} reqs, "
+                f"Swapped: {stats.num_swapped} reqs, "
+                f"Pending: {stats.num_waiting} reqs, "
+                f"GPU KV cache usage: {stats.gpu_cache_usage * 100:.1f}%, "
+                f"CPU KV cache usage: {stats.cpu_cache_usage * 100:.1f}%")
+
+            # Reset tracked stats for next interval.
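+            # Worked example: if the elapsed window is 5 s and the tracked
+            # prompt-token counts were [120, 80, 100], _get_throughput yields
+            # (120 + 80 + 100) / 5 = 60 tokens/s.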
+            self.num_prompt_tokens = []
+            self.num_generation_tokens = []
+            self.last_local_log = stats.now
diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py
new file mode 100644
index 0000000..fb4e770
--- /dev/null
+++ b/vllm/engine/ray_utils.py
@@ -0,0 +1,157 @@
+import pickle
+
+from typing import Optional, List, Tuple, TYPE_CHECKING
+
+from vllm.config import ParallelConfig
+from vllm.logger import init_logger
+from vllm.utils import is_hip, set_cuda_visible_devices, get_ip
+
+logger = init_logger(__name__)
+
+try:
+    import ray
+
+    class RayWorkerVllm:
+        """Ray wrapper for vllm.worker.Worker, allowing Worker to be
+        lazily initialized after Ray sets CUDA_VISIBLE_DEVICES."""
+
+        def __init__(self, init_cached_hf_modules=False) -> None:
+            if init_cached_hf_modules:
+                from transformers.dynamic_module_utils import init_hf_modules
+                init_hf_modules()
+            self.worker = None
+            # The compiled DAG runs its main execution in a different thread,
+            # which calls cuda.set_device. This flag records whether
+            # set_device has already been called on that thread.
+            self.compiled_dag_cuda_device_set = False
+
+        def init_worker(self, worker_init_fn):
+            self.worker = worker_init_fn()
+
+        def __getattr__(self, name):
+            return getattr(self.worker, name)
+
+        def execute_method(self, method, *args, **kwargs):
+            executor = getattr(self, method)
+            return executor(*args, **kwargs)
+
+        def get_node_ip(self) -> str:
+            return get_ip()
+
+        def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
+            node_id = ray.get_runtime_context().get_node_id()
+            gpu_ids = ray.get_gpu_ids()
+            return node_id, gpu_ids
+
+        def set_cuda_visible_devices(self, device_ids) -> None:
+            set_cuda_visible_devices(device_ids)
+
+        def execute_model_compiled_dag_remote(self, ignored):
+            """Used only when compiled DAG is enabled."""
+            import torch
+            if not self.compiled_dag_cuda_device_set:
+                torch.cuda.set_device(self.worker.device)
+                self.compiled_dag_cuda_device_set = True
+
+            output = self.worker.execute_model()
+            output = pickle.dumps(output)
+            return output
+
+except ImportError as e:
+    logger.warning(f"Failed to import Ray with {e!r}. "
+                   "For distributed inference, please install Ray with "
+                   "`pip install ray`.")
+    ray = None
+    RayWorkerVllm = None
+
+if TYPE_CHECKING:
+    from ray.util.placement_group import PlacementGroup
+
+
+def initialize_cluster(
+    parallel_config: ParallelConfig,
+    engine_use_ray: bool = False,
+    ray_address: Optional[str] = None,
+) -> Optional["PlacementGroup"]:
+    """Initialize the distributed cluster, possibly with Ray.
+
+    Args:
+        parallel_config: The configurations for parallel execution.
+        engine_use_ray: Whether to use Ray for async engine.
+        ray_address: The address of the Ray cluster. If None, uses
+            the default Ray cluster address.
+
+    Returns:
+        An optional `PlacementGroup`. It includes the specification
+        of the resources for each distributed worker. None if Ray is
+        not used.
+    """
+    if parallel_config.worker_use_ray or engine_use_ray:
+        if ray is None:
+            raise ImportError(
+                "Ray is not installed. Please install Ray to use distributed "
+                "serving.")
+        import os
+        enable_head_ray = os.environ.get("ENABLE_HEAD_RAY", None)
+        if enable_head_ray is None:
+            if is_hip():
+                ray.init(address=ray_address,
+                         ignore_reinit_error=True,
+                         num_gpus=parallel_config.world_size)
+            else:
+                ray.init(address=ray_address,
+                         ignore_reinit_error=True,
+                         num_gpus=parallel_config.world_size)
+        else:
+            ray.init()
+        # TODO align
+        """
+        # Connect to a ray cluster.
+        if is_hip():
+            ray.init(address=ray_address,
+                     ignore_reinit_error=True,
+                     num_gpus=parallel_config.world_size)
+        else:
+            ray.init(address=ray_address, ignore_reinit_error=True)
+        """
+
+    if not parallel_config.worker_use_ray:
+        assert parallel_config.world_size == 1, (
+            "Ray is required if parallel_config.world_size > 1.")
+        return None
+
+    # Create placement group for worker processes
+    current_placement_group = ray.util.get_current_placement_group()
+    if current_placement_group:
+        # We are in a placement group
+        bundles = current_placement_group.bundle_specs
+        # Verify that we can use the placement group.
+        gpu_bundles = 0
+        for bundle in bundles:
+            bundle_gpus = bundle.get("GPU", 0)
+            if bundle_gpus > 1:
+                raise ValueError(
+                    "Placement group bundle cannot have more than 1 GPU.")
+            if bundle_gpus:
+                gpu_bundles += 1
+        if parallel_config.world_size > gpu_bundles:
+            raise ValueError(
+                "The number of required GPUs exceeds the total number of "
+                "available GPUs in the placement group.")
+    else:
+        num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0)
+        if parallel_config.world_size > num_gpus_in_cluster:
+            raise ValueError(
+                "The number of required GPUs exceeds the total number of "
+                "available GPUs in the cluster.")
+        # Create a new placement group
+        placement_group_specs = ([{"GPU": 1}] * parallel_config.world_size)
+        current_placement_group = ray.util.placement_group(
+            placement_group_specs)
+        # Wait until PG is ready - this will block until all
+        # requested resources are available, and will timeout
+        # if they cannot be provisioned.
+        ray.get(current_placement_group.ready(), timeout=1800)
+
+    return current_placement_group
diff --git a/vllm/entrypoints/__init__.py b/vllm/entrypoints/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py
new file mode 100644
--- /dev/null
+++ b/vllm/entrypoints/api_server.py
@@ -0,0 +1,105 @@
+import argparse
+import json
+from typing import AsyncGenerator
+
+import uvicorn
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse, Response, StreamingResponse
+
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.sampling_params import SamplingParams
+from vllm.utils import random_uuid
+
+TIMEOUT_KEEP_ALIVE = 5  # seconds.
+app = FastAPI()
+engine = None
+
+
+@app.get("/health")
+async def health() -> Response:
+    """Health check."""
+    return Response(status_code=200)
+
+
+@app.post("/generate")
+async def generate(request: Request) -> Response:
+    """Generate completion for the request.
+
+    The request should be a JSON object with the following fields:
+    - prompt: the prompt to use for the generation.
+    - stream: whether to stream the results or not.
+    - other fields: the sampling parameters (See `SamplingParams` for details).
+    """
+    request_dict = await request.json()
+    prompt = request_dict.pop("prompt")
+    prefix_pos = request_dict.pop("prefix_pos", None)
+    stream = request_dict.pop("stream", False)
+    sampling_params = SamplingParams(**request_dict)
+    request_id = random_uuid()
+
+    results_generator = engine.generate(prompt,
+                                        sampling_params,
+                                        request_id,
+                                        prefix_pos=prefix_pos)
+
+    # Streaming case
+    async def stream_results() -> AsyncGenerator[bytes, None]:
+        async for request_output in results_generator:
+            prompt = request_output.prompt
+            text_outputs = [
+                prompt + output.text for output in request_output.outputs
+            ]
+            ret = {"text": text_outputs}
+            yield (json.dumps(ret) + "\0").encode("utf-8")
+
+    if stream:
+        return StreamingResponse(stream_results())
+
+    # Non-streaming case
+    final_output = None
+    async for request_output in results_generator:
+        if await request.is_disconnected():
+            # Abort the request if the client disconnects.
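+            # (HTTP 499 below mirrors nginx's non-standard
+            # "client closed request" status code.)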
+ await engine.abort(request_id) + return Response(status_code=499) + final_output = request_output + + assert final_output is not None + prompt = final_output.prompt + text_outputs = [prompt + output.text for output in final_output.outputs] + ret = {"text": text_outputs} + return JSONResponse(ret) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default=None) + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--ssl-keyfile", type=str, default=None) + parser.add_argument("--ssl-certfile", type=str, default=None) + parser.add_argument( + "--root-path", + type=str, + default=None, + help="FastAPI root_path when app is behind a path based routing proxy") + parser = AsyncEngineArgs.add_cli_args(parser) + args = parser.parse_args() + + engine_args = AsyncEngineArgs.from_cli_args(args) + engine = AsyncLLMEngine.from_engine_args(engine_args) + + app.root_path = args.root_path + uvicorn.run(app, + host=args.host, + port=args.port, + log_level="debug", + timeout_keep_alive=TIMEOUT_KEEP_ALIVE, + ssl_keyfile=args.ssl_keyfile, + ssl_certfile=args.ssl_certfile) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py new file mode 100644 index 0000000..fc82018 --- /dev/null +++ b/vllm/entrypoints/llm.py @@ -0,0 +1,220 @@ +from typing import List, Optional, Union + +from tqdm import tqdm +from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast + +from vllm.lora.request import LoRARequest +from vllm.engine.arg_utils import EngineArgs +from vllm.engine.llm_engine import LLMEngine +from vllm.outputs import RequestOutput +from vllm.sampling_params import SamplingParams +from vllm.utils import Counter + + +class LLM: + """An LLM for generating texts from given prompts and sampling parameters. + + This class includes a tokenizer, a language model (possibly distributed + across multiple GPUs), and GPU memory space allocated for intermediate + states (aka KV cache). Given a batch of prompts and sampling parameters, + this class generates texts from the model, using an intelligent batching + mechanism and efficient memory management. + + NOTE: This class is intended to be used for offline inference. For online + serving, use the `AsyncLLMEngine` class instead. + NOTE: For the comprehensive list of arguments, see `EngineArgs`. + + Args: + model: The name or path of a HuggingFace Transformers model. + tokenizer: The name or path of a HuggingFace Transformers tokenizer. + tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer + if available, and "slow" will always use the slow tokenizer. + trust_remote_code: Trust remote code (e.g., from HuggingFace) when + downloading the model and tokenizer. + tensor_parallel_size: The number of GPUs to use for distributed + execution with tensor parallelism. + dtype: The data type for the model weights and activations. Currently, + we support `float32`, `float16`, and `bfloat16`. If `auto`, we use + the `torch_dtype` attribute specified in the model config file. + However, if the `torch_dtype` in the config is `float32`, we will + use `float16` instead. + quantization: The method used to quantize the model weights. Currently, + we support "awq", "gptq" and "squeezellm". If None, we first check + the `quantization_config` attribute in the model config file. If + that is None, we assume the model weights are not quantized and use + `dtype` to determine the data type of the weights. + revision: The specific model version to use. 
It can be a branch name, + a tag name, or a commit id. + tokenizer_revision: The specific tokenizer version to use. It can be a + branch name, a tag name, or a commit id. + seed: The seed to initialize the random number generator for sampling. + gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to + reserve for the model weights, activations, and KV cache. Higher + values will increase the KV cache size and thus improve the model's + throughput. However, if the value is too high, it may cause out-of- + memory (OOM) errors. + swap_space: The size (GiB) of CPU memory per GPU to use as swap space. + This can be used for temporarily storing the states of the requests + when their `best_of` sampling parameters are larger than 1. If all + requests will have `best_of=1`, you can safely set this to 0. + Otherwise, too small values may cause out-of-memory (OOM) errors. + enforce_eager: Whether to enforce eager execution. If True, we will + disable CUDA graph and always execute the model in eager mode. + If False, we will use CUDA graph and eager execution in hybrid. + max_context_len_to_capture: Maximum context len covered by CUDA graphs. + When a sequence has context length larger than this, we fall back + to eager mode. + disable_custom_all_reduce: See ParallelConfig + """ + + def __init__( + self, + model: str, + tokenizer: Optional[str] = None, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + tensor_parallel_size: int = 1, + dtype: str = "auto", + quantization: Optional[str] = None, + revision: Optional[str] = None, + tokenizer_revision: Optional[str] = None, + seed: int = 0, + gpu_memory_utilization: float = 0.9, + swap_space: int = 4, + enforce_eager: bool = False, + max_context_len_to_capture: int = 8192, + disable_custom_all_reduce: bool = False, + **kwargs, + ) -> None: + if "disable_log_stats" not in kwargs: + kwargs["disable_log_stats"] = True + engine_args = EngineArgs( + model=model, + tokenizer=tokenizer, + tokenizer_mode=tokenizer_mode, + trust_remote_code=trust_remote_code, + tensor_parallel_size=tensor_parallel_size, + dtype=dtype, + quantization=quantization, + revision=revision, + tokenizer_revision=tokenizer_revision, + seed=seed, + gpu_memory_utilization=gpu_memory_utilization, + swap_space=swap_space, + enforce_eager=enforce_eager, + max_context_len_to_capture=max_context_len_to_capture, + disable_custom_all_reduce=disable_custom_all_reduce, + **kwargs, + ) + self.llm_engine = LLMEngine.from_engine_args(engine_args) + self.request_counter = Counter() + + def get_tokenizer( + self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + return self.llm_engine.tokenizer.tokenizer + + def set_tokenizer( + self, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + ) -> None: + self.llm_engine.tokenizer.tokenizer = tokenizer + + def generate( + self, + prompts: Optional[Union[str, List[str]]] = None, + sampling_params: Optional[SamplingParams] = None, + prompt_token_ids: Optional[List[List[int]]] = None, + prefix_pos: Optional[Union[int, List[int]]] = None, + use_tqdm: bool = True, + lora_request: Optional[LoRARequest] = None, + ) -> List[RequestOutput]: + """Generates the completions for the input prompts. + + NOTE: This class automatically batches the given prompts, considering + the memory constraint. For the best performance, put all of your prompts + into a single list and pass it to this method. + + Args: + prompts: A list of prompts to generate completions for. 
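+                For example, ["Hello, my name is", "The capital of France is"].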
+ sampling_params: The sampling parameters for text generation. If + None, we use the default sampling parameters. + prompt_token_ids: A list of token IDs for the prompts. If None, we + use the tokenizer to convert the prompts to token IDs. + prefix_pos: If not None, we use the given position as the prefix + position for each prompt. We will cache the prefix's KV + cache and reuse it for the next request with the same prefix. + This is an experimental feature, and may be replaced with + automatic prefix caching in the future. + use_tqdm: Whether to use tqdm to display the progress bar. + lora_request: LoRA request to use for generation, if any. + + Returns: + A list of `RequestOutput` objects containing the generated + completions in the same order as the input prompts. + """ + if prompts is None and prompt_token_ids is None: + raise ValueError("Either prompts or prompt_token_ids must be " + "provided.") + if isinstance(prompts, str): + # Convert a single prompt to a list. + prompts = [prompts] + if (prompts is not None and prompt_token_ids is not None + and len(prompts) != len(prompt_token_ids)): + raise ValueError("The lengths of prompts and prompt_token_ids " + "must be the same.") + if sampling_params is None: + # Use default sampling params. + sampling_params = SamplingParams() + + # Add requests to the engine. + num_requests = len(prompts) if prompts is not None else len( + prompt_token_ids) + for i in range(num_requests): + prompt = prompts[i] if prompts is not None else None + prefix_pos_i = prefix_pos[i] if prefix_pos is not None else None + token_ids = None if prompt_token_ids is None else prompt_token_ids[ + i] + self._add_request(prompt, + sampling_params, + token_ids, + lora_request=lora_request, + prefix_pos=prefix_pos_i) + return self._run_engine(use_tqdm) + + def _add_request( + self, + prompt: Optional[str], + sampling_params: SamplingParams, + prompt_token_ids: Optional[List[int]], + lora_request: Optional[LoRARequest] = None, + prefix_pos: Optional[int] = None, + ) -> None: + request_id = str(next(self.request_counter)) + self.llm_engine.add_request(request_id, + prompt, + sampling_params, + prompt_token_ids, + lora_request=lora_request, + prefix_pos=prefix_pos) + + def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]: + # Initialize tqdm. + if use_tqdm: + num_requests = self.llm_engine.get_num_unfinished_requests() + pbar = tqdm(total=num_requests, desc="Processed prompts") + # Run the engine. + outputs: List[RequestOutput] = [] + while self.llm_engine.has_unfinished_requests(): + step_outputs = self.llm_engine.step() + for output in step_outputs: + if output.finished: + outputs.append(output) + if use_tqdm: + pbar.update(1) + if use_tqdm: + pbar.close() + # Sort the outputs by request ID. + # This is necessary because some requests may be finished earlier than + # its previous requests. 
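+        # Request IDs are issued by self.request_counter as consecutive
+        # integers (stringified in _add_request), so sorting on
+        # int(request_id) restores submission order.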
+        outputs = sorted(outputs, key=lambda x: int(x.request_id))
+        return outputs
diff --git a/vllm/entrypoints/openai/__init__.py b/vllm/entrypoints/openai/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
new file mode 100644
index 0000000..b2f0401
--- /dev/null
+++ b/vllm/entrypoints/openai/api_server.py
@@ -0,0 +1,251 @@
+import argparse
+import asyncio
+import json
+from contextlib import asynccontextmanager
+import os
+import importlib
+import inspect
+
+from prometheus_client import make_asgi_app
+import fastapi
+import uvicorn
+from http import HTTPStatus
+from fastapi import Request
+from fastapi.exceptions import RequestValidationError
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse, StreamingResponse, Response
+
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest, ErrorResponse
+from vllm.logger import init_logger
+from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
+from vllm.entrypoints.openai.serving_engine import LoRA
+
+TIMEOUT_KEEP_ALIVE = 5  # seconds
+
+openai_serving_chat: OpenAIServingChat = None
+openai_serving_completion: OpenAIServingCompletion = None
+logger = init_logger(__name__)
+
+
+@asynccontextmanager
+async def lifespan(app: fastapi.FastAPI):
+
+    async def _force_log():
+        while True:
+            await asyncio.sleep(10)
+            await engine.do_log_stats()
+
+    if not engine_args.disable_log_stats:
asyncio.create_task(_force_log()) + + yield + + +app = fastapi.FastAPI(lifespan=lifespan) + + +class LoRAParserAction(argparse.Action): + + def __call__(self, parser, namespace, values, option_string=None): + lora_list = [] + for item in values: + name, path = item.split('=') + lora_list.append(LoRA(name, path)) + setattr(namespace, self.dest, lora_list) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="vLLM OpenAI-Compatible RESTful API server.") + parser.add_argument("--host", type=str, default=None, help="host name") + parser.add_argument("--port", type=int, default=8000, help="port number") + parser.add_argument("--allow-credentials", + action="store_true", + help="allow credentials") + parser.add_argument("--allowed-origins", + type=json.loads, + default=["*"], + help="allowed origins") + parser.add_argument("--allowed-methods", + type=json.loads, + default=["*"], + help="allowed methods") + parser.add_argument("--allowed-headers", + type=json.loads, + default=["*"], + help="allowed headers") + parser.add_argument( + "--api-key", + type=str, + default=None, + help= + "If provided, the server will require this key to be presented in the header." + ) + parser.add_argument("--served-model-name", + type=str, + default=None, + help="The model name used in the API. If not " + "specified, the model name will be the same as " + "the huggingface name.") + parser.add_argument( + "--lora-modules", + type=str, + default=None, + nargs='+', + action=LoRAParserAction, + help= + "LoRA module configurations in the format name=path. Multiple modules can be specified." + ) + parser.add_argument("--chat-template", + type=str, + default=None, + help="The file path to the chat template, " + "or the template in single-line form " + "for the specified model") + parser.add_argument("--response-role", + type=str, + default="assistant", + help="The role name to return if " + "`request.add_generation_prompt=true`.") + parser.add_argument("--ssl-keyfile", + type=str, + default=None, + help="The file path to the SSL key file") + parser.add_argument("--ssl-certfile", + type=str, + default=None, + help="The file path to the SSL cert file") + parser.add_argument( + "--root-path", + type=str, + default=None, + help="FastAPI root_path when app is behind a path based routing proxy") + parser.add_argument( + "--middleware", + type=str, + action="append", + default=[], + help="Additional ASGI middleware to apply to the app. " + "We accept multiple --middleware arguments. " + "The value should be an import path. " + "If a function is provided, vLLM will add it to the server using @app.middleware('http'). " + "If a class is provided, vLLM will add it to the server using app.add_middleware(). 
" + ) + + parser = AsyncEngineArgs.add_cli_args(parser) + return parser.parse_args() + + +# Add prometheus asgi middleware to route /metrics requests +metrics_app = make_asgi_app() +app.mount("/metrics", metrics_app) + + +@app.exception_handler(RequestValidationError) +async def validation_exception_handler(_, exc): + err = openai_serving_chat.create_error_response(message=str(exc)) + return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST) + + +@app.get("/health") +async def health() -> Response: + """Health check.""" + return Response(status_code=200) + + +@app.get("/v1/models") +async def show_available_models(): + models = await openai_serving_chat.show_available_models() + return JSONResponse(content=models.model_dump()) + + +@app.post("/v1/chat/completions") +async def create_chat_completion(request: ChatCompletionRequest, + raw_request: Request): + generator = await openai_serving_chat.create_chat_completion( + request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + if request.stream: + return StreamingResponse(content=generator, + media_type="text/event-stream") + else: + return JSONResponse(content=generator.model_dump()) + + +@app.post("/v1/completions") +async def create_completion(request: CompletionRequest, raw_request: Request): + generator = await openai_serving_completion.create_completion( + request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + if request.stream: + return StreamingResponse(content=generator, + media_type="text/event-stream") + else: + return JSONResponse(content=generator.model_dump()) + + +if __name__ == "__main__": + args = parse_args() + + app.add_middleware( + CORSMiddleware, + allow_origins=args.allowed_origins, + allow_credentials=args.allow_credentials, + allow_methods=args.allowed_methods, + allow_headers=args.allowed_headers, + ) + + if token := os.environ.get("VLLM_API_KEY") or args.api_key: + + @app.middleware("http") + async def authentication(request: Request, call_next): + if not request.url.path.startswith("/v1"): + return await call_next(request) + if request.headers.get("Authorization") != "Bearer " + token: + return JSONResponse(content={"error": "Unauthorized"}, + status_code=401) + return await call_next(request) + + for middleware in args.middleware: + module_path, object_name = middleware.rsplit(".", 1) + imported = getattr(importlib.import_module(module_path), object_name) + if inspect.isclass(imported): + app.add_middleware(imported) + elif inspect.iscoroutinefunction(imported): + app.middleware("http")(imported) + else: + raise ValueError( + f"Invalid middleware {middleware}. Must be a function or a class." 
+            )
+
+    logger.info(f"args: {args}")
+
+    if args.served_model_name is not None:
+        served_model = args.served_model_name
+    else:
+        served_model = args.model
+
+    engine_args = AsyncEngineArgs.from_cli_args(args)
+    engine = AsyncLLMEngine.from_engine_args(engine_args)
+    openai_serving_chat = OpenAIServingChat(engine, served_model,
+                                            args.response_role,
+                                            args.lora_modules,
+                                            args.chat_template)
+    openai_serving_completion = OpenAIServingCompletion(
+        engine, served_model, args.lora_modules)
+
+    app.root_path = args.root_path
+    uvicorn.run(app,
+                host=args.host,
+                port=args.port,
+                log_level="info",
+                timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
+                ssl_keyfile=args.ssl_keyfile,
+                ssl_certfile=args.ssl_certfile)
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
new file mode 100644
index 0000000..26499b8
--- /dev/null
+++ b/vllm/entrypoints/openai/protocol.py
@@ -0,0 +1,323 @@
+# Adapted from
+# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
+import time
+from typing import Dict, List, Literal, Optional, Union
+
+from pydantic import BaseModel, Field, model_validator
+
+from vllm.utils import random_uuid
+from vllm.sampling_params import SamplingParams
+
+import torch
+
+
+class ErrorResponse(BaseModel):
+    object: str = "error"
+    message: str
+    type: str
+    param: Optional[str] = None
+    code: int
+
+
+class ModelPermission(BaseModel):
+    id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
+    object: str = "model_permission"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    allow_create_engine: bool = False
+    allow_sampling: bool = True
+    allow_logprobs: bool = True
+    allow_search_indices: bool = False
+    allow_view: bool = True
+    allow_fine_tuning: bool = False
+    organization: str = "*"
+    group: Optional[str] = None
+    is_blocking: bool = False
+
+
+class ModelCard(BaseModel):
+    id: str
+    object: str = "model"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    owned_by: str = "vllm"
+    root: Optional[str] = None
+    parent: Optional[str] = None
+    permission: List[ModelPermission] = Field(default_factory=list)
+
+
+class ModelList(BaseModel):
+    object: str = "list"
+    data: List[ModelCard] = Field(default_factory=list)
+
+
+class UsageInfo(BaseModel):
+    prompt_tokens: int = 0
+    total_tokens: int = 0
+    completion_tokens: Optional[int] = 0
+
+
+class ChatCompletionRequest(BaseModel):
+    model: str
+    messages: List[Dict[str, str]]
+    temperature: Optional[float] = 0.7
+    top_p: Optional[float] = 1.0
+    n: Optional[int] = 1
+    max_tokens: Optional[int] = None
+    seed: Optional[int] = None
+    stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
+    stream: Optional[bool] = False
+    logprobs: Optional[bool] = False
+    top_logprobs: Optional[int] = None
+    presence_penalty: Optional[float] = 0.0
+    frequency_penalty: Optional[float] = 0.0
+    logit_bias: Optional[Dict[str, float]] = None
+    user: Optional[str] = None
+    # Additional parameters supported by vLLM
+    best_of: Optional[int] = None
+    top_k: Optional[int] = -1
+    ignore_eos: Optional[bool] = False
+    use_beam_search: Optional[bool] = False
+    early_stopping: Optional[bool] = False
+    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
+    skip_special_tokens: Optional[bool] = True
+    spaces_between_special_tokens: Optional[bool] = True
+    add_generation_prompt: Optional[bool] = True
+    echo: Optional[bool] = False
+    repetition_penalty: Optional[float] = 1.0
+    min_p: Optional[float] = 0.0
+    include_stop_str_in_output: Optional[bool] = False
+    length_penalty: Optional[float] = 1.0
+    guided_json: Optional[Union[str, dict, BaseModel]] = None
+    guided_regex: Optional[str] = None
+    guided_choice: Optional[List[str]] = None
+
+    def to_sampling_params(self) -> SamplingParams:
+        if self.logprobs and not self.top_logprobs:
+            raise ValueError(
+                "top_logprobs must be set when logprobs is enabled.")
+
+        logits_processors = None
+        if self.logit_bias:
+
+            def logit_bias_logits_processor(
+                    token_ids: List[int],
+                    logits: torch.Tensor) -> torch.Tensor:
+                for token_id, bias in self.logit_bias.items():
+                    # Clamp the bias between -100 and 100 per OpenAI API spec
+                    bias = min(100, max(-100, bias))
+                    logits[int(token_id)] += bias
+                return logits
+
+            logits_processors = [logit_bias_logits_processor]
+
+        return SamplingParams(
+            n=self.n,
+            presence_penalty=self.presence_penalty,
+            frequency_penalty=self.frequency_penalty,
+            repetition_penalty=self.repetition_penalty,
+            temperature=self.temperature,
+            top_p=self.top_p,
+            min_p=self.min_p,
+            seed=self.seed,
+            stop=self.stop,
+            stop_token_ids=self.stop_token_ids,
+            max_tokens=self.max_tokens,
+            logprobs=self.top_logprobs if self.logprobs else None,
+            prompt_logprobs=self.top_logprobs if self.echo else None,
+            best_of=self.best_of,
+            top_k=self.top_k,
+            ignore_eos=self.ignore_eos,
+            use_beam_search=self.use_beam_search,
+            early_stopping=self.early_stopping,
+            skip_special_tokens=self.skip_special_tokens,
+            spaces_between_special_tokens=self.spaces_between_special_tokens,
+            include_stop_str_in_output=self.include_stop_str_in_output,
+            length_penalty=self.length_penalty,
+            logits_processors=logits_processors,
+        )
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_guided_decoding_count(cls, data):
+        guide_count = sum([
+            "guided_json" in data and data["guided_json"] is not None,
+            "guided_regex" in data and data["guided_regex"] is not None,
+            "guided_choice" in data and data["guided_choice"] is not None
+        ])
+        if guide_count > 1:
+            raise ValueError(
+                "You can only use one kind of guided decoding "
+                "('guided_json', 'guided_regex' or 'guided_choice').")
+        return data
+
+
+class CompletionRequest(BaseModel):
+    model: str
+    # a string, array of strings, array of tokens, or array of token arrays
+    prompt: Union[List[int], List[List[int]], str, List[str]]
+    suffix: Optional[str] = None
+    max_tokens: Optional[int] = 16
+    temperature: Optional[float] = 1.0
+    top_p: Optional[float] = 1.0
+    n: Optional[int] = 1
+    stream: Optional[bool] = False
+    logprobs: Optional[int] = None
+    echo: Optional[bool] = False
+    stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
+    seed: Optional[int] = None
+    presence_penalty: Optional[float] = 0.0
+    frequency_penalty: Optional[float] = 0.0
+    best_of: Optional[int] = None
+    logit_bias: Optional[Dict[str, float]] = None
+    user: Optional[str] = None
+    # Additional parameters supported by vLLM
+    top_k: Optional[int] = -1
+    ignore_eos: Optional[bool] = False
+    use_beam_search: Optional[bool] = False
+    early_stopping: Optional[bool] = False
+    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
+    skip_special_tokens: Optional[bool] = True
+    spaces_between_special_tokens: Optional[bool] = True
+    repetition_penalty: Optional[float] = 1.0
+    min_p: Optional[float] = 0.0
+    include_stop_str_in_output: Optional[bool] = False
+    length_penalty: Optional[float] = 1.0
+    guided_json: Optional[Union[str, dict, BaseModel]] = None
+    guided_regex: Optional[str] = None
+    guided_choice: Optional[List[str]] = None
+
+    def to_sampling_params(self):
+        echo_without_generation = self.echo and self.max_tokens == 0
+
+        logits_processors = None
+        if self.logit_bias:
+
+            def logit_bias_logits_processor(
+                    token_ids: List[int],
+                    logits: torch.Tensor) -> torch.Tensor:
+                for token_id, bias in self.logit_bias.items():
+                    # Clamp the bias between -100 and 100 per OpenAI API spec
+                    bias = min(100, max(-100, bias))
+                    logits[int(token_id)] += bias
+                return logits
+
+            logits_processors = [logit_bias_logits_processor]
+
+        return SamplingParams(
+            n=self.n,
+            best_of=self.best_of,
+            presence_penalty=self.presence_penalty,
+            frequency_penalty=self.frequency_penalty,
+            repetition_penalty=self.repetition_penalty,
+            temperature=self.temperature,
+            top_p=self.top_p,
+            top_k=self.top_k,
+            min_p=self.min_p,
+            seed=self.seed,
+            stop=self.stop,
+            stop_token_ids=self.stop_token_ids,
+            ignore_eos=self.ignore_eos,
+            max_tokens=self.max_tokens if not echo_without_generation else 1,
+            logprobs=self.logprobs,
+            use_beam_search=self.use_beam_search,
+            early_stopping=self.early_stopping,
+            prompt_logprobs=self.logprobs if self.echo else None,
+            skip_special_tokens=self.skip_special_tokens,
+            spaces_between_special_tokens=(self.spaces_between_special_tokens),
+            include_stop_str_in_output=self.include_stop_str_in_output,
+            length_penalty=self.length_penalty,
+            logits_processors=logits_processors,
+        )
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_guided_decoding_count(cls, data):
+        guide_count = sum([
+            "guided_json" in data and data["guided_json"] is not None,
+            "guided_regex" in data and data["guided_regex"] is not None,
+            "guided_choice" in data and data["guided_choice"] is not None
+        ])
+        if guide_count > 1:
+            raise ValueError(
+                "You can only use one kind of guided decoding "
+                "('guided_json', 'guided_regex' or 'guided_choice').")
+        return data
+
+
+class LogProbs(BaseModel):
+    text_offset: List[int] = Field(default_factory=list)
+    token_logprobs: List[Optional[float]] = Field(default_factory=list)
+    tokens: List[str] = Field(default_factory=list)
+    top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None
+
+
+class CompletionResponseChoice(BaseModel):
+    index: int
+    text: str
+    logprobs: Optional[LogProbs] = None
+    finish_reason: Optional[Literal["stop", "length"]] = None
+
+
+class CompletionResponse(BaseModel):
+    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+    object: str = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[CompletionResponseChoice]
+    usage: UsageInfo
+
+
+class CompletionResponseStreamChoice(BaseModel):
+    index: int
+    text: str
+    logprobs: Optional[LogProbs] = None
+    finish_reason: Optional[Literal["stop", "length"]] = None
+
+
+class CompletionStreamResponse(BaseModel):
+    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+    object: str = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[CompletionResponseStreamChoice]
+    usage: Optional[UsageInfo] = Field(default=None)
+
+
+class ChatMessage(BaseModel):
+    role: str
+    content: str
+
+
+class ChatCompletionResponseChoice(BaseModel):
+    index: int
+    message: ChatMessage
+    logprobs: Optional[LogProbs] = None
+    finish_reason: Optional[Literal["stop", "length"]] = None
+
+
+class ChatCompletionResponse(BaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+    object: str = "chat.completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[ChatCompletionResponseChoice]
+    usage: UsageInfo
+
+
+class DeltaMessage(BaseModel):
+    role: Optional[str] = None
+    content: Optional[str] = None
+
+
+class ChatCompletionResponseStreamChoice(BaseModel):
+    index: int
+    delta: DeltaMessage
+    logprobs: Optional[LogProbs] = None
+    finish_reason: Optional[Literal["stop", "length"]] = None
+
+
+class ChatCompletionStreamResponse(BaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+    object: str = "chat.completion.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[ChatCompletionResponseStreamChoice]
+    usage: Optional[UsageInfo] = Field(default=None)
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
new file mode 100644
index 0000000..f4ad0aa
--- /dev/null
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -0,0 +1,307 @@
+import time
+import codecs
+from fastapi import Request
+from typing import AsyncGenerator, AsyncIterator, Optional, List, Union
+from vllm.logger import init_logger
+from vllm.utils import random_uuid
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest, ChatCompletionResponse,
+    ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
+    ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse,
+    UsageInfo)
+from vllm.outputs import RequestOutput
+from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA
+from vllm.model_executor.guided_decoding import get_guided_decoding_logits_processor
+
+logger = init_logger(__name__)
+
+
+class OpenAIServingChat(OpenAIServing):
+
+    def __init__(self,
+                 engine: AsyncLLMEngine,
+                 served_model: str,
+                 response_role: str,
+                 lora_modules: Optional[List[LoRA]] = None,
+                 chat_template=None):
+        super().__init__(engine=engine,
+                         served_model=served_model,
+                         lora_modules=lora_modules)
+        self.response_role = response_role
+        self._load_chat_template(chat_template)
+
+    async def create_chat_completion(
+        self, request: ChatCompletionRequest, raw_request: Request
+    ) -> Union[ErrorResponse, AsyncGenerator[str, None],
+               ChatCompletionResponse]:
+        """Completion API similar to OpenAI's API.
+
+        See https://platform.openai.com/docs/api-reference/chat/create
+        for the API specification. This API mimics the OpenAI
+        ChatCompletion API.
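+
+        A minimal request body (illustrative values only) looks like:
+
+            {"model": "my-model",
+             "messages": [{"role": "user", "content": "Hello!"}],
+             "stream": false}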
+
+        NOTE: Currently we do not support the following feature:
+            - function_call (Users should implement this by themselves)
+        """
+        error_check_ret = await self._check_model(request)
+        if error_check_ret is not None:
+            return error_check_ret
+
+        try:
+            prompt = self.tokenizer.apply_chat_template(
+                conversation=request.messages,
+                tokenize=False,
+                add_generation_prompt=request.add_generation_prompt)
+        except Exception as e:
+            logger.error(
+                f"Error in applying chat template from request: {str(e)}")
+            return self.create_error_response(str(e))
+
+        request_id = f"cmpl-{random_uuid()}"
+        try:
+            token_ids = self._validate_prompt_and_tokenize(request,
+                                                           prompt=prompt)
+            sampling_params = request.to_sampling_params()
+            lora_request = self._maybe_get_lora(request)
+            guided_decode_logits_processor = (
+                await get_guided_decoding_logits_processor(
+                    request, self.engine.get_tokenizer()))
+            if guided_decode_logits_processor:
+                if sampling_params.logits_processors is None:
+                    sampling_params.logits_processors = []
+                sampling_params.logits_processors.append(
+                    guided_decode_logits_processor)
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        result_generator = self.engine.generate(prompt, sampling_params,
+                                                request_id, token_ids,
+                                                lora_request)
+        # Streaming response
+        if request.stream:
+            return self.chat_completion_stream_generator(
+                request, result_generator, request_id)
+        else:
+            return await self.chat_completion_full_generator(
+                request, raw_request, result_generator, request_id)
+
+    def get_chat_request_role(self, request: ChatCompletionRequest) -> str:
+        if request.add_generation_prompt:
+            return self.response_role
+        else:
+            return request.messages[-1]["role"]
+
+    async def chat_completion_stream_generator(
+            self, request: ChatCompletionRequest,
+            result_generator: AsyncIterator[RequestOutput], request_id: str
+    ) -> Union[ErrorResponse, AsyncGenerator[str, None]]:
+
+        model_name = request.model
+        created_time = int(time.time())
+        chunk_object_type = "chat.completion.chunk"
+
+        # Send first response for each request.n (index) with the role
+        role = self.get_chat_request_role(request)
+        for i in range(request.n):
+            choice_data = ChatCompletionResponseStreamChoice(
+                index=i,
+                delta=DeltaMessage(role=role),
+                logprobs=None,
+                finish_reason=None)
+            chunk = ChatCompletionStreamResponse(id=request_id,
+                                                 object=chunk_object_type,
+                                                 created=created_time,
+                                                 choices=[choice_data],
+                                                 model=model_name)
+            data = chunk.model_dump_json(exclude_unset=True)
+            yield f"data: {data}\n\n"
+
+        # Send response to echo the input portion of the last message
+        if request.echo:
+            last_msg_content = ""
+            if request.messages and isinstance(
+                    request.messages, list) and request.messages[-1].get(
+                        "content") and request.messages[-1].get(
+                            "role") == role:
+                last_msg_content = request.messages[-1]["content"]
+
+            if last_msg_content:
+                for i in range(request.n):
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=i,
+                        delta=DeltaMessage(content=last_msg_content),
+                        finish_reason=None)
+                    chunk = ChatCompletionStreamResponse(
+                        id=request_id,
+                        object=chunk_object_type,
+                        created=created_time,
+                        choices=[choice_data],
+                        logprobs=None,
+                        model=model_name)
+                    data = chunk.model_dump_json(exclude_unset=True)
+                    yield f"data: {data}\n\n"
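+        # Each yielded string is one server-sent event, e.g. (abbreviated):
+        #   data: {"id": "cmpl-...", "object": "chat.completion.chunk", ...}
+        # The stream is terminated by a literal:
+        #   data: [DONE]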
+        # Send response for each token for each request.n (index)
+        previous_texts = [""] * request.n
+        previous_num_tokens = [0] * request.n
+        finish_reason_sent = [False] * request.n
+        async for res in result_generator:
+            res: RequestOutput
+            for output in res.outputs:
+                i = output.index
+
+                if finish_reason_sent[i]:
+                    continue
+
+                delta_token_ids = output.token_ids[previous_num_tokens[i]:]
+                top_logprobs = output.logprobs[
+                    previous_num_tokens[i]:] if output.logprobs else None
+
+                if request.logprobs:
+                    logprobs = self._create_logprobs(
+                        token_ids=delta_token_ids,
+                        top_logprobs=top_logprobs,
+                        num_output_top_logprobs=request.logprobs,
+                        initial_text_offset=len(previous_texts[i]),
+                    )
+                else:
+                    logprobs = None
+
+                delta_text = output.text[len(previous_texts[i]):]
+                previous_texts[i] = output.text
+                previous_num_tokens[i] = len(output.token_ids)
+                if output.finish_reason is None:
+                    # Send token-by-token response for each request.n
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=i,
+                        delta=DeltaMessage(content=delta_text),
+                        logprobs=logprobs,
+                        finish_reason=None)
+                    chunk = ChatCompletionStreamResponse(
+                        id=request_id,
+                        object=chunk_object_type,
+                        created=created_time,
+                        choices=[choice_data],
+                        model=model_name)
+                    data = chunk.model_dump_json(exclude_unset=True)
+                    yield f"data: {data}\n\n"
+                else:
+                    # Send the finish response for each request.n only once
+                    prompt_tokens = len(res.prompt_token_ids)
+                    final_usage = UsageInfo(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=previous_num_tokens[i],
+                        total_tokens=prompt_tokens + previous_num_tokens[i],
+                    )
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=i,
+                        delta=DeltaMessage(content=delta_text),
+                        logprobs=logprobs,
+                        finish_reason=output.finish_reason)
+                    chunk = ChatCompletionStreamResponse(
+                        id=request_id,
+                        object=chunk_object_type,
+                        created=created_time,
+                        choices=[choice_data],
+                        model=model_name)
+                    if final_usage is not None:
+                        chunk.usage = final_usage
+                    data = chunk.model_dump_json(exclude_unset=True,
+                                                 exclude_none=True)
+                    yield f"data: {data}\n\n"
+                    finish_reason_sent[i] = True
+        # Send the final done message after all response.n are finished
+        yield "data: [DONE]\n\n"
+
+    async def chat_completion_full_generator(
+            self, request: ChatCompletionRequest, raw_request: Request,
+            result_generator: AsyncIterator[RequestOutput],
+            request_id: str) -> Union[ErrorResponse, ChatCompletionResponse]:
+
+        model_name = request.model
+        created_time = int(time.time())
+        final_res: Optional[RequestOutput] = None
+
+        async for res in result_generator:
+            if await raw_request.is_disconnected():
+                # Abort the request if the client disconnects.
+                await self.engine.abort(request_id)
+                return self.create_error_response("Client disconnected")
+            final_res = res
+        assert final_res is not None
+
+        choices = []
+
+        role = self.get_chat_request_role(request)
+        for output in final_res.outputs:
+            token_ids = output.token_ids
+            top_logprobs = output.logprobs
+
+            if request.logprobs:
+                logprobs = self._create_logprobs(
+                    token_ids=token_ids,
+                    top_logprobs=top_logprobs,
+                    num_output_top_logprobs=request.logprobs,
+                )
+            else:
+                logprobs = None
+
+            choice_data = ChatCompletionResponseChoice(
+                index=output.index,
+                message=ChatMessage(role=role, content=output.text),
+                logprobs=logprobs,
+                finish_reason=output.finish_reason,
+            )
+            choices.append(choice_data)
+
+        if request.echo:
+            last_msg_content = ""
+            if request.messages and isinstance(
+                    request.messages, list) and request.messages[-1].get(
+                        "content") and request.messages[-1].get(
+                            "role") == role:
+                last_msg_content = request.messages[-1]["content"]
+
+            for choice in choices:
+                full_message = last_msg_content + choice.message.content
+                choice.message.content = full_message
+
+        num_prompt_tokens = len(final_res.prompt_token_ids)
+        num_generated_tokens = sum(
+            len(output.token_ids) for output in final_res.outputs)
+        usage = UsageInfo(
+            prompt_tokens=num_prompt_tokens,
+            completion_tokens=num_generated_tokens,
+            total_tokens=num_prompt_tokens + num_generated_tokens,
+        )
+        response = ChatCompletionResponse(
+            id=request_id,
+            created=created_time,
+            model=model_name,
+            choices=choices,
+            usage=usage,
+        )
+
+        return response
+
+    def _load_chat_template(self, chat_template):
+        if chat_template is not None:
+            try:
+                with open(chat_template, "r") as f:
+                    self.tokenizer.chat_template = f.read()
+            except OSError:
+                # If opening the file fails, treat the argument as the
+                # template itself and decode it so our escapes are
+                # interpreted correctly.
+                self.tokenizer.chat_template = codecs.decode(
+                    chat_template, "unicode_escape")
+
+            logger.info(
+                f"Using supplied chat template:\n{self.tokenizer.chat_template}"
+            )
+        elif self.tokenizer.chat_template is not None:
+            logger.info(
+                f"Using default chat template:\n{self.tokenizer.chat_template}"
+            )
+        else:
+            logger.warning(
+                "No chat template provided. Chat API will not work.")
Chat API will not work.") diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py new file mode 100644 index 0000000..99a1019 --- /dev/null +++ b/vllm/entrypoints/openai/serving_completion.py @@ -0,0 +1,361 @@ +import asyncio +import time +from fastapi import Request +from typing import AsyncGenerator, AsyncIterator, Callable, List, Optional, Dict, Tuple +from vllm.logger import init_logger +from vllm.utils import random_uuid +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.entrypoints.openai.protocol import ( + CompletionRequest, + CompletionResponse, + CompletionResponseChoice, + CompletionResponseStreamChoice, + CompletionStreamResponse, + LogProbs, + UsageInfo, +) +from vllm.outputs import RequestOutput +from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA +from vllm.model_executor.guided_decoding import get_guided_decoding_logits_processor + +logger = init_logger(__name__) + +TypeTokenIDs = List[int] +TypeTopLogProbs = List[Optional[Dict[int, float]]] +TypeCreateLogProbsFn = Callable[ + [TypeTokenIDs, TypeTopLogProbs, Optional[int], int], LogProbs] + + +async def completion_stream_generator( + request: CompletionRequest, + raw_request: Request, + on_abort, + result_generator: AsyncIterator[Tuple[int, RequestOutput]], + create_logprobs_fn: TypeCreateLogProbsFn, + request_id: str, + created_time: int, + model_name: str, + num_prompts: int, +) -> AsyncGenerator[str, None]: + previous_texts = [""] * request.n * num_prompts + previous_num_tokens = [0] * request.n * num_prompts + has_echoed = [False] * request.n * num_prompts + + async for prompt_idx, res in result_generator: + + # Abort the request if the client disconnects. + if await raw_request.is_disconnected(): + await on_abort(f"{request_id}-{prompt_idx}") + raise StopAsyncIteration() + + for output in res.outputs: + i = output.index + prompt_idx * request.n + # TODO(simon): optimize the performance by avoiding full text O(n^2) sending. 
+
+            if request.echo and request.max_tokens == 0:
+                # only return the prompt
+                delta_text = res.prompt
+                delta_token_ids = res.prompt_token_ids
+                top_logprobs = res.prompt_logprobs
+                has_echoed[i] = True
+            elif request.echo and request.max_tokens > 0 and not has_echoed[i]:
+                # echo the prompt and first token
+                delta_text = res.prompt + output.text
+                delta_token_ids = res.prompt_token_ids + output.token_ids
+                top_logprobs = res.prompt_logprobs + (output.logprobs or [])
+                has_echoed[i] = True
+            else:
+                # return just the delta
+                delta_text = output.text[len(previous_texts[i]):]
+                delta_token_ids = output.token_ids[previous_num_tokens[i]:]
+                top_logprobs = output.logprobs[
+                    previous_num_tokens[i]:] if output.logprobs else None
+
+            if request.logprobs is not None:
+                assert top_logprobs is not None, "top_logprobs must be provided when logprobs is requested"
+                logprobs = create_logprobs_fn(
+                    token_ids=delta_token_ids,
+                    top_logprobs=top_logprobs,
+                    num_output_top_logprobs=request.logprobs,
+                    initial_text_offset=len(previous_texts[i]),
+                )
+            else:
+                logprobs = None
+
+            previous_texts[i] = output.text
+            previous_num_tokens[i] = len(output.token_ids)
+            finish_reason = output.finish_reason
+            response_json = CompletionStreamResponse(
+                id=request_id,
+                created=created_time,
+                model=model_name,
+                choices=[
+                    CompletionResponseStreamChoice(
+                        index=i,
+                        text=delta_text,
+                        logprobs=logprobs,
+                        finish_reason=finish_reason,
+                    )
+                ]).model_dump_json()
+            yield f"data: {response_json}\n\n"
+
+            if output.finish_reason is not None:  # return final usage
+                logprobs = LogProbs() if request.logprobs is not None else None
+                prompt_tokens = len(res.prompt_token_ids)
+                completion_tokens = len(output.token_ids)
+                final_usage = UsageInfo(
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=prompt_tokens + completion_tokens,
+                )
+                response_json = CompletionStreamResponse(
+                    id=request_id,
+                    created=created_time,
+                    model=model_name,
+                    choices=[
+                        CompletionResponseStreamChoice(
+                            index=i,
+                            text="",
+                            logprobs=logprobs,
+                            finish_reason=output.finish_reason,
+                        )
+                    ],
+                    usage=final_usage,
+                ).model_dump_json()
+                yield f"data: {response_json}\n\n"
+
+    yield "data: [DONE]\n\n"
+
+
+def parse_prompt_format(prompt) -> Tuple[bool, list]:
+    # get the prompt, openai supports the following:
+    # "a string, array of strings, array of tokens, or array of token arrays"
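+    # Illustrative mappings (token IDs assumed):
+    #   "Hi"          -> (False, ["Hi"])
+    #   ["Hi", "Yo"]  -> (False, ["Hi", "Yo"])
+    #   [1, 2, 3]     -> (True, [[1, 2, 3]])
+    #   [[1, 2], [3]] -> (True, [[1, 2], [3]])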
+    prompt_is_tokens = False
+    prompts = [prompt]  # case 1: a string
+    if isinstance(prompt, list):
+        if len(prompt) == 0:
+            raise ValueError("please provide at least one prompt")
+        elif isinstance(prompt[0], str):
+            prompt_is_tokens = False
+            prompts = prompt  # case 2: array of strings
+        elif isinstance(prompt[0], int):
+            prompt_is_tokens = True
+            prompts = [prompt]  # case 3: array of tokens
+        elif isinstance(prompt[0], list) and isinstance(prompt[0][0], int):
+            prompt_is_tokens = True
+            prompts = prompt  # case 4: array of token arrays
+        else:
+            raise ValueError(
+                "prompt must be a string, array of strings, array of tokens, or array of token arrays"
+            )
+    return prompt_is_tokens, prompts
+
+
+def request_output_to_completion_response(
+    final_res_batch: List[RequestOutput],
+    request: CompletionRequest,
+    create_logprobs_fn: TypeCreateLogProbsFn,
+    request_id: str,
+    created_time: int,
+    model_name: str,
+) -> CompletionResponse:
+    choices = []
+    num_prompt_tokens = 0
+    num_generated_tokens = 0
+    for final_res in final_res_batch:
+        assert final_res is not None
+        prompt_token_ids = final_res.prompt_token_ids
+        prompt_logprobs = final_res.prompt_logprobs
+        prompt_text = final_res.prompt
+
+        for output in final_res.outputs:
+            if request.echo and request.max_tokens == 0:
+                token_ids = prompt_token_ids
+                top_logprobs = prompt_logprobs
+                output_text = prompt_text
+            elif request.echo and request.max_tokens > 0:
+                token_ids = prompt_token_ids + output.token_ids
+                top_logprobs = prompt_logprobs + output.logprobs
+                output_text = prompt_text + output.text
+            else:
+                token_ids = output.token_ids
+                top_logprobs = output.logprobs
+                output_text = output.text
+
+            if request.logprobs is not None:
+                logprobs = create_logprobs_fn(
+                    token_ids=token_ids,
+                    top_logprobs=top_logprobs,
+                    num_output_top_logprobs=request.logprobs,
+                )
+            else:
+                logprobs = None
+
+            choice_data = CompletionResponseChoice(
+                index=len(choices),
+                text=output_text,
+                logprobs=logprobs,
+                finish_reason=output.finish_reason,
+            )
+            choices.append(choice_data)
+
+        num_prompt_tokens += len(prompt_token_ids)
+        num_generated_tokens += sum(
+            len(output.token_ids) for output in final_res.outputs)
+
+    usage = UsageInfo(
+        prompt_tokens=num_prompt_tokens,
+        completion_tokens=num_generated_tokens,
+        total_tokens=num_prompt_tokens + num_generated_tokens,
+    )
+
+    return CompletionResponse(
+        id=request_id,
+        created=created_time,
+        model=model_name,
+        choices=choices,
+        usage=usage,
+    )
+
+
+def merge_async_iterators(*iterators):
+    """Merge multiple asynchronous iterators into a single iterator.
+
+    This method handles the case where some iterators finish before others.
+    When it yields, it yields a tuple (i, item) where i is the index of the
+    iterator that yielded the item.
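+
+    Illustrative usage (hypothetical async generators gen_a and gen_b):
+
+        async for i, item in merge_async_iterators(gen_a(), gen_b()):
+            ...  # item came from the i-th iterator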
+ """ + queue = asyncio.Queue() + + finished = [False] * len(iterators) + + async def producer(i, iterator): + async for item in iterator: + await queue.put((i, item)) + finished[i] = True + + _tasks = [ + asyncio.create_task(producer(i, iterator)) + for i, iterator in enumerate(iterators) + ] + + async def consumer(): + while not all(finished) or not queue.empty(): + item = await queue.get() + yield item + await asyncio.gather(*_tasks) + + return consumer() + + +class OpenAIServingCompletion(OpenAIServing): + + def __init__(self, + engine: AsyncLLMEngine, + served_model: str, + lora_modules: Optional[List[LoRA]] = None): + super().__init__(engine=engine, + served_model=served_model, + lora_modules=lora_modules) + + async def create_completion(self, request: CompletionRequest, + raw_request: Request): + """Completion API similar to OpenAI's API. + + See https://platform.openai.com/docs/api-reference/completions/create + for the API specification. This API mimics the OpenAI Completion API. + + NOTE: Currently we do not support the following feature: + - suffix (the language models we currently support do not support + suffix) + """ + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + return error_check_ret + + # Return error for unsupported features. + if request.suffix is not None: + return self.create_error_response( + "suffix is not currently supported") + + model_name = request.model + request_id = f"cmpl-{random_uuid()}" + created_time = int(time.monotonic()) + + # Schedule the request and get the result generator. + generators = [] + try: + sampling_params = request.to_sampling_params() + lora_request = self._maybe_get_lora(request) + guided_decode_logit_processor = ( + await get_guided_decoding_logits_processor( + request, self.engine.get_tokenizer())) + if guided_decode_logit_processor is not None: + if sampling_params.logits_processors is None: + sampling_params.logits_processors = [] + sampling_params.logits_processors.append( + guided_decode_logit_processor) + prompt_is_tokens, prompts = parse_prompt_format(request.prompt) + + for i, prompt in enumerate(prompts): + if prompt_is_tokens: + input_ids = self._validate_prompt_and_tokenize( + request, prompt_ids=prompt) + else: + input_ids = self._validate_prompt_and_tokenize( + request, prompt=prompt) + + generators.append( + self.engine.generate(prompt, + sampling_params, + f"{request_id}-{i}", + prompt_token_ids=input_ids, + lora_request=lora_request)) + except ValueError as e: + return self.create_error_response(str(e)) + + result_generator: AsyncIterator[Tuple[ + int, RequestOutput]] = merge_async_iterators(*generators) + + # Similar to the OpenAI API, when n != best_of, we do not stream the + # results. In addition, we do not stream the results when use beam search. + stream = (request.stream + and (request.best_of is None or request.n == request.best_of) + and not request.use_beam_search) + + # Streaming response + if stream: + return completion_stream_generator(request, + raw_request, + self.engine.abort, + result_generator, + self._create_logprobs, + request_id, + created_time, + model_name, + num_prompts=len(prompts)) + + # Non-streaming response + final_res_batch: RequestOutput = [None] * len(prompts) + async for i, res in result_generator: + if await raw_request.is_disconnected(): + # Abort the request if the client disconnects. 
+ await self.engine.abort(f"{request_id}-{i}") + return self.create_error_response("Client disconnected") + final_res_batch[i] = res + response = request_output_to_completion_response( + final_res_batch, request, self._create_logprobs, request_id, + created_time, model_name) + + # When user requests streaming but we don't stream, we still need to + # return a streaming response with a single event. + if request.stream: + response_json = response.model_dump_json() + + async def fake_stream_generator() -> AsyncGenerator[str, None]: + yield f"data: {response_json}\n\n" + yield "data: [DONE]\n\n" + + return fake_stream_generator() + + return response diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py new file mode 100644 index 0000000..0994547 --- /dev/null +++ b/vllm/entrypoints/openai/serving_engine.py @@ -0,0 +1,172 @@ +import asyncio +from dataclasses import dataclass +from http import HTTPStatus +from typing import Dict, List, Optional, Union +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.entrypoints.openai.protocol import (CompletionRequest, + ChatCompletionRequest, + ErrorResponse, LogProbs, + ModelCard, ModelList, + ModelPermission) +from vllm.lora.request import LoRARequest + +logger = init_logger(__name__) + + +@dataclass +class LoRA: + name: str + local_path: str + + +class OpenAIServing: + + def __init__(self, + engine: AsyncLLMEngine, + served_model: str, + lora_modules=Optional[List[LoRA]]): + self.engine = engine + self.served_model = served_model + if lora_modules is None: + self.lora_requests = [] + else: + self.lora_requests = [ + LoRARequest( + lora_name=lora.name, + lora_int_id=i, + lora_local_path=lora.local_path, + ) for i, lora in enumerate(lora_modules, start=1) + ] + + self.max_model_len = 0 + self.tokenizer = None + + try: + event_loop = asyncio.get_running_loop() + except RuntimeError: + event_loop = None + + if event_loop is not None and event_loop.is_running( + ): # If the current is instanced by Ray Serve, there is already a running event loop + event_loop.create_task(self._post_init()) + else: # When using single vLLM without engine_use_ray + asyncio.run(self._post_init()) + + async def _post_init(self): + engine_model_config = await self.engine.get_model_config() + self.max_model_len = engine_model_config.max_model_len + + # A separate tokenizer to map token IDs to strings. + self.tokenizer = get_tokenizer( + engine_model_config.tokenizer, + tokenizer_mode=engine_model_config.tokenizer_mode, + trust_remote_code=engine_model_config.trust_remote_code) + + async def show_available_models(self) -> ModelList: + """Show available models. 
+
+        self.max_model_len = 0
+        self.tokenizer = None
+
+        try:
+            event_loop = asyncio.get_running_loop()
+        except RuntimeError:
+            event_loop = None
+
+        if event_loop is not None and event_loop.is_running():
+            # If the current process is instantiated by Ray Serve, there is
+            # already a running event loop.
+            event_loop.create_task(self._post_init())
+        else:
+            # When using a single vLLM instance without engine_use_ray.
+            asyncio.run(self._post_init())
+
+    async def _post_init(self):
+        engine_model_config = await self.engine.get_model_config()
+        self.max_model_len = engine_model_config.max_model_len
+
+        # A separate tokenizer to map token IDs to strings.
+        self.tokenizer = get_tokenizer(
+            engine_model_config.tokenizer,
+            tokenizer_mode=engine_model_config.tokenizer_mode,
+            trust_remote_code=engine_model_config.trust_remote_code)
+
+    async def show_available_models(self) -> ModelList:
+        """Show available models. Right now we only have one model."""
+        model_cards = [
+            ModelCard(id=self.served_model,
+                      root=self.served_model,
+                      permission=[ModelPermission()])
+        ]
+        lora_cards = [
+            ModelCard(id=lora.lora_name,
+                      root=self.served_model,
+                      permission=[ModelPermission()])
+            for lora in self.lora_requests
+        ]
+        model_cards.extend(lora_cards)
+        return ModelList(data=model_cards)
+
+    def _create_logprobs(
+        self,
+        token_ids: List[int],
+        top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None,
+        num_output_top_logprobs: Optional[int] = None,
+        initial_text_offset: int = 0,
+    ) -> LogProbs:
+        """Create OpenAI-style logprobs."""
+        logprobs = LogProbs()
+        last_token_len = 0
+        if num_output_top_logprobs:
+            logprobs.top_logprobs = []
+        for i, token_id in enumerate(token_ids):
+            step_top_logprobs = top_logprobs[i]
+            if step_top_logprobs is not None:
+                token_logprob = step_top_logprobs[token_id]
+            else:
+                token_logprob = None
+            token = self.tokenizer.convert_ids_to_tokens(token_id)
+            logprobs.tokens.append(token)
+            logprobs.token_logprobs.append(token_logprob)
+            if len(logprobs.text_offset) == 0:
+                logprobs.text_offset.append(initial_text_offset)
+            else:
+                logprobs.text_offset.append(logprobs.text_offset[-1] +
+                                            last_token_len)
+            last_token_len = len(token)
+
+            if num_output_top_logprobs:
+                logprobs.top_logprobs.append({
+                    self.tokenizer.convert_ids_to_tokens(i): p
+                    for i, p in step_top_logprobs.items()
+                } if step_top_logprobs else None)
+        return logprobs
+
+    def create_error_response(
+            self,
+            message: str,
+            err_type: str = "BadRequestError",
+            status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse:
+        return ErrorResponse(message=message,
+                             type=err_type,
+                             code=status_code.value)
+
+    async def _check_model(self, request) -> Optional[ErrorResponse]:
+        if request.model == self.served_model:
+            return
+        if request.model in [lora.lora_name for lora in self.lora_requests]:
+            return
+        return self.create_error_response(
+            message=f"The model `{request.model}` does not exist.",
+            err_type="NotFoundError",
+            status_code=HTTPStatus.NOT_FOUND)
+
+    def _maybe_get_lora(self, request) -> Optional[LoRARequest]:
+        if request.model == self.served_model:
+            return
+        for lora in self.lora_requests:
+            if request.model == lora.lora_name:
+                return lora
+        # if _check_model has been called earlier, this will be unreachable
+        raise ValueError(f"The model `{request.model}` does not exist.")
+
+    def _validate_prompt_and_tokenize(
+            self,
+            request: Union[ChatCompletionRequest, CompletionRequest],
+            prompt: Optional[str] = None,
+            prompt_ids: Optional[List[int]] = None) -> List[int]:
+        if not (prompt or prompt_ids):
+            raise ValueError("Either prompt or prompt_ids should be provided.")
+        if (prompt and prompt_ids):
+            raise ValueError(
+                "Only one of prompt or prompt_ids should be provided.")
+
+        input_ids = prompt_ids if prompt_ids is not None else self.tokenizer(
+            prompt).input_ids
+        token_num = len(input_ids)
+
+        if request.max_tokens is None:
+            request.max_tokens = self.max_model_len - token_num
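+        # Worked example: with max_model_len = 4096 and a 1000-token prompt,
+        # max_tokens defaults to 3096; an explicit max_tokens of 3500 would
+        # fail the check below (1000 + 3500 > 4096).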
" + f"Please reduce the length of the messages or completion.", ) + else: + return input_ids diff --git a/vllm/logger.py b/vllm/logger.py new file mode 100644 index 0000000..d25fcef --- /dev/null +++ b/vllm/logger.py @@ -0,0 +1,61 @@ +# Adapted from +# https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py +"""Logging configuration for vLLM.""" +import logging +import sys +import os + +VLLM_CONFIGURE_LOGGING = int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")) + +_FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s" +_DATE_FORMAT = "%m-%d %H:%M:%S" + + +class NewLineFormatter(logging.Formatter): + """Adds logging prefix to newlines to align multi-line messages.""" + + def __init__(self, fmt, datefmt=None): + logging.Formatter.__init__(self, fmt, datefmt) + + def format(self, record): + msg = logging.Formatter.format(self, record) + if record.message != "": + parts = msg.split(record.message) + msg = msg.replace("\n", "\r\n" + parts[0]) + return msg + + +_root_logger = logging.getLogger("vllm") +_default_handler = None + + +def _setup_logger(): + _root_logger.setLevel(logging.DEBUG) + global _default_handler + if _default_handler is None: + _default_handler = logging.StreamHandler(sys.stdout) + _default_handler.flush = sys.stdout.flush # type: ignore + _default_handler.setLevel(logging.INFO) + _root_logger.addHandler(_default_handler) + fmt = NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT) + _default_handler.setFormatter(fmt) + # Setting this will avoid the message + # being propagated to the parent logger. + _root_logger.propagate = False + + +# The logger is initialized when the module is imported. +# This is thread-safe as the module is only imported once, +# guaranteed by the Python GIL. +if VLLM_CONFIGURE_LOGGING: + _setup_logger() + + +def init_logger(name: str): + # Use the same settings as above for root logger + logger = logging.getLogger(name) + logger.setLevel(os.getenv("LOG_LEVEL", "DEBUG")) + if VLLM_CONFIGURE_LOGGING: + logger.addHandler(_default_handler) + logger.propagate = False + return logger diff --git a/vllm/lora/__init__.py b/vllm/lora/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vllm/lora/__pycache__/__init__.cpython-310.pyc b/vllm/lora/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f9bedbc423ef89009773d114a83df14d8d02ca4 GIT binary patch literal 154 zcmd1j<>g`k0@>-;(?RrO5P=LBfgA@QE@lA|DGb33nv8xc8Hzx{2;!Hmera)$eolUJ zVvc@JW|DqEWl2VUp0S>xfqrpjNvdu^Vsdt3dTOzLSx!zaP+d`?etdjpUS>&ryk0@& WEe@O9{FKt1R6CHd#Y{kgg#iGhJR>du literal 0 HcmV?d00001 diff --git a/vllm/lora/__pycache__/layers.cpython-310.pyc b/vllm/lora/__pycache__/layers.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c23642f8309fffcb6b39669350db68a80e57bf39 GIT binary patch literal 24686 zcmeHvTW}oLnchrKPtOIwU;u(307}&GPEaIBk-lnaEhXw|OOz~WWvAEP(GcAO3^*6? 
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
new file mode 100644
index 0000000..e667d70
--- /dev/null
+++ b/vllm/lora/layers.py
@@ -0,0 +1,979 @@
+# pylint: disable=unused-argument
+import math
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PretrainedConfig
+
+from vllm.config import LoRAConfig
+from vllm.lora.punica import add_lora, add_lora_slice, bgmv
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.parallel_utils.communication_op import (
+    tensor_model_parallel_all_gather,
+    tensor_model_parallel_all_reduce,
+    tensor_model_parallel_gather,
+)
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               RowParallelLinear,
+                                               QKVParallelLinear,
+                                               MergedColumnParallelLinear)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead
+from vllm.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.model_executor.parallel_utils.utils import split_tensor_along_last_dim
+
+if TYPE_CHECKING:
+    pass
+
+
+def _apply_lora(
+    x: torch.Tensor,
+    lora_a_stacked: torch.Tensor,
+    lora_b_stacked: torch.Tensor,
+    indices: torch.Tensor,
+    output: torch.Tensor,
+):
+    """Applies lora to each input.
+
+    This method applies all loras to each input. It uses the
+    indices vector to determine which lora yields the
+    correct output. An index of -1 means no lora should be
+    applied. This method adds the final lora results to the
+    output.
+
+    Input shapes:
+        x: (batch_size, hidden_dim)
+        lora_a_stacked: (num_loras, lora_rank, hidden_dim)
+        lora_b_stacked: (num_loras, output_dim, lora_rank)
+        indices: (batch_size)
+        output: (batch_size, output_dim)
+    """
+    org_output = output
+    x = x.view(-1, x.shape[-1])
+    output = output.view(-1, output.shape[-1])
+    indices = indices.view(-1)
+    add_lora(output, x, lora_a_stacked, lora_b_stacked, indices, 0, 1.0)
+    return output.view_as(org_output)
+
+
+def _apply_lora_packed_nslice(
+    x: torch.Tensor,
+    lora_a_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor],
+    lora_b_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor],
+    indices: torch.Tensor,
+    output: torch.Tensor,
+    output_slices: Tuple[int, ...],
+):
+    """Applies lora to each input.
+
+    This method applies all loras to each input. It uses the
+    indices vector to determine which lora yields the
+    correct output. An index of -1 means no lora should be
+    applied. This method adds the final lora results to the
+    output.
+
+    This method is used for layers that are composed of multiple sublayers
+    (slices) packed together.
+
+    Input shapes:
+        x: (batch_size, hidden_dim)
+        lora_a_stacked: 3 element tuple of (num_loras, lora_rank, hidden_dim)
+        lora_b_stacked: 3 element tuple of (num_loras, output_dim, lora_rank)
+        indices: (batch_size)
+        output: (batch_size, q_slice_size + 2*kv_slice_size)
+        output_slices: tuple of slice sizes, one entry per packed slice
+    """
+    org_output = output
+    x = x.view(-1, x.shape[-1])
+    output = output.view(-1, output.shape[-1])
+    indices = indices.view(-1)
+    offset_left = 0
+    for slice_idx in range(len(output_slices)):
+        add_lora_slice(output, x, lora_a_stacked[slice_idx],
+                       lora_b_stacked[slice_idx], indices, 0, 1.0, offset_left,
+                       output_slices[slice_idx])
+        offset_left += output_slices[slice_idx]
+    return output.view_as(org_output)
+
+
+@dataclass
+class LoRAMapping:
+    # Per every token in input_ids:
+    index_mapping: Tuple[int, ...]
+    # Per sampled token:
+    prompt_mapping: Tuple[int, ...]
+
+    def __post_init__(self):
+        self.index_mapping = tuple(self.index_mapping)
+        self.prompt_mapping = tuple(self.prompt_mapping)
+
+
+class BaseLayerWithLoRA(nn.Module):
+
+    def create_lora_weights(self, max_loras: int, lora_config: LoRAConfig,
+                            model_config: PretrainedConfig) -> None:
+        """Initializes lora matrices."""
+        ...
+
+    def reset_lora(self, index: int):
+        """Resets the lora weights at index back to 0."""
+        ...
+
+    def set_lora(
+        self,
+        index: int,
+        lora_a: torch.Tensor,
+        lora_b: torch.Tensor,
+        embeddings_tensor: Optional[torch.Tensor],
+    ):
+        """Overwrites lora tensors at index."""
+        ...
+
+    def set_mapping(
+        self,
+        base_indices: torch.Tensor,
+        sampler_indices: torch.Tensor,
+        sampler_indices_padded: torch.Tensor,
+        embeddings_indices: torch.Tensor,
+        indices_len: List[int],
+    ):
+        """Sets the mapping indices."""
+        ...
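+
+
+# A minimal reference sketch, not part of the vLLM API: `_reference_apply_lora`
+# is a hypothetical helper that spells out, in plain PyTorch, what the fused
+# punica call in `_apply_lora` computes; it can be used to sanity-check the
+# fused path. Shapes follow the `_apply_lora` docstring above; the real
+# stacked buffers carry an extra unit "layer" dimension that this sketch
+# omits.
+def _reference_apply_lora(
+    x: torch.Tensor,  # (batch_size, hidden_dim)
+    lora_a_stacked: torch.Tensor,  # (num_loras, lora_rank, hidden_dim)
+    lora_b_stacked: torch.Tensor,  # (num_loras, output_dim, lora_rank)
+    indices: torch.Tensor,  # (batch_size,)
+    output: torch.Tensor,  # (batch_size, output_dim), updated in place
+) -> torch.Tensor:
+    for i in range(x.shape[0]):
+        idx = int(indices[i])
+        if idx == -1:
+            continue  # an index of -1 means "no LoRA" for this row
+        a = lora_a_stacked[idx]
+        b = lora_b_stacked[idx]
+        # output[i] += B @ (A @ x_i), i.e. x_i @ A^T @ B^T with scale 1.0
+        output[i] += b @ (a @ x[i])
+    return output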
+ + +class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA): + + def __init__(self, base_layer: VocabParallelEmbedding) -> None: + super().__init__() + self.base_layer = base_layer + + def create_lora_weights( + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None) -> None: + + lora_vocab_start_idx = self.base_layer.org_vocab_size + weights_idx = None + if self.base_layer.vocab_end_index > lora_vocab_start_idx: + # We can start adding lora weights + weights_idx = max( + lora_vocab_start_idx - self.base_layer.vocab_start_index, 0) + self.embeddings_slice = (self.base_layer.vocab_start_index - + self.base_layer.org_vocab_size + + weights_idx, + self.base_layer.vocab_end_index - + self.base_layer.org_vocab_size) + self.embeddings_weights = self.base_layer.weight.data[weights_idx:] + self.embeddings_weights.fill_(0) + else: + self.embeddings_slice = None + self.embeddings_weights = None + + self.embeddings_tensors = torch.zeros( + ( + max_loras, + lora_config.lora_extra_vocab_size, + self.base_layer.embedding_dim, + ), + dtype=self.base_layer.weight.dtype, + device=self.base_layer.weight.device, + ) + self.lora_a_stacked = torch.zeros( + ( + max_loras, + self.base_layer.org_vocab_size + + lora_config.lora_extra_vocab_size, + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ) + self.lora_b_stacked = torch.zeros( + ( + max_loras, + 1, + self.base_layer.embedding_dim, + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ) + self.lora_a_stacked_2d = self.lora_a_stacked.view( + self.lora_a_stacked.shape[0] * self.lora_a_stacked.shape[1], + self.lora_a_stacked.shape[2], + ) + self.indices: Optional[torch.Tensor] = None + self.indices_len: Optional[List[int]] = None + self.embeddings_indices = None + + def reset_lora(self, index: int): + self.lora_a_stacked[index] = 0 + self.lora_b_stacked[index] = 0 + self.embeddings_tensors[index] = 0 + + def set_lora( + self, + index: int, + lora_a: torch.Tensor, + lora_b: torch.Tensor, + embeddings_tensor: Optional[torch.Tensor], + ): + self.reset_lora(index) + self.lora_a_stacked[index, :lora_a.shape[0], :lora_a.shape[1]].copy_( + lora_a, non_blocking=True) + self.lora_b_stacked[index, + 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + lora_b.T, non_blocking=True) + if embeddings_tensor is not None: + self.embeddings_tensors[ + index, :embeddings_tensor.shape[0], :embeddings_tensor. 
+ shape[1]].copy_(embeddings_tensor, non_blocking=True) + if self.embeddings_slice is not None: + # TODO(yard1): Optimize this copy, we don't need to copy + # everything, just the modified part + embeddings = self.embeddings_tensors.view( + self.embeddings_tensors.shape[0] * + self.embeddings_tensors.shape[1], + self.embeddings_tensors.shape[2] + )[self.embeddings_slice[0]:self.embeddings_slice[1]] + self.embeddings_weights[:embeddings.shape[0]].copy_(embeddings) + + def set_mapping( + self, + base_indices: torch.Tensor, + sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, + indices_len: List[int], + ): + self.indices = base_indices + self.embeddings_indices = embeddings_indices + self.indices_len = indices_len + + def forward(self, x: torch.Tensor) -> torch.Tensor: + added_tokens_mask = x > self.base_layer.org_vocab_size - 1 + indices = self.embeddings_indices[1][:self.indices_len[3]].view_as(x) + full_lora_a_embeddings = F.embedding( + x + indices, + self.lora_a_stacked_2d, + ) + indices = self.embeddings_indices[0][:self.indices_len[3]].view_as(x) + full_output = self.base_layer.forward( + x.add_(indices * added_tokens_mask)) + + full_output_org = full_output + if full_output.ndim == 3: + full_output = full_output.view( + full_output.shape[0] * full_output.shape[1], -1) + if full_lora_a_embeddings.ndim == 3: + full_lora_a_embeddings = full_lora_a_embeddings.view( + full_lora_a_embeddings.shape[0] * + full_lora_a_embeddings.shape[1], -1) + bgmv(full_output, full_lora_a_embeddings, self.lora_b_stacked, + self.indices[:self.indices_len[0]], 0, 1.0) + return full_output.view_as(full_output_org) + + +class ColumnParallelLinearWithLoRA(BaseLayerWithLoRA): + + def __init__(self, base_layer: ColumnParallelLinear) -> None: + super().__init__() + self.base_layer = base_layer + + def create_lora_weights( + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None) -> None: + self.lora_a_stacked = torch.zeros( + max_loras, + 1, + lora_config.max_lora_rank, + self.base_layer.weight.shape[1], + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ) + self.lora_b_stacked = torch.zeros( + max_loras, + 1, + self.base_layer.weight.shape[0], + lora_config.max_lora_rank, + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ) + + self.indices: Optional[torch.Tensor] = None + self.indices_len: Optional[List[int]] = None + self.output_dim = self.lora_b_stacked.shape[1] + + def reset_lora(self, index: int): + self.lora_a_stacked[index] = 0 + self.lora_b_stacked[index] = 0 + + def set_lora( + self, + index: int, + lora_a: torch.Tensor, + lora_b: torch.Tensor, + embeddings_tensor: Optional[torch.Tensor], + ): + self.reset_lora(index) + + self.lora_a_stacked[index, + 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( + lora_a.T, non_blocking=True) + self.lora_b_stacked[index, + 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + lora_b.T, non_blocking=True) + + def set_mapping( + self, + base_indices: torch.Tensor, + sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, + indices_len: List[int], + ): + self.indices = base_indices + self.indices_len = indices_len + + def apply_weights(self, x: torch.Tensor, + bias: Optional[torch.Tensor]) -> torch.Tensor: + output = self.base_layer.linear_method.apply_weights( + self.base_layer.linear_weights, x, bias) + _apply_lora( + x, + self.lora_a_stacked, + self.lora_b_stacked, + 
self.indices[:self.indices_len[0]], + output, + ) + return output + + def forward(self, input_): + """Forward of ColumnParallelLinear + + Args: + input_: Tensor whose last dimension is `input_size`. + + Returns: + - output + - bias + """ + bias = (self.base_layer.bias + if not self.base_layer.skip_bias_add else None) + + # Matrix multiply. + output_parallel = self.apply_weights(input_, bias) + if self.base_layer.gather_output: + # All-gather across the partitions. + output = tensor_model_parallel_all_gather(output_parallel) + else: + output = output_parallel + output_bias = (self.base_layer.bias + if self.base_layer.skip_bias_add else None) + return output, output_bias + + @property + def linear_weights(self): + return self.base_layer.linear_weights + + +class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): + """ColumnParallelLinear layer that is composed of 2 sublayers (slices) + packed together (eg. gate_proj + up_proj -> gate_up_proj). + + This means we have 2 LoRAs, each applied to one half of the layer. + + Both slices must have the same size. + """ + + def __init__(self, base_layer: MergedColumnParallelLinear) -> None: + super().__init__(base_layer) + + def create_lora_weights( + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None) -> None: + n_slices = 2 + if not (len(self.base_layer.output_sizes) == n_slices + and self.base_layer.output_sizes[0] + == self.base_layer.output_sizes[1]): + raise ValueError( + "LoRAColumnParallelLinear2Slice requires 2 slices with " + "the same size.") + self.tp_size = get_tensor_model_parallel_world_size() + + self.lora_a_stacked = tuple( + torch.zeros( + max_loras, + 1, + lora_config.max_lora_rank, + self.base_layer.weight.shape[1], + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ) for _ in range(n_slices)) + self.lora_b_stacked = tuple( + torch.zeros( + max_loras, + 1, + self.base_layer.weight.shape[0] // 2, + lora_config.max_lora_rank, + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ) for _ in range(n_slices)) + + self.indices: Optional[torch.Tensor] = None + self.output_dim = self.lora_b_stacked[0].shape[2] + + def reset_lora(self, index: int): + self.lora_a_stacked[0][index] = 0 + self.lora_a_stacked[1][index] = 0 + self.lora_b_stacked[0][index] = 0 + self.lora_b_stacked[1][index] = 0 + + def set_lora( + self, + index: int, + lora_a: torch.Tensor, + lora_b: torch.Tensor, + embeddings_tensor: Optional[torch.Tensor], + ): + self.reset_lora(index) + + if self.tp_size > 1: + tensor_model_parallel_rank = get_tensor_model_parallel_rank() + shard_size = self.output_dim + start_idx = tensor_model_parallel_rank * shard_size + end_idx = (tensor_model_parallel_rank + 1) * shard_size + lora_b = lora_b[0][:, + start_idx:end_idx], lora_b[1][:, + start_idx:end_idx] + + if lora_a[0] is not None: + self.lora_a_stacked[0][ + index, 0, :lora_a[0].shape[1], :lora_a[0].shape[0]].copy_( + lora_a[0].T, non_blocking=True) + self.lora_b_stacked[0][ + index, 0, :lora_b[0].shape[1], :lora_b[0].shape[0]].copy_( + lora_b[0].T, non_blocking=True) + if lora_a[1] is not None: + self.lora_a_stacked[1][ + index, 0, :lora_a[1].shape[1], :lora_a[1].shape[0]].copy_( + lora_a[1].T, non_blocking=True) + self.lora_b_stacked[1][ + index, 0, :lora_b[1].shape[1], :lora_b[1].shape[0]].copy_( + lora_b[1].T, non_blocking=True) + + def apply_weights(self, x: torch.Tensor, + bias: Optional[torch.Tensor]) -> torch.Tensor: + output = self.base_layer.linear_method.apply_weights( + 
self.base_layer.linear_weights, x, bias) + _apply_lora_packed_nslice( + x, + self.lora_a_stacked, + self.lora_b_stacked, + self.indices[:self.indices_len[0]], + output, + (self.output_dim, self.output_dim), + ) + return output + + +class QKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): + """ColumnParallelLinear layer that is composed of 3 sublayers (slices) + packed together in qkv proj fashion + (q_proj + k_proj + v_proj -> qkv_proj). + + This means we have 3 LoRAs, each applied to one slice of the layer. + + Q slice may have different shape than K and V slices (which both have + the same shape). + """ + + def __init__(self, base_layer: QKVParallelLinear) -> None: + super().__init__(base_layer) + + def create_lora_weights( + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None) -> None: + self.tp_size = get_tensor_model_parallel_world_size() + tp_rank = get_tensor_model_parallel_rank() + self.q_proj_shard_size = (self.base_layer.num_heads * + self.base_layer.head_size) + self.kv_proj_shard_size = (self.base_layer.num_kv_heads * + self.base_layer.head_size) + self.q_shard_id = tp_rank + self.kv_shard_id = tp_rank // self.base_layer.num_kv_head_replicas + + # q, k, v + self.lora_a_stacked = ( + torch.zeros( + max_loras, + 1, + lora_config.max_lora_rank, + self.base_layer.weight.shape[1], + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ), + torch.zeros( + max_loras, + 1, + lora_config.max_lora_rank, + self.base_layer.weight.shape[1], + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ), + torch.zeros( + max_loras, + 1, + lora_config.max_lora_rank, + self.base_layer.weight.shape[1], + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ), + ) + self.lora_b_stacked = ( + torch.zeros( + max_loras, + 1, + self.q_proj_shard_size, + lora_config.max_lora_rank, + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ), + torch.zeros( + max_loras, + 1, + self.kv_proj_shard_size, + lora_config.max_lora_rank, + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ), + torch.zeros( + max_loras, + 1, + self.kv_proj_shard_size, + lora_config.max_lora_rank, + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ), + ) + + self.output_slices = (self.q_proj_shard_size, self.kv_proj_shard_size, + self.kv_proj_shard_size) + self.packed_indices: Optional[torch.Tensor] = None + self.standard_indices: Optional[torch.Tensor] = None + self.indices_len: Optional[List[int]] = None + + def reset_lora(self, index: int): + self.lora_a_stacked[0][index] = 0 + self.lora_b_stacked[0][index] = 0 + self.lora_a_stacked[1][index] = 0 + self.lora_b_stacked[1][index] = 0 + self.lora_a_stacked[2][index] = 0 + self.lora_b_stacked[2][index] = 0 + + def set_lora( + self, + index: int, + lora_a: torch.Tensor, + lora_b: torch.Tensor, + embeddings_tensor: Optional[torch.Tensor], + ): + self.reset_lora(index) + + if self.tp_size > 1: + if lora_b[0] is not None: + lora_b_q = lora_b[0][:, self.q_proj_shard_size * + self.q_shard_id:self.q_proj_shard_size * + (self.q_shard_id + 1)] + self.lora_b_stacked[0][ + index, 0, :lora_b_q.shape[1], :lora_b_q.shape[0]].copy_( + lora_b_q.T, non_blocking=True) + if lora_b[1] is not None: + lora_b_k = lora_b[1][:, self.kv_proj_shard_size * + self.kv_shard_id:self.kv_proj_shard_size * + (self.kv_shard_id + 1)] + self.lora_b_stacked[1][ + index, 0, :lora_b_k.shape[1], :lora_b_k.shape[0]].copy_( + lora_b_k.T, non_blocking=True) + 
if lora_b[2] is not None: + lora_b_v = lora_b[2][:, self.kv_proj_shard_size * + self.kv_shard_id:self.kv_proj_shard_size * + (self.kv_shard_id + 1)] + self.lora_b_stacked[2][ + index, 0, :lora_b_v.shape[1], :lora_b_v.shape[0]].copy_( + lora_b_v.T, non_blocking=True) + else: + if lora_b[0] is not None: + self.lora_b_stacked[0][ + index, 0, :lora_b[0].shape[1], :lora_b[0].shape[0]].copy_( + lora_b[0].T, non_blocking=True) + if lora_b[1] is not None: + self.lora_b_stacked[1][ + index, 0, :lora_b[1].shape[1], :lora_b[1].shape[0]].copy_( + lora_b[1].T, non_blocking=True) + if lora_b[2] is not None: + self.lora_b_stacked[2][ + index, 0, :lora_b[2].shape[1], :lora_b[2].shape[0]].copy_( + lora_b[2].T, non_blocking=True) + + if lora_a[0] is not None: + self.lora_a_stacked[0][ + index, 0, :lora_a[0].shape[1], :lora_a[0].shape[0]].copy_( + lora_a[0].T, non_blocking=True) + if lora_a[1] is not None: + self.lora_a_stacked[1][ + index, 0, :lora_a[1].shape[1], :lora_a[1].shape[0]].copy_( + lora_a[1].T, non_blocking=True) + if lora_a[2] is not None: + self.lora_a_stacked[2][ + index, 0, :lora_a[2].shape[1], :lora_a[2].shape[0]].copy_( + lora_a[2].T, non_blocking=True) + + def apply_weights(self, x: torch.Tensor, + bias: Optional[torch.Tensor]) -> torch.Tensor: + output = self.base_layer.linear_method.apply_weights( + self.base_layer.linear_weights, x, bias) + _apply_lora_packed_nslice( + x, + self.lora_a_stacked, + self.lora_b_stacked, + self.indices[:self.indices_len[0]], + output, + self.output_slices, + ) + return output + + +class RowParallelLinearWithLoRA(BaseLayerWithLoRA): + + def __init__(self, base_layer: RowParallelLinear) -> None: + super().__init__() + self.base_layer = base_layer + + def create_lora_weights( + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None) -> None: + self.lora_a_stacked = torch.zeros( + ( + max_loras, + 1, + lora_config.max_lora_rank, + self.base_layer.weight.shape[1], + ), + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ) + self.lora_b_stacked = torch.zeros( + ( + max_loras, + 1, + self.base_layer.weight.shape[0], + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ) + self.indices: Optional[torch.Tensor] = None + self.indices_len: Optional[List[int]] = None + + def reset_lora(self, index: int): + self.lora_a_stacked[index] = 0 + self.lora_b_stacked[index] = 0 + + def set_lora( + self, + index: int, + lora_a: torch.Tensor, + lora_b: torch.Tensor, + embeddings_tensor: Optional[torch.Tensor], + ): + self.reset_lora(index) + if self.base_layer.tp_size > 1: + tensor_model_parallel_rank = get_tensor_model_parallel_rank() + shard_size = self.base_layer.weight.shape[1] + start_idx = tensor_model_parallel_rank * shard_size + end_idx = (tensor_model_parallel_rank + 1) * shard_size + lora_a = lora_a[start_idx:end_idx, :] + + self.lora_a_stacked[index, + 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( + lora_a.T, non_blocking=True) + self.lora_b_stacked[index, + 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + lora_b.T, non_blocking=True) + + def set_mapping( + self, + base_indices: torch.Tensor, + sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, + indices_len: List[int], + ): + self.indices = base_indices + self.indices_len = indices_len + + def apply_weights(self, x: torch.Tensor) -> torch.Tensor: + output = self.base_layer.linear_method.apply_weights( + self.base_layer.linear_weights, x) + 
_apply_lora(
+            x,
+            self.lora_a_stacked,
+            self.lora_b_stacked,
+            self.indices[:self.indices_len[0]],
+            output,
+        )
+        return output
+
+    def forward(self, input_):
+        """Forward of RowParallelLinear
+
+        Args:
+            input_: tensor whose last dimension is `input_size`. If
+                `input_is_parallel` is set, then the last dimension
+                is `input_size // tp_size`.
+
+        Returns:
+            - output
+            - bias
+        """
+        # Set up backprop all-reduce.
+        if self.base_layer.input_is_parallel:
+            input_parallel = input_
+        else:
+            # TODO: simplify code below
+            tp_rank = get_tensor_model_parallel_rank()
+            splitted_input = split_tensor_along_last_dim(
+                input_, num_partitions=self.base_layer.tp_size)
+            input_parallel = splitted_input[tp_rank].contiguous()
+
+        # Matrix multiply.
+        output_parallel = self.apply_weights(input_parallel)
+        if self.base_layer.reduce_results and self.base_layer.tp_size > 1:
+            output_ = tensor_model_parallel_all_reduce(output_parallel)
+        else:
+            output_ = output_parallel
+
+        if not self.base_layer.skip_bias_add:
+            output = (output_ + self.base_layer.bias
+                      if self.base_layer.bias is not None else output_)
+            output_bias = None
+        else:
+            output = output_
+            output_bias = self.base_layer.bias
+        return output, output_bias
+
+    @property
+    def weight(self):
+        return self.base_layer.weight
+
+
+class SamplerWithLoRA(BaseLayerWithLoRA):
+
+    def __init__(
+        self,
+        base_layer: Sampler,
+        hidden_size: int,
+        dtype: torch.dtype,
+        device: torch.device,
+    ) -> None:
+        super().__init__()
+        self.base_layer = base_layer
+        self.hidden_size = hidden_size
+        self.dtype = dtype
+        self.device = device
+
+    @property
+    def logits_as_hidden_states(self):
+        return self.base_layer.logits_as_hidden_states
+
+    @property
+    def vocab_size(self):
+        return self.base_layer.vocab_size
+
+    @property
+    def org_vocab_size(self):
+        return self.base_layer.org_vocab_size
+
+    @property
+    def include_gpu_probs_tensor(self):
+        return self.base_layer.include_gpu_probs_tensor
+
+    def create_lora_weights(
+        self,
+        max_loras: int,
+        lora_config: LoRAConfig,
+        model_config: Optional[PretrainedConfig] = None,
+    ) -> None:
+        # Keep this in sync with csrc/punica/bgmv/bgmv_config.h
+        if not 32000 <= self.base_layer.vocab_size <= 33024:
+            raise ValueError(
+                "When using LoRA, vocab size must satisfy "
+                "32000 <= vocab_size <= 33024")
+        self.lora_a_stacked = torch.zeros(
+            (
+                max_loras,
+                1,
+                lora_config.max_lora_rank,
+                self.hidden_size,
+            ),
+            dtype=lora_config.lora_dtype,
+            device=self.device,
+        )
+        self.lora_b_stacked = torch.zeros(
+            (
+                max_loras,
+                1,
+                # Pad for kernel compatibility
+                math.ceil(self.base_layer.vocab_size /
+                          lora_config.lora_vocab_padding_size) *
+                lora_config.lora_vocab_padding_size,
+                lora_config.max_lora_rank,
+            ),
+            dtype=lora_config.lora_dtype,
+            device=self.device,
+        )
+        self.embeddings_tensors = torch.full(
+            (max_loras, lora_config.lora_extra_vocab_size, self.hidden_size),
+            fill_value=float("-inf"),
+            dtype=self.dtype,
+            device=self.device,
+        )
+        self.indices = None
+        self.indices_padded = None
+        self.indices_len = None
+
+    def reset_lora(self, index: int):
+        self.lora_a_stacked[index] = 0
+        self.lora_b_stacked[index] = 0
+        self.embeddings_tensors[index] = float("-inf")
+
+    def set_lora(
+        self,
+        index: int,
+        lora_a: torch.Tensor,
+        lora_b: torch.Tensor,
+        embeddings_tensor: Optional[torch.Tensor],
+    ):
+        self.reset_lora(index)
+        self.lora_a_stacked[index,
+                            0, :lora_a.shape[1], :lora_a.shape[0]].copy_(
+                                lora_a.T, non_blocking=True)
+        self.lora_b_stacked[index,
+                            0, :lora_b.shape[1], :lora_b.shape[0]].copy_(
+                                lora_b.T,
non_blocking=True) + if embeddings_tensor is not None: + self.embeddings_tensors[ + index, :embeddings_tensor.shape[0], :embeddings_tensor. + shape[1], ] = embeddings_tensor + + def set_mapping( + self, + base_indices: torch.Tensor, + sampler_indices: torch.Tensor, + sampler_indices_padded: torch.Tensor, + embeddings_indices: torch.Tensor, + indices_len: List[int], + ): + self.indices = sampler_indices + self.indices_padded = sampler_indices_padded + self.indices_len = indices_len + + def _get_logits( + self, + hidden_states: torch.Tensor, + embedding: torch.Tensor, + embedding_bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + # Get the logits for the next tokens. + logits = torch.matmul(hidden_states, embedding.t()) + if embedding_bias is not None: + logits += embedding_bias + logits = tensor_model_parallel_gather(logits) + if logits is None: + return None + + lora_logits = torch.empty( + self.embeddings_tensors.shape[0] + 1, + self.embeddings_tensors.shape[1], + hidden_states.shape[0], + dtype=self.embeddings_tensors.dtype, + device=self.embeddings_tensors.device, + ) + torch.matmul(self.embeddings_tensors, + hidden_states.T, + out=lora_logits[:-1]) + lora_logits[-1] = float("-inf") + lora_logits = lora_logits.mT + lora_logits = (lora_logits.reshape( + lora_logits.shape[0] * lora_logits.shape[1], + lora_logits.shape[2], + ).index_select(0, + self.indices_padded[:self.indices_len[2]]).nan_to_num_( + nan=float("-inf"), + posinf=float("inf"), + neginf=float("-inf"))) + logits[:, + self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + + lora_logits.shape[1]] = lora_logits + + _apply_lora( + hidden_states, + self.lora_a_stacked, + self.lora_b_stacked, + self.indices[:self.indices_len[1]], + logits, + ) + + # Remove paddings in vocab (if any). 
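+        # lora_b_stacked was rounded up to lora_vocab_padding_size for the
+        # punica kernel, so the logits here can be wider than the true vocab.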
+        logits = logits[:, :self.base_layer.vocab_size]
+
+        return logits
+
+    def forward(self, *args, **kwargs):
+        return type(self.base_layer).forward(self, *args, **kwargs)
+
+
+def from_layer(
+        layer: nn.Module,
+        max_loras: int,
+        lora_config: LoRAConfig,
+        model_config: Optional[PretrainedConfig] = None) -> BaseLayerWithLoRA:
+    supported_layer_types = {
+        VocabParallelEmbedding: VocabParallelEmbeddingWithLoRA,
+        ColumnParallelLinear: ColumnParallelLinearWithLoRA,
+        QKVParallelLinear: QKVParallelLinearWithLora,
+        MergedColumnParallelLinear: MergedColumnParallelLinearWithLoRA,
+        RowParallelLinear: RowParallelLinearWithLoRA,
+    }
+    for src_layer_type, lora_layer_type in supported_layer_types.items():
+        if type(layer) is src_layer_type:  # pylint: disable=unidiomatic-typecheck
+            ret = lora_layer_type(layer)
+            ret.create_lora_weights(max_loras, lora_config, model_config)
+            return ret
+    return layer
+
+
+def from_layer_sampler(
+    layer: Sampler,
+    lm_head: ParallelLMHead,
+    max_loras: int,
+    lora_config: LoRAConfig,
+    model_config: Optional[PretrainedConfig] = None,
+) -> SamplerWithLoRA:
+    ret = SamplerWithLoRA(layer, lm_head.embedding_dim, lm_head.weight.dtype,
+                          lm_head.weight.device)
+    ret.create_lora_weights(max_loras, lora_config, model_config)
+    return ret
diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py
new file mode 100644
index 0000000..fbb228c
--- /dev/null
+++ b/vllm/lora/lora.py
@@ -0,0 +1,160 @@
+from typing import List, Optional
+
+import torch
+from vllm.utils import in_wsl
+
+
+class LoRALayerWeights:
+    """LoRA weights for a layer composed of two low rank matrices."""
+
+    def __init__(
+        self,
+        module_name: str,
+        rank: int,
+        lora_alpha: int,
+        lora_a: torch.Tensor,
+        lora_b: torch.Tensor,
+        embeddings_tensor: Optional[torch.Tensor] = None,
+        scaling: Optional[float] = None,
+    ) -> None:
+        self.module_name = module_name
+        self.rank = rank
+        self.lora_alpha = lora_alpha
+        self.lora_a = lora_a
+        self.lora_b = lora_b
+        self.embeddings_tensor = embeddings_tensor
+
+        if scaling is None:
+            self.scaling = self.lora_alpha / self.rank
+        else:
+            self.scaling = scaling
+
+    def optimize(self) -> "LoRALayerWeights":
+        """Optimize the LoRA by merging the scaling into lora_b."""
+        if self.scaling == 1:
+            return self
+        self.lora_b *= self.scaling
+        self.scaling = 1
+        return self
+
+    @property
+    def input_dim(self) -> int:
+        return self.lora_a.shape[0]
+
+    @property
+    def output_dim(self) -> int:
+        return self.lora_b.shape[1]
+
+    @property
+    def is_packed(self) -> bool:
+        return False
+
+    @property
+    def extra_vocab_size(self) -> int:
+        return self.embeddings_tensor.shape[
+            0] if self.embeddings_tensor is not None else 0
+
+    @classmethod
+    def create_dummy_lora_weights(
+            cls,
+            module_name: str,
+            input_dim: int,
+            output_dim: int,
+            rank: int,
+            dtype: torch.dtype,
+            device: torch.device,
+            embeddings_tensor_dim: Optional[int] = None) -> "LoRALayerWeights":
+        pin_memory = str(device) == "cpu" and not in_wsl()
+        lora_a = torch.zeros([input_dim, rank],
+                             dtype=dtype,
+                             device=device,
+                             pin_memory=pin_memory)
+        lora_b = torch.zeros([rank, output_dim],
+                             dtype=dtype,
+                             device=device,
+                             pin_memory=pin_memory)
+        embeddings_tensor = torch.rand(
+            10,
+            embeddings_tensor_dim,
+            dtype=dtype,
+            device=device,
+            pin_memory=pin_memory) if embeddings_tensor_dim else None
+        return cls(
+            module_name,
+            rank=rank,
+            lora_alpha=1,
+            lora_a=lora_a,
+            lora_b=lora_b,
+            embeddings_tensor=embeddings_tensor,
+        )
+
+
+class PackedLoRALayerWeights(LoRALayerWeights):
+    """LoRA used for packed layers (e.g.
qkv_proj).""" + + def __init__( + self, + module_name: str, + rank: int, + lora_alphas: List[int], + lora_a: List[torch.Tensor], + lora_b: List[torch.Tensor], + scaling: Optional[List[float]] = None, + ) -> None: + super().__init__( + module_name=module_name, + rank=rank, + lora_alpha=0, + lora_a=lora_a, + lora_b=lora_b, + scaling=scaling, + embeddings_tensor=None, + ) + self.lora_alphas = lora_alphas + if scaling is None: + self.scaling = [ + lora_alpha / self.rank for lora_alpha in self.lora_alphas + ] + + @classmethod + def pack(cls, loras: List["LoRALayerWeights"]) -> "PackedLoRALayerWeights": + """Pack a list of LoRAs into a single LoRA. + + If LoRA is None, it signifies that the submodule does not have a LoRA. + """ + first_lora = next(lora for lora in loras if lora is not None) + for lora in loras: + if lora is None: + continue + lora.optimize() + rank = first_lora.rank + module_name = first_lora.module_name + obj = cls( + module_name, + rank, + [lora.lora_alpha if lora is not None else None for lora in loras], + [lora.lora_a if lora is not None else None for lora in loras], + [lora.lora_b if lora is not None else None for lora in loras], + scaling=[1 if lora is not None else None for lora in loras]) + return obj + + def optimize(self) -> "PackedLoRALayerWeights": + """Optimize the LoRA by merging the scaling into lora_b.""" + for i in range(len(self.lora_b)): + if self.scaling[i] == 1 or self.lora_b[i] is None: + continue + self.lora_b[i] *= self.scaling[i] + self.scaling[i] = 1 + return self + + @property + def input_dim(self) -> int: + raise NotImplementedError() + + @property + def output_dim(self) -> int: + raise NotImplementedError() + + @property + def is_packed(self) -> bool: + return True diff --git a/vllm/lora/models.py b/vllm/lora/models.py new file mode 100644 index 0000000..7386d21 --- /dev/null +++ b/vllm/lora/models.py @@ -0,0 +1,620 @@ +import copy +import json +import logging +import math +import os +import re +from typing import (Any, Callable, Dict, Hashable, List, Optional, Tuple, Type) + +import safetensors.torch +import torch +from torch import nn + +from vllm.config import LoRAConfig +from vllm.utils import LRUCache, in_wsl + +from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping, from_layer, from_layer_sampler +from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights +from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule + +logger = logging.getLogger(__name__) + +_GLOBAL_LORA_ID = 0 + + +def convert_mapping( + mapping: LoRAMapping, lora_index_to_id: List[Optional[int]], + max_loras: int, vocab_size: int, extra_vocab_size: int +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, List[int]]: + """Converts LoRAMapping to index tensors. + + Args: + mapping: LoRAMapping mapping rows in a batch to LoRA ids. + lora_index_to_id: List mapping LoRA ids to LoRA indices. + max_loras: Maximum number of LoRAs. + vocab_size: Model vocab size. + extra_vocab_size: Extra vocab size each LoRA can have. + + Returns: + A tuple of tensors: + base_indices: Tensor of shape [batch_size] mapping batch rows to + LoRA indices. + sampler_indices: Tensor of shape [batch_size] mapping requests to + LoRA indices for sampler. For generation, this will be the + same as base_indicies. For prefill, this will map requests + to LoRA indices. + sampler_indices_padded: Tensor of shape [batch_size] mapping + requests to LoRA indices for sampler with padding. + Same as sampler_indicies, but -1 is replaced with + max_loras. 
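+                Padded entries point at an extra slot whose sampler
+                logits are filled with -inf, so padded rows never
+                affect sampling.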
+ embeddings_indices: Tensor of shape [2, batch_size] mapping + requests to embedding indices. First row is for embeddings + added by the LoRAs, second row is for the LoRA.lora_a + embeddings. + indices_len: List of lengths of the above tensors. + """ + indices = list(mapping.index_mapping).copy() + embedding_indices = indices.copy() + lora_indices = indices.copy() + prompt_mapping = [ + lora_index_to_id.index(x) if x > 0 else -1 + for x in mapping.prompt_mapping + ] + lora_idx = None + for i in range(len(indices)): + # TODO index can be slow. optimize + lora_idx = (lora_index_to_id.index(indices[i]) + if indices[i] > 0 else -1) + embedding_indices[i] = lora_idx if indices[i] > 0 else 0 + indices[i] = i + lora_indices[i] = lora_idx + + indices = torch.tensor([indices, lora_indices, embedding_indices], + dtype=torch.long, + device="cuda") + prompt_mapping = torch.tensor(prompt_mapping, + device="cuda", + dtype=torch.long) + embeddings_indices = torch.stack([ + indices[2] * extra_vocab_size, + indices[2] * (vocab_size + extra_vocab_size) + ]) + embeddings_indices[embeddings_indices == -1] = max_loras - 1 + base_indices = indices[1] + sampler_indices = prompt_mapping + sampler_indices_padded = sampler_indices.clone() + sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 + sampler_indices_padded = ( + torch.arange( + 0, len(sampler_indices_padded), device="cuda", dtype=torch.long) + + (sampler_indices_padded * len(sampler_indices_padded))) + indices_len = (base_indices.shape[-1], sampler_indices.shape[-1], + sampler_indices_padded.shape[-1], + embeddings_indices.shape[-1]) + + return (base_indices, sampler_indices, sampler_indices_padded, + embeddings_indices, indices_len) + + +def get_lora_id(): + global _GLOBAL_LORA_ID + _GLOBAL_LORA_ID += 1 + return _GLOBAL_LORA_ID + + +class LoRAModel: + """A LoRA fine-tuned model.""" + + def __init__( + self, + lora_model_id: int, + rank: int, + loras: Dict[str, LoRALayerWeights], + ) -> None: + self.id = lora_model_id + assert (lora_model_id > + 0), f"a valid lora id should be greater than 0, got {self.id}" + self.rank = rank + self.loras: Dict[str, LoRALayerWeights] = loras + + @property + def extra_vocab_size(self) -> int: + return max(lora.extra_vocab_size + for lora in self.loras.values()) if self.loras else 0 + + def get_lora(self, module_name: str) -> Optional[LoRALayerWeights]: + """Get LoRA for a given module by name""" + return self.loras.get(module_name, None) + + # (yard1): TODO see if we can derive target_embedding_padding automatically + @classmethod + def from_lora_tensors( + cls, + lora_model_id: int, + rank: int, + lora_alpha: int, + tensors: Dict[str, torch.Tensor], + device: str = "cuda", + dtype: Optional[torch.dtype] = None, + embeddings: Optional[Dict[str, torch.Tensor]] = None, + target_embedding_padding: Optional[int] = None, + embedding_modules: Optional[Dict[str, str]] = None, + embedding_padding_modules: Optional[List[str]] = None, + ) -> "LoRAModel": + """Create a LoRAModel from a dictionary of tensors.""" + pin_memory = str(device) == "cpu" and not in_wsl() + loras: Dict[str, LoRALayerWeights] = {} + for tensor_name, tensor in tensors.items(): + module_name, is_lora_a = parse_fine_tuned_lora_name(tensor_name) + if module_name not in loras: + lora_embeddings_tensor = None + if embeddings: + embeddings_module = next( + (k for k in embedding_modules if k in module_name), + None) + if embeddings_module: + lora_embeddings_tensor = embeddings[ + embedding_modules[embeddings_module]].to( + device=device, dtype=dtype) 
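+                        # Pinning (below) keeps this staging tensor in
+                        # page-locked host memory, so the later
+                        # non_blocking copies into the GPU buffers can
+                        # overlap compute.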
+ if pin_memory: + lora_embeddings_tensor = ( + lora_embeddings_tensor.pin_memory()) + loras[module_name] = LoRALayerWeights(module_name, rank, + lora_alpha, None, None, + lora_embeddings_tensor) + if is_lora_a: + loras[module_name].lora_a = tensor.to(device=device, + dtype=dtype).t() + if pin_memory: + loras[module_name].lora_a = loras[ + module_name].lora_a.pin_memory() + else: + loras[module_name].lora_b = tensor.to(device=device, + dtype=dtype).t() + if any(name in module_name + for name in embedding_padding_modules + ) and target_embedding_padding is not None: + lora_b = loras[module_name].lora_b + assert target_embedding_padding >= lora_b.shape[1] + addition = target_embedding_padding - lora_b.shape[1] + loras[module_name].lora_b = torch.nn.functional.pad( + lora_b, (0, addition)) + if pin_memory: + loras[module_name].lora_b = loras[ + module_name].lora_b.pin_memory() + + for lora in loras.values(): + lora.optimize() + return cls(lora_model_id, rank, loras) + + @classmethod + def from_local_checkpoint( + cls, + lora_dir: str, + lora_model_id: Optional[int] = None, + device: str = "cuda", + dtype: Optional[torch.dtype] = None, + target_embedding_padding: Optional[int] = None, + embedding_modules: Optional[Dict[str, str]] = None, + embedding_padding_modules: Optional[List[str]] = None, + ) -> "LoRAModel": + """Create a LoRAModel from a local checkpoint.""" + lora_config_path = os.path.join(lora_dir, "adapter_config.json") + lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors") + lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin") + new_embeddings_tensor_path = os.path.join( + lora_dir, "new_embeddings.safetensors") + new_embeddings_bin_file_path = os.path.join(lora_dir, + "new_embeddings.bin") + if os.path.isfile(lora_tensor_path): + tensors = safetensors.torch.load_file(lora_tensor_path) + elif os.path.isfile(lora_bin_file_path): + tensors = torch.load(lora_bin_file_path) + else: + raise ValueError(f"{lora_dir} doesn't contain tensors") + + embeddings = None + if os.path.isfile(new_embeddings_tensor_path): + embeddings = safetensors.torch.load_file( + new_embeddings_tensor_path) + elif os.path.isfile(new_embeddings_bin_file_path): + embeddings = torch.load(new_embeddings_bin_file_path) + + with open(lora_config_path) as f: + config = json.load(f) + rank = config["r"] + lora_alpha = config["lora_alpha"] + return cls.from_lora_tensors( + lora_model_id=get_lora_id() + if lora_model_id is None else lora_model_id, + rank=rank, + lora_alpha=lora_alpha, + tensors=tensors, + device=device, + dtype=dtype, + embeddings=embeddings, + target_embedding_padding=target_embedding_padding, + embedding_modules=embedding_modules, + embedding_padding_modules=embedding_padding_modules, + ) + + +class LoRAModelManager: + """A manager that manages multiple LoRA-fine-tuned models.""" + + def __init__( + self, + model: nn.Module, + max_num_seqs: int, + max_num_batched_tokens: int, + vocab_size: int, + lora_config: LoRAConfig, + ): + """Create a LoRAModelManager and adapter for a given model. + + Args: + model: the model to be adapted. + max_num_seqs: the maximum number of sequences model can run in a + single batch. + max_num_batched_tokens: the maximum number of tokens model can run + in a single batch. + vocab_size: the vocab size of the model. + lora_config: the LoRA configuration. 
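+
+        Note that lora_config.max_cpu_loras (the CPU cache capacity) must
+        be at least lora_config.max_loras (the number of GPU slots); the
+        constructor asserts this.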
+ """ + self.lora_config = lora_config + self.max_num_seqs = max_num_seqs + assert self.capacity >= self.lora_slots + self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8 + self.lora_index_to_id: List[Optional[int]] = [None] * self.lora_slots + self.vocab_size = vocab_size + self.base_indices = torch.empty(self.max_num_batched_tokens, + dtype=torch.long, + device="cuda") + self.sampler_indices = torch.empty(self.max_num_batched_tokens, + dtype=torch.long, + device="cuda") + self.sampler_indices_padded = torch.empty(self.max_num_batched_tokens, + dtype=torch.long, + device="cuda") + self.embeddings_indices = torch.empty(2, + self.max_num_batched_tokens, + dtype=torch.long, + device="cuda") + self.offsets = [] + # 4 is the number of indicies tensors defined above + # base_indices, sampler_indices, sampler_indices_padded, + # embeddings_indices + self.indices_len = [None] * 4 + + self.model: nn.Module = model + if hasattr(self.model, "supported_lora_modules"): + self.supported_lora_modules = copy.deepcopy( + self.model.supported_lora_modules) + self.packed_modules_mapping = copy.deepcopy( + self.model.packed_modules_mapping) + self.packed_modules: Dict[str, List[str]] = {} + self.modules: Dict[str, "BaseLayerWithLoRA"] = {} + self._registered_loras: Dict[int, LoRAModel] = {} + # Dict instead of a Set for compatibility with LRUCache. + self._active_loras: Dict[int, None] = {} + self._last_mapping = None + self._create_lora_modules() + self.model.lora_manager = self + + @property + def capacity(self) -> int: + return self.lora_config.max_cpu_loras + + @property + def lora_slots(self) -> int: + return self.lora_config.max_loras + + def __len__(self) -> int: + return len(self._registered_loras) + + def activate_lora( + self, + lora_id: int, + ) -> bool: + """Move LoRA into a GPU buffer to be used in the forward pass.""" + if lora_id in self._active_loras: + return False + first_free_slot = next( + ((i, lora_id) for i, lora_id in enumerate(self.lora_index_to_id) + if lora_id is None), None) + if first_free_slot is None: + raise ValueError("No free lora slots") + index, _ = first_free_slot + self._active_loras[lora_id] = None + lora_model = self._registered_loras[lora_id] + logger.debug( + f"Activating LoRA. 
int id: {lora_model.id}, slot index: {index}") + self.lora_index_to_id[index] = lora_model.id + for module_name, module in self.modules.items(): + module_lora = lora_model.get_lora(module_name) + if module_lora: + module_lora.optimize() + module.set_lora(index, module_lora.lora_a, module_lora.lora_b, + module_lora.embeddings_tensor) + else: + module.reset_lora(index) + return True + + def _deactivate_lora(self, lora_id: int): + try: + index = self.lora_index_to_id.index(lora_id) + self.lora_index_to_id[index] = None + except ValueError: + pass + + def deactivate_lora(self, lora_id: int) -> bool: + """Remove a LoRA from a GPU buffer.""" + if lora_id in self._active_loras: + self._deactivate_lora(lora_id) + self._active_loras.pop(lora_id) + return True + return False + + def _add_lora(self, lora: LoRAModel) -> bool: + self._create_merged_loras_inplace(lora) + self._registered_loras[lora.id] = lora + + def add_lora(self, lora: LoRAModel) -> bool: + """Add a LoRAModel to the manager CPU cache.""" + if lora.id not in self._registered_loras: + if len(self._registered_loras) >= self.capacity: + raise RuntimeError("No free LoRA slots.") + self._add_lora(lora) + return True + return False + + def remove_lora(self, lora_id: int) -> bool: + """Remove a LoRAModel from the manager CPU cache.""" + # TODO: should we check active lora? + self.deactivate_lora(lora_id) + return bool(self._registered_loras.pop(lora_id, None)) + + # TODO see if this can be vectorized + def _set_lora_mapping(self, mapping: LoRAMapping) -> None: + (base_indices, sampler_indices, sampler_indices_padded, + embeddings_indices, + indices_len) = convert_mapping(mapping, self.lora_index_to_id, + self.lora_slots + 1, self.vocab_size, + self.lora_config.lora_extra_vocab_size) + self.base_indices[:base_indices.shape[0]].copy_(base_indices) + self.sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices) + self.sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_( + sampler_indices_padded) + self.embeddings_indices[:embeddings_indices. 
+ shape[0], :embeddings_indices.shape[1]].copy_( + embeddings_indices) + # Maintain the reference + self.indices_len[:] = indices_len + + def set_lora_mapping(self, lora_mapping: LoRAMapping) -> None: + if self._last_mapping != lora_mapping: + self._set_lora_mapping(lora_mapping) + self._last_mapping = lora_mapping + + def list_loras(self) -> Dict[int, LoRAModel]: + """List all registered LoRAModels.""" + return dict(self._registered_loras) + + def get_lora(self, lora_id: int) -> Optional[LoRAModel]: + return self._registered_loras.get(lora_id, None) + + def remove_all_loras(self) -> bool: + """Remove all LoRAModels from the manager.""" + self._registered_loras.clear() + self.lora_index_to_id = [None] * self.lora_slots + self._active_loras.clear() + + def _create_lora_modules(self): + for module_name, module in self.model.named_modules(): + if not self._match_target_modules(module_name): + continue + + new_module = replace_submodule( + self.model, module_name, + from_layer(module, self.lora_slots, self.lora_config, + self.model.config)) + # (yard1): TODO make this more robust + if "lm_head" in module_name: + sampler_module = self.model.get_submodule("sampler") + new_module = replace_submodule( + self.model, "sampler", + from_layer_sampler(sampler_module, module, self.lora_slots, + self.lora_config, self.model.config)) + self.register_module(module_name, new_module) + self._register_packed_modules(module_name) + new_module.set_mapping(self.base_indices, self.sampler_indices, + self.sampler_indices_padded, + self.embeddings_indices, self.indices_len) + + def register_module(self, module_name: str, module: "BaseLayerWithLoRA"): + assert isinstance(module, BaseLayerWithLoRA) + self.modules[module_name] = module + + def create_dummy_lora( + self, + lora_id: int, + rank: int, + embedding_modules: Optional[Dict[str, str]] = None) -> LoRAModel: + """Create zero-initialized LoRAModel for warmup.""" + model = LoRAModel(lora_id, rank, {}) + for module_name, module in self.model.named_modules(): + if not self._match_target_modules(module_name) or not isinstance( + module, BaseLayerWithLoRA): + continue + parts = module_name.split(".") + if module_name not in self.packed_modules: + if parts[-1] in embedding_modules: + input_dim = (module.base_layer.org_vocab_size + + self.lora_config.lora_extra_vocab_size if + hasattr(module.base_layer, "org_vocab_size") + else module.base_layer.weight.shape[1]) + output_dim = module.base_layer.embedding_dim if hasattr( + module.base_layer, + "embedding_dim") else module.base_layer.weight.shape[0] + embeddings_tensor_dim = (module.base_layer.embedding_dim if + hasattr(module.base_layer, + "embedding_dim") else + module.base_layer.weight.shape[1]) + lora = LoRALayerWeights.create_dummy_lora_weights( + module_name, + input_dim, + output_dim, + rank, + module.lora_a_stacked.dtype, + "cpu", + embeddings_tensor_dim=embeddings_tensor_dim) + else: + lora = LoRALayerWeights.create_dummy_lora_weights( + module_name, + module.lora_a_stacked.shape[-1], + module.lora_b_stacked.shape[-2], + rank, + module.lora_a_stacked.dtype, + "cpu", + ) + lora.optimize() + else: + parts = module_name.split(".") + replacements = self.packed_modules_mapping[parts[-1]] + subloras = [] + for i, r in enumerate(replacements): + lora = LoRALayerWeights.create_dummy_lora_weights( + module_name + "." 
+ r, + module.lora_a_stacked[i].shape[-1], + module.lora_b_stacked[i].shape[-2], + rank, + module.lora_a_stacked[i].dtype, + "cpu", + ) + lora.optimize() + subloras.append(lora) + lora = PackedLoRALayerWeights.pack(subloras) + model.loras[module_name] = lora + return model + + def _match_target_modules(self, module_name: str): + return any( + re.match( + r".*\.{target_module}$".format(target_module=target_module), + module_name) or target_module == module_name + for target_module in self.supported_lora_modules) + + def _register_packed_modules(self, module_full_name: str) -> None: + parts = module_full_name.split(".") + module_name = parts[-1] + replacements = self.packed_modules_mapping.get(module_name) + if not replacements: + return + prefix = ".".join(parts[:-1]) + self.packed_modules[module_full_name] = [ + prefix + "." + r if prefix else r for r in replacements + ] + + def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None: + for module_name, new_module_names in self.packed_modules.items(): + replacement_loras = [] + has_replacement = False + for r in new_module_names: + lora = lora_model.get_lora(r) + replacement_loras.append(lora) + if lora: + has_replacement = True + if not has_replacement: + continue + for i in range(len(replacement_loras)): + if replacement_loras[i]: + continue + replacement_loras[i] = None + lora_model.loras[module_name] = PackedLoRALayerWeights.pack( + replacement_loras) + + +class LoRALRUCache(LRUCache): + + def __init__(self, capacity: int, deactivate_lora_fn: Callable[[Hashable], + None]): + super().__init__(capacity) + self.deactivate_lora_fn = deactivate_lora_fn + + def _on_remove(self, key: Hashable, value: Any): + logger.debug(f"Removing LoRA. int id: {key}") + self.deactivate_lora_fn(key) + return super()._on_remove(key, value) + + +class LRUCacheLoRAModelManager(LoRAModelManager): + """A model manager that manages multiple LoRAs with LRU cache.""" + + def __init__( + self, + model: nn.Module, + max_num_seqs: int, + max_num_batched_tokens: int, + vocab_size: int, + lora_config: LoRAConfig, + ): + super().__init__(model, max_num_seqs, max_num_batched_tokens, + vocab_size, lora_config) + self._registered_loras: LoRALRUCache = LoRALRUCache( + self.capacity, self.deactivate_lora) + self._active_loras: LoRALRUCache = LoRALRUCache( + self.lora_slots, self._deactivate_lora) + + def list_loras(self) -> Dict[int, LoRAModel]: + """List all registered LoRAModels.""" + return dict(self._registered_loras.cache) + + def add_lora(self, lora: LoRAModel) -> bool: + """Add a LoRAModel to the manager.""" + if lora.id not in self._registered_loras: + self._add_lora(lora) + was_added = True + else: + # We always touch to update the LRU cache order + self._registered_loras.touch(lora.id) + was_added = False + return was_added + + def activate_lora( + self, + lora_id: int, + ) -> bool: + if lora_id not in self._active_loras and len( + self._active_loras) >= self.lora_slots: + self._active_loras.remove_oldest() + result = super().activate_lora(lora_id) + # We always touch to update the LRU cache order + self._active_loras.touch(lora_id) + return result + + def remove_oldest_lora(self) -> bool: + if len(self._registered_loras) > 0: + self._registered_loras.remove_oldest() + return True + return False + + +def create_lora_manager( + model: nn.Module, + max_num_seqs: int, + max_num_batched_tokens: int, + vocab_size: int, + lora_config: LoRAConfig, + lora_manager_cls: Type[LoRAModelManager] = LoRAModelManager, + **kwargs) -> LoRAModelManager: + """Create a LoRA 
adapter for a given model.""" + if not hasattr(model, "supported_lora_modules"): + raise ValueError(f"Model {type(model)} is not supported for LoRA.") + lora_manager = lora_manager_cls( + model=model, + max_num_seqs=max_num_seqs, + max_num_batched_tokens=max_num_batched_tokens, + vocab_size=vocab_size, + lora_config=lora_config, + **kwargs) + return lora_manager diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py new file mode 100644 index 0000000..fc74269 --- /dev/null +++ b/vllm/lora/punica.py @@ -0,0 +1,170 @@ +# Based on code from https://github.com/punica-ai/punica + +from typing import Optional + +import torch + + +def _raise_import_error(e): + if torch.cuda.get_device_capability() < (8, 0): + raise ImportError( + "punica LoRA kernels require compute capability >= 8.0") from e + else: + raise ImportError( + "punica LoRA kernels could not be imported. If you built vLLM " + "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " + "was set.") from e + + +def bgmv( + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, +): + """ + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + + Args: + y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. + x: Shape: `[B, H1]`. Input vectors. + w_t_all: Shape: `[None, L, H2, H1]`. All of the transposed weight + matrices. + indicies: Shape: `[B]`. Indices of the weight matrices. + layer_idx: Layer index of the weight matrices. + scale: Scaling factor. + """ + try: + import vllm._punica_C as punica_kernels + except ImportError as e: + _raise_import_error(e) + + punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale) + + +def add_lora(y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, + *, + buffer: Optional[torch.Tensor] = None): + """ + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + + Args: + y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. + x: Shape: `[B, H1]`. Input vectors. + wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed + LoRA A matrices. + wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed + LoRA B matrices. + indicies: Shape: `[B]`. Indices of the LoRA weights. + layer_idx: Layer index of LoRA weights. + scale: Scaling factor. + buffer: Optional. Shape: `[B, R]`. Temporary buffer. + """ + try: + import vllm._punica_C as punica_kernels + except ImportError as e: + _raise_import_error(e) + + r = wb_t_all.size(-1) + if buffer is None: + # We set the buffer to be float32 by default to avoid + # numerical inaccuracies that would otherwise happen + # due to downcasting. + buffer = torch.zeros((x.size(0), r), + dtype=torch.float32, + device=x.device) + punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) + punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, + scale) + + +def add_lora_slice(y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, + y_offset: int, + y_slice_size: int, + *, + buffer: Optional[torch.Tensor] = None): + """ + Same as `add_lora` but you can operate on slices of y. + Pass whole y, define y_offset and y_slice_size. 
+
+    Semantics:
+        y[i] += (
+            x[i].unsqueeze(0)
+            @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
+            @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
+            * scale
+        ).squeeze(0)
+
+    Args:
+        y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
+        x: Shape: `[B, H1]`. Input vectors.
+        wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed
+            LoRA A matrices.
+        wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed
+            LoRA B matrices.
+        indicies: Shape: `[B]`. Indices of the LoRA weights.
+        layer_idx: Layer index of LoRA weights.
+        scale: Scaling factor.
+        y_offset: Offset to apply to the starting column of y.
+        y_slice_size: Size of the y column slice.
+        buffer: Optional. Shape: `[B, R]`. Temporary buffer.
+    """
+    try:
+        import vllm._punica_C as punica_kernels
+    except ImportError as e:
+        _raise_import_error(e)
+
+    r = wb_t_all.size(-1)
+    if buffer is None:
+        # We set the buffer to be float32 by default to avoid
+        # numerical inaccuracies that would otherwise happen
+        # due to downcasting.
+        buffer = torch.zeros((x.size(0), r),
+                             dtype=torch.float32,
+                             device=x.device)
+    punica_kernels.dispatch_bgmv_low_level(
+        buffer,
+        x,
+        wa_t_all,
+        indicies,
+        layer_idx,
+        1.0,
+        x.size(1),
+        buffer.size(1),
+        0,
+    )
+    punica_kernels.dispatch_bgmv_low_level(
+        y,
+        buffer,
+        wb_t_all,
+        indicies,
+        layer_idx,
+        scale,
+        buffer.size(1),
+        y_slice_size,
+        y_offset,
+    )
diff --git a/vllm/lora/request.py b/vllm/lora/request.py
new file mode 100644
index 0000000..bbbf488
--- /dev/null
+++ b/vllm/lora/request.py
@@ -0,0 +1,32 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class LoRARequest:
+    """
+    Request for a LoRA adapter.
+
+    Note that this class should only be used internally. For online
+    serving, it is recommended to not allow users to use this class
+    directly, but instead to provide another layer of abstraction
+    that prevents users from accessing unauthorized LoRA adapters.
+
+    lora_int_id must be globally unique for a given adapter.
+    This is currently not enforced in vLLM.
+    """
+
+    lora_name: str
+    lora_int_id: int
+    lora_local_path: str
+
+    def __post_init__(self):
+        if self.lora_int_id < 1:
+            raise ValueError(
+                f"lora_int_id must be > 0, got {self.lora_int_id}")
+
+    def __eq__(self, value: object) -> bool:
+        return isinstance(
+            value, LoRARequest) and self.lora_int_id == value.lora_int_id
+
+    def __hash__(self) -> int:
+        return self.lora_int_id
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
new file mode 100644
index 0000000..f67a381
--- /dev/null
+++ b/vllm/lora/utils.py
@@ -0,0 +1,39 @@
+import logging
+from typing import Tuple
+
+from torch import nn
+
+logger = logging.getLogger(__name__)
+
+
+def replace_submodule(model: nn.Module, module_name: str,
+                      new_module: nn.Module) -> nn.Module:
+    """Replace a submodule in a model with a new module."""
+    parent = model.get_submodule(".".join(module_name.split(".")[:-1]))
+    target_name = module_name.split(".")[-1]
+    setattr(parent, target_name, new_module)
+    return new_module
+
+
+def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool]:
+    """Parse the name of lora weights.
+
+    Args:
+        name: The name of the fine-tuned LoRA, e.g.
+            base_model.model.dense1.weight
+    Returns:
+        Tuple(module_name, is_lora_a):
+            module_name: The name of the module, e.g. model.dense1.
+            is_lora_a: Whether the tensor is lora_a or lora_b.
+    """
+    parts = name.split(".")
+    assert parts[0] == "base_model"
+    assert parts[1] == "model"
+    if parts[-1] == "weight":
+        assert parts[-2] == "lora_A" or parts[-2] == "lora_B"
+        return ".".join(parts[2:-2]), parts[-2] == "lora_A"
+
+    if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B":
+        return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A"
+
+    raise ValueError(f"{name} is not in a supported LoRA weight format")
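For reference, the Semantics blocks documented for add_lora and add_lora_slice in vllm/lora/punica.py above can be checked against a plain batched-PyTorch sketch. This is an illustrative reference only, not the kernel interface; the shape names (B, H1, H2, R) follow the docstrings, and add_lora_reference is a hypothetical helper name.

import torch

def add_lora_reference(y: torch.Tensor, x: torch.Tensor,
                       wa_t_all: torch.Tensor, wb_t_all: torch.Tensor,
                       indicies: torch.LongTensor, layer_idx: int,
                       scale: float) -> None:
    # Gather each token's LoRA A/B weights for this layer:
    # wa: [B, R, H1], wb: [B, H2, R].
    wa = wa_t_all[indicies, layer_idx].float()
    wb = wb_t_all[indicies, layer_idx].float()
    # Shrink then expand, accumulating in float32 like the kernels' buffer.
    buffer = torch.bmm(x.unsqueeze(1).float(), wa.transpose(-1, -2))  # [B, 1, R]
    delta = torch.bmm(buffer, wb.transpose(-1, -2)) * scale           # [B, 1, H2]
    y += delta.squeeze(1).to(y.dtype)  # same in-place update as dispatch_bgmv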
+ """ + parts = name.split(".") + assert parts[0] == "base_model" + assert parts[1] == "model" + if parts[-1] == "weight": + assert parts[-2] == "lora_A" or parts[-2] == "lora_B" + return ".".join(parts[2:-2]), parts[-2] == "lora_A" + + if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B": + return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A" + + raise ValueError(f"{name} is unsupported format") diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py new file mode 100644 index 0000000..7e92bc9 --- /dev/null +++ b/vllm/lora/worker_manager.py @@ -0,0 +1,238 @@ +import logging +from abc import ABC, abstractmethod, abstractproperty +from typing import Any, Dict, List, Optional, Set, Type + +import torch + +from vllm.lora.models import (LoRAModel, LoRAModelManager, + LRUCacheLoRAModelManager, create_lora_manager) +from vllm.lora.request import LoRARequest +from vllm.lora.layers import LoRAMapping +from vllm.config import LoRAConfig + +logger = logging.getLogger(__name__) + + +class AbstractWorkerLoRAManager(ABC): + """Abstract class for managing LoRA models on the worker side.""" + + def __init__(self, max_num_seqs: int, max_num_batched_tokens: int, + vocab_size: int, lora_config: LoRAConfig, + device: torch.device): + self.max_num_seqs = max_num_seqs + self.max_num_batched_tokens = max_num_batched_tokens + self.vocab_size = vocab_size + self.device = device + self.lora_config = lora_config + + @abstractproperty + def is_enabled(self) -> bool: + ... + + @abstractmethod + def create_lora_manager( + self, + model: torch.nn.Module, + ) -> Any: + ... + + @abstractmethod + def set_active_loras(self, lora_requests: List[LoRARequest], + lora_mapping: LoRAMapping) -> None: + ... + + @abstractmethod + def add_lora(self, lora_request: LoRARequest) -> bool: + ... + + @abstractmethod + def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool: + ... + + @abstractmethod + def remove_lora(self, lora_id: int) -> bool: + ... + + @abstractmethod + def remove_all_loras(self) -> bool: + ... + + @abstractmethod + def list_loras(self) -> Set[int]: + ... + + +class WorkerLoRAManager(AbstractWorkerLoRAManager): + """WorkerLoRAManager that manages LoRA models on the worker side. 
+ + Every request, the requested LoRAs will be loaded (unless they are already + loaded), and every other LoRA will be unloaded.""" + + _lora_manager_cls: Type[LoRAModelManager] = LoRAModelManager + + def __init__( + self, + max_num_seqs: int, + max_num_batched_tokens: int, + vocab_size: int, + lora_config: LoRAConfig, + device: torch.device, + embedding_modules: Dict[str, str], + embedding_padding_modules: List[str], + lora_model_cls: Type[LoRAModel] = LoRAModel, + ): + self._lora_manager: Optional[LoRAModelManager] = None + self._lora_model_cls = lora_model_cls + self.embedding_modules = embedding_modules + self.embedding_padding_modules = embedding_padding_modules + super().__init__(max_num_seqs, max_num_batched_tokens, vocab_size, + lora_config, device) + + @property + def is_enabled(self) -> bool: + return True + + def create_lora_manager( + self, + model: torch.nn.Module, + ) -> Any: + lora_manager = create_lora_manager( + model, + max_num_seqs=self.max_num_seqs, + max_num_batched_tokens=self.max_num_batched_tokens, + vocab_size=self.vocab_size, + lora_config=self.lora_config, + lora_manager_cls=self._lora_manager_cls, + ) + self._lora_manager: LoRAModelManager = lora_manager + return lora_manager.model + + def set_active_loras(self, lora_requests: List[LoRARequest], + lora_mapping: LoRAMapping) -> None: + self._apply_loras(lora_requests) + self._lora_manager.set_lora_mapping(lora_mapping) + + def _apply_loras(self, lora_requests: List[LoRARequest]) -> None: + loras_that_exist = self.list_loras() + loras_map = { + lora_request.lora_int_id: lora_request + for lora_request in lora_requests if lora_request + } + if len(loras_map) > self._lora_manager.lora_slots: + raise RuntimeError( + f"Number of requested LoRAs ({len(loras_map)}) is greater " + "than the number of GPU LoRA slots " + f"({self._lora_manager.lora_slots}).") + + new_loras = set(loras_map) + loras_to_add = new_loras - loras_that_exist + loras_to_remove = loras_that_exist - new_loras + + for lora_id in loras_to_remove: + self.remove_lora(lora_id) + + for lora_id in loras_to_add: + self.add_lora(loras_map[lora_id]) + + def _load_lora(self, lora_request: LoRARequest) -> LoRAModel: + try: + lora = self._lora_model_cls.from_local_checkpoint( + lora_request.lora_local_path, + lora_model_id=lora_request.lora_int_id, + device="cpu", + dtype=self.lora_config.lora_dtype, + target_embedding_padding=self.vocab_size + + self.lora_config.lora_extra_vocab_size, + embedding_modules=self.embedding_modules, + embedding_padding_modules=self.embedding_padding_modules, + ) + except Exception as e: + raise RuntimeError( + f"Loading lora {lora_request.lora_local_path} failed") from e + if lora.rank > self.lora_config.max_lora_rank: + raise ValueError( + f"LoRA rank {lora.rank} is greater than max_lora_rank " + f"{self.lora_config.max_lora_rank}.") + if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size: + raise ValueError( + f"LoRA added vocab size {lora.extra_vocab_size} is greater than " + f"lora_extra_vocab_size {self.lora_config.lora_extra_vocab_size}." 
+            )
+        return lora
+
+    def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool:
+        if lora_request.lora_int_id in self.list_loras():
+            return False
+        return self._lora_manager.add_lora(
+            self._lora_manager.create_dummy_lora(lora_request.lora_int_id,
+                                                 rank, self.embedding_modules))
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        if lora_request.lora_int_id in self.list_loras():
+            return False
+        lora = self._load_lora(lora_request)
+        loaded = self._lora_manager.add_lora(lora)
+        self._lora_manager.activate_lora(lora.id)
+        return loaded
+
+    def remove_lora(self, lora_id: int) -> bool:
+        return self._lora_manager.remove_lora(lora_id)
+
+    def remove_all_loras(self) -> bool:
+        self._lora_manager.remove_all_loras()
+
+    def list_loras(self) -> Set[int]:
+        return set(self._lora_manager.list_loras())
+
+
+class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
+    """WorkerLoRAManager that manages LoRA models on the worker side.
+
+    Uses an LRU Cache. Every request, the requested LoRAs will be loaded
+    (unless they are already loaded) and least recently used LoRAs will
+    be unloaded if the cache is above capacity."""
+
+    _lora_manager_cls: Type[
+        LRUCacheLoRAModelManager] = LRUCacheLoRAModelManager
+
+    def create_lora_manager(
+        self,
+        model: torch.nn.Module,
+    ) -> Any:
+        lora_manager = create_lora_manager(
+            model,
+            lora_manager_cls=self._lora_manager_cls,
+            max_num_seqs=self.max_num_seqs,
+            vocab_size=self.vocab_size,
+            lora_config=self.lora_config,
+            max_num_batched_tokens=self.max_num_batched_tokens,
+        )
+        self._lora_manager: LRUCacheLoRAModelManager = lora_manager
+        return lora_manager.model
+
+    def _apply_loras(self, lora_requests: List[LoRARequest]) -> None:
+        loras_map = {
+            lora_request.lora_int_id: lora_request
+            for lora_request in lora_requests if lora_request
+        }
+        if len(loras_map) > self._lora_manager.lora_slots:
+            raise RuntimeError(
+                f"Number of requested LoRAs ({len(loras_map)}) is greater "
+                "than the number of GPU LoRA slots "
+                f"({self._lora_manager.lora_slots}).")
+        for lora in loras_map.values():
+            self.add_lora(lora)
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        if lora_request.lora_int_id not in self.list_loras():
+            # Remove before we load the new lora to save memory
+            if len(self._lora_manager) + 1 > self._lora_manager.capacity:
+                self._lora_manager.remove_oldest_lora()
+            lora = self._load_lora(lora_request)
+            loaded = self._lora_manager.add_lora(lora)
+        else:
+            # If the lora is already loaded, just touch it to
+            # update its position in the caches
+            loaded = self._lora_manager.get_lora(lora_request.lora_int_id)
+        self._lora_manager.activate_lora(lora_request.lora_int_id)
+        return loaded
diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py
new file mode 100644
index 0000000..17a33ee
--- /dev/null
+++ b/vllm/model_executor/__init__.py
@@ -0,0 +1,10 @@
+from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.model_executor.utils import set_random_seed, get_model
+
+__all__ = [
+    "InputMetadata",
+    "get_model",
+    "SamplingMetadata",
+    "set_random_seed",
+]
\ No newline at end of file
diff --git a/vllm/model_executor/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..623d3f56eb5ef9b5fd44e200cb085b498f029269
Binary files /dev/null and b/vllm/model_executor/__pycache__/__init__.cpython-310.pyc differ
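The requested-versus-loaded bookkeeping in WorkerLoRAManager._apply_loras above is plain set arithmetic; a minimal sketch with made-up adapter ids:

loaded_ids = {1, 2, 3}                          # what list_loras() reports
requested_ids = {2, 3, 4}                       # lora_int_ids in the batch
loras_to_remove = loaded_ids - requested_ids    # {1}  -> remove_lora(1)
loras_to_add = requested_ids - loaded_ids       # {4}  -> add_lora(request 4)
assert (loaded_ids - loras_to_remove) | loras_to_add == requested_ids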
diff --git a/vllm/model_executor/__pycache__/guided_decoding.cpython-310.pyc b/vllm/model_executor/__pycache__/guided_decoding.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..90dc29c674fce82ebb74ee3d68edbc8ca9bf7730
Binary files /dev/null and b/vllm/model_executor/__pycache__/guided_decoding.cpython-310.pyc differ
diff --git a/vllm/model_executor/__pycache__/input_metadata.cpython-310.pyc b/vllm/model_executor/__pycache__/input_metadata.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..743d900c6b6762bf12d3d6613fb2d0b37795f545
Binary files /dev/null and b/vllm/model_executor/__pycache__/input_metadata.cpython-310.pyc differ
diff --git a/vllm/model_executor/__pycache__/model_loader.cpython-310.pyc b/vllm/model_executor/__pycache__/model_loader.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..67564c02daae889773d766abba0ad8bcf56336d5
Binary files /dev/null and b/vllm/model_executor/__pycache__/model_loader.cpython-310.pyc differ
diff --git a/vllm/model_executor/__pycache__/neuron_model_loader.cpython-310.pyc b/vllm/model_executor/__pycache__/neuron_model_loader.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ad4f3c6c252fe18ec4f03bddffb82ce4d4486f41
Binary files /dev/null and b/vllm/model_executor/__pycache__/neuron_model_loader.cpython-310.pyc differ
diff --git a/vllm/model_executor/__pycache__/sampling_metadata.cpython-310.pyc b/vllm/model_executor/__pycache__/sampling_metadata.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f31fa6a126e3d1a7fc1cb6e2e53f7376de321286
Binary files /dev/null and b/vllm/model_executor/__pycache__/sampling_metadata.cpython-310.pyc differ
diff --git a/vllm/model_executor/guided_decoding.py b/vllm/model_executor/guided_decoding.py
new file mode 100644
index 0000000..a8573f8
--- /dev/null
+++ b/vllm/model_executor/guided_decoding.py
@@ -0,0 +1,99 @@
+import asyncio
+import concurrent.futures
+from copy import copy
+from enum import Enum
+from functools import lru_cache
+from json import dumps as json_dumps
+from re import escape as regex_escape
+from typing import Union, Tuple
+from pydantic import BaseModel
+
+from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest
+from vllm.model_executor.guided_logits_processors import JSONLogitsProcessor, RegexLogitsProcessor
+
+
+class GuidedDecodingMode(Enum):
+    JSON = "json"
+    REGEX = "regex"
+    CHOICE = "choice"
+
+
+global_thread_pool = None  # used for generating logits processor fsm
+
+
+async def get_guided_decoding_logits_processor(
+        request: Union[CompletionRequest, ChatCompletionRequest],
+        tokenizer) -> Union[JSONLogitsProcessor, RegexLogitsProcessor]:
+    """
+    Given an OpenAI-compatible request, check for guided decoding parameters
+    and get the necessary logits processor for the given guide.
+    We cache logit processors by (guide, tokenizer), and on cache hit
+    we make a shallow copy to reuse the same underlying FSM.
+    """
+    global global_thread_pool
+    guide, mode = _get_guide_and_mode(request)
+    if not guide:
+        return None
+
+    if global_thread_pool is None:
+        global_thread_pool = concurrent.futures.ThreadPoolExecutor(
+            max_workers=2)
+    loop = asyncio.get_running_loop()
+
+    result = await loop.run_in_executor(global_thread_pool,
+                                        _get_cached_logits_processor, guide,
+                                        tokenizer, mode)
+
+    logits_processor = copy(result)
+    # reset logits processor's internal state
+    logits_processor.init_state()
+    return logits_processor
+
+
+def _get_guide_and_mode(
+    request: Union[CompletionRequest, ChatCompletionRequest]
+) -> Tuple[str, GuidedDecodingMode]:
+
+    if request.guided_json:
+        if not isinstance(request.guided_json, (str, dict, BaseModel)):
+            raise TypeError("JSON schema must be str, dict, or BaseModel")
+
+        json = request.guided_json
+        if isinstance(json, dict):
+            # turn dict into hashable string
+            json = json_dumps(json, sort_keys=True)
+        elif isinstance(json, BaseModel):
+            # use pydantic signature so that different model classes
+            # with the same fields will get hashed the same
+            json = str(json.__signature__)
+        return json, GuidedDecodingMode.JSON
+
+    elif request.guided_regex:
+        if not isinstance(request.guided_regex, str):
+            raise TypeError("Regex must be string")
+        return request.guided_regex, GuidedDecodingMode.REGEX
+
+    elif request.guided_choice:
+        if not isinstance(request.guided_choice, list):
+            raise TypeError("Choices must be a list")
+
+        # choice just uses regex
+        choices = [
+            regex_escape(str(choice)) for choice in request.guided_choice
+        ]
+        choices_regex = "(" + "|".join(choices) + ")"
+        return choices_regex, GuidedDecodingMode.CHOICE
+
+    else:
+        return None, None
+
+
+@lru_cache(maxsize=32)
+def _get_cached_logits_processor(guide: str, tokenizer,
+                                 mode: GuidedDecodingMode):
+    if mode == GuidedDecodingMode.JSON:
+        return JSONLogitsProcessor(guide, tokenizer)
+    elif mode == GuidedDecodingMode.REGEX or mode == GuidedDecodingMode.CHOICE:
+        return RegexLogitsProcessor(guide, tokenizer)
+    else:
+        raise ValueError(f"Unknown guided decoding mode {mode}")
+ """ + global global_thread_pool + guide, mode = _get_guide_and_mode(request) + if not guide: + return None + + if global_thread_pool is None: + global_thread_pool = concurrent.futures.ThreadPoolExecutor( + max_workers=2) + loop = asyncio.get_running_loop() + + result = await loop.run_in_executor(global_thread_pool, + _get_cached_logits_processor, guide, + tokenizer, mode) + + logits_processor = copy(result) + # reset logits processor's internal state + logits_processor.init_state() + return logits_processor + + +def _get_guide_and_mode( + request: Union[CompletionRequest, ChatCompletionRequest] +) -> Tuple[str, GuidedDecodingMode]: + + if request.guided_json: + if not isinstance(request.guided_json, (str, dict, BaseModel)): + raise TypeError("JSON schema must be str, dict, or BaseModel") + + json = request.guided_json + if isinstance(json, dict): + # turn dict into hashable string + json = json_dumps(json, sort_keys=True) + elif isinstance(json, BaseModel): + # use pydantic signature so that different model classes + # with the same fields will get hashed the same + json = str(json.__signature__) + return json, GuidedDecodingMode.JSON + + elif request.guided_regex: + if not isinstance(request.guided_regex, str): + raise TypeError("Regex must be string") + return request.guided_regex, GuidedDecodingMode.REGEX + + elif request.guided_choice: + if not isinstance(request.guided_choice, list): + raise TypeError("Choices must be a list") + + # choice just uses regex + choices = [ + regex_escape(str(choice)) for choice in request.guided_choice + ] + choices_regex = "(" + "|".join(choices) + ")" + return choices_regex, GuidedDecodingMode.CHOICE + + else: + return None, None + + +@lru_cache(maxsize=32) +def _get_cached_logits_processor(guide: str, tokenizer, + mode: GuidedDecodingMode): + if mode == GuidedDecodingMode.JSON: + return JSONLogitsProcessor(guide, tokenizer) + elif mode == GuidedDecodingMode.REGEX or mode == GuidedDecodingMode.CHOICE: + return RegexLogitsProcessor(guide, tokenizer) + else: + raise ValueError(f"Unknown guided decoding mode {mode}") diff --git a/vllm/model_executor/guided_logits_processors.py b/vllm/model_executor/guided_logits_processors.py new file mode 100644 index 0000000..1b3e5e7 --- /dev/null +++ b/vllm/model_executor/guided_logits_processors.py @@ -0,0 +1,129 @@ +# Copyright 2024- the Outlines developers +# This file is adapted from +# https://github.com/outlines-dev/outlines/blob/main/outlines/serve/vllm.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json +import math +from collections import defaultdict +from typing import Union, DefaultDict, Dict, List, Optional + +import torch +from pydantic import BaseModel +from outlines.fsm.fsm import RegexFSM +from outlines.fsm.json_schema import build_regex_from_schema + + +class RegexLogitsProcessor: + + def __init__(self, regex_string: str, tokenizer): + """Compile the FSM that drives the regex-structured generation. 
+
+        Parameters
+        ----------
+        regex_string
+            A string that represents a regular expression
+        tokenizer
+            The model's tokenizer
+
+        """
+        tokenizer = self.adapt_tokenizer(tokenizer)
+        fsm = RegexFSM(regex_string, tokenizer)
+        self.fsm = fsm
+
+    def init_state(self):
+        """Initialize the FSM states."""
+        self.fsm_state: DefaultDict[int, int] = defaultdict(int)
+
+    def __call__(self, input_ids: List[int],
+                 scores: torch.Tensor) -> torch.Tensor:
+        """Use the FSM to bias the logits before sampling the next token."""
+
+        seq_id = hash(tuple(input_ids))
+
+        if len(input_ids) == 0:
+            self.init_state()
+        else:
+            last_token = input_ids[-1]
+            last_seq_id = hash(tuple(input_ids[:-1]))
+            self.fsm_state[seq_id] = self.fsm.next_state(
+                self.fsm_state[last_seq_id], last_token)
+
+        allowed_tokens = self.fsm.allowed_token_ids(self.fsm_state[seq_id])
+
+        mask = torch.full((scores.shape[-1], ),
+                          -math.inf,
+                          device=scores.device)
+        mask[allowed_tokens] = 0
+        scores.add_(mask)
+
+        return scores
+
+    def adapt_tokenizer(self, tokenizer):
+        """Adapt vLLM's tokenizer for compiling the FSM.
+
+        The API of Outlines tokenizers is slightly different from that of
+        `transformers`. We also need to work around missing spaces in
+        Llama's tokenizer so that FSMs can be compiled for that model.
+        """
+        tokenizer.vocabulary = tokenizer.get_vocab()
+        tokenizer.special_tokens = set(tokenizer.all_special_tokens)
+
+        def convert_token_to_string(token: str) -> str:
+            from transformers.file_utils import SPIECE_UNDERLINE
+
+            string = tokenizer.convert_tokens_to_string([token])
+
+            # A hack to restore missing spaces in HF's Llama tokenizers
+            if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
+                return " " + string
+
+            return string
+
+        tokenizer.convert_token_to_string = convert_token_to_string
+
+        return tokenizer
+
+
+class JSONLogitsProcessor(RegexLogitsProcessor):
+
+    def __init__(self,
+                 schema: Union[str, Dict, BaseModel],
+                 tokenizer,
+                 whitespace_pattern: Optional[str] = None):
+        """Compile the FSM that drives the JSON-guided generation.
+
+        Parameters
+        ----------
+        schema
+            A JSON schema that encodes the structure we want the model to generate
+        tokenizer
+            The model's tokenizer
+        whitespace_pattern
+            Pattern to use for JSON syntactic whitespace (doesn't impact string literals)
+            Example: allow only a single space or newline with `whitespace_pattern=r"[\n ]?"`
+        """
+        if isinstance(schema, type(BaseModel)):
+            schema_str = json.dumps(schema.model_json_schema())
+        elif isinstance(schema, Dict):
+            schema_str = json.dumps(schema)
+        elif isinstance(schema, str):
+            schema_str = schema
+        else:
+            raise ValueError(
+                f"Cannot parse schema {schema}. The schema must be either "
+                "a Pydantic object, a dictionary, or a string containing "
+                "the JSON Schema specification")
+        regex_string = build_regex_from_schema(schema_str, whitespace_pattern)
+        super().__init__(regex_string, tokenizer)
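The biasing step in RegexLogitsProcessor.__call__ above boils down to adding a -inf mask everywhere outside the allowed token set; a standalone sketch over a toy 8-token vocabulary:

import math
import torch

scores = torch.zeros(8)              # toy logits for one sequence
allowed_tokens = [1, 4, 5]           # as returned by fsm.allowed_token_ids
mask = torch.full((scores.shape[-1], ), -math.inf)
mask[allowed_tokens] = 0
scores.add_(mask)                    # disallowed tokens can never be sampled
assert torch.isfinite(scores[allowed_tokens]).all()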
diff --git a/vllm/model_executor/input_metadata.py b/vllm/model_executor/input_metadata.py
new file mode 100644
index 0000000..f0a88ac
--- /dev/null
+++ b/vllm/model_executor/input_metadata.py
@@ -0,0 +1,54 @@
+from typing import Optional
+
+import torch
+
+
+class InputMetadata:
+    """Metadata for input sequences. Used in PagedAttention.
+
+    Args:
+        is_prompt: Whether the inputs are prompt (prefill) tokens.
+        prompt_lens: Lengths of prompts.
+        slot_mapping: The address to write the new KV to of each token.
+        max_context_len: The maximum context length.
+        context_lens: The length of attention context for each sequence.
+        block_tables: The block tables. (Seq id -> list of physical block)
+        kv_cache_dtype: Data type to store kv cache.
+    """
+
+    def __init__(
+        self,
+        is_prompt: bool,
+        slot_mapping: torch.Tensor,
+        prompt_lens: Optional[torch.Tensor],
+        max_seq_len: Optional[int],
+        start_loc: Optional[torch.Tensor],
+        max_context_len: Optional[int],
+        context_lens: Optional[torch.Tensor],
+        block_tables: Optional[torch.Tensor],
+        use_cuda_graph: bool,
+        kv_cache_dtype: str,
+    ) -> None:
+        self.is_prompt = is_prompt
+        self.prompt_lens = prompt_lens
+        self.max_seq_len = max_seq_len
+        self.start_loc = start_loc
+        self.max_context_len = max_context_len
+        self.slot_mapping = slot_mapping
+        self.context_lens = context_lens
+        self.block_tables = block_tables
+        self.use_cuda_graph = use_cuda_graph
+        self.kv_cache_dtype = kv_cache_dtype
+
+        # Set during the execution of the first attention op.
+        # FIXME(woosuk): This is a hack.
+        self.attn_bias = None
+
+    def __repr__(self) -> str:
+        return ("InputMetadata("
+                f"is_prompt={self.is_prompt}, "
+                f"max_context_len={self.max_context_len}, "
+                f"slot_mapping={self.slot_mapping}, "
+                f"context_lens={self.context_lens}, "
+                f"block_tables={self.block_tables}, "
+                f"use_cuda_graph={self.use_cuda_graph}, "
+                f"kv_cache_dtype={self.kv_cache_dtype})")
diff --git a/vllm/model_executor/layers/__init__.py b/vllm/model_executor/layers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/vllm/model_executor/layers/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eda52f12911301bf03f9908f5bb70058effc2e1f
Binary files /dev/null and b/vllm/model_executor/layers/__pycache__/__init__.cpython-310.pyc differ
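A hypothetical decode-step InputMetadata, only to show how the constructor arguments above fit together; the tensor values and the "auto" kv_cache_dtype are illustrative placeholders, not output of a real engine run:

import torch
from vllm.model_executor.input_metadata import InputMetadata

meta = InputMetadata(
    is_prompt=False,                              # decode, not prefill
    slot_mapping=torch.tensor([1055, 2049]),      # KV slot for each new token
    prompt_lens=None,
    max_seq_len=None,
    start_loc=None,
    max_context_len=32,
    context_lens=torch.tensor([31, 17]),
    block_tables=torch.tensor([[0, 1], [2, 3]]),  # seq -> physical blocks
    use_cuda_graph=False,
    kv_cache_dtype="auto",
)
print(meta)  # rendered via __repr__ above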
diff --git a/vllm/model_executor/layers/__pycache__/attention.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/attention.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1ae60e034ff1f31f16f8d1f95051d670b620ee3d
Binary files /dev/null and b/vllm/model_executor/layers/__pycache__/attention.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/__pycache__/layernorm.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/layernorm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b725c27fa34b17fa7da96df1ca1d2fd5ad096355
Binary files /dev/null and b/vllm/model_executor/layers/__pycache__/layernorm.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/__pycache__/rejection_sampler.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/rejection_sampler.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7f528cfb8c47154b1c9f5875a8903c014cc2c987
Binary files /dev/null and b/vllm/model_executor/layers/__pycache__/rejection_sampler.cpython-310.pyc differ

pyjspCOv->#;GR!HTI|eDlYK+1R`iO39 z#*f;v_-M}b)4t&&=YgNbpc9?%W z#0~+=_qmXHjBn@SfTg=_aOaM8edid3s?Jw8a0b^v;%{G&`=@8??%y#Y>6T4KB*=Xs zqltKeu}ZSnYc3gbMzyk3X4Fs-2#o+rXlJ*ON5cL=>KbrM$6P}1ZrNBu)Pr{EBgjE? zZ1h9q6JIm?#E4~+nNqdWfG=rOo^2ov<;MI~;8C%=xAbwM;&XVdc$?0;yx>b;Em5+= zK+!3s6M!F-#(R+BGPZNE>rk5LV~V88VS!r`B|B)g+u=ef-ST535{xFMIjz6pN^u3T z8G*k%!pc1SjD6EI;OCVku;HD?S-uJ1|Gw$2o!Dg+M`M!hK>L$?-{(k_ZBQbmYy*`( z-nmrvLNRPtpiMpd9_P(wG!;9+Ycmy_3NZw+^FO+GO6ln~!$E0%66f_7LHdTn6IV5S z%ZBgR_*sLZFdit|iXzp6GW{cdd8)6_vLn9nuCD2PagY7S9Z}s&oX}Aa<+ikWWca^> zFFk~ccO={H zDv0vyxC>x)qC5Fp?M&<{K=K^1pp2%p3oT$;)q}u=c=hK6y{E zh!-i1@rL5kL1B=8%Beg>a*bq@WMpPfqwe21v+tm=Hha4>OBPlB+3=YCSN#R9_~Q?A=Ar00F&;Le{kpI zCA_t~O!8|a9g=mDHIN;p?>z5*k>uA&lmh%luhiFvtEf8;pCDgDVeL5MS+5(Z?H2CK zFQF+=%j+EOw@AKELVT`FoYE@ErIlX>!S`l#Y4P{N43T$=)#dN-#_y6S*!l)bZ<45~ z_U0RnkIvA-s>bKchlLsUXmPHa!dy*`HQlfWZwSS6E7cr_Zx#jYI*e1(p^`lz|CQipI+2N~^_h^0?8A&%B;vtr3J$0W)G?v z?)xH#+V_TP`8Ap8zL(D-{KOu;dvnSix2H}a=F`o0wzWi5m{;GOxo?l39zT{niB>nW O<9#|ceyg7OxBnLs;>9alA_GG*yzOLDRgN-6lor07d@>=#Lit)BY=vKLzG*fT9liBWTedO>AwybM6e6 zyONA{7jt>%-gBSl+;hHj?)39{TfyJa@BHi9-!v5E7xXguH-(pTc>GmeQC!7QT-DV& zYFBf#uI}hv!!f!kC#6!k-Z34Mf74ydvAE3WWV*Ivce75G-&37jH}B-T1*gF8W@oBf zbc)?+XPV#Bol#U2$_>-py04X}Se>>Zak`?-sdc+B@!+@Z`M|y)%urs5C8i z#w~;H0e6<^CVQB3=TY-uT(iLSi<}w0S6Qb0637Xz{!(_kQ;{ zC?1X}9++rtx(~V!q3#j)VNb_Ab?^A1;uV{i=c3|15>xKgaJ{U>dLMPGX#410+dhWA zs_5e*AKHiIKJK1CACG-#A4{I*9rxyFTzC0H`?KAX?h5+*NJh`-?s@kKralEOJn0#C zt`Pk_#pP!EabB_9Q`lyn37L--wVrq+WjcTU5e*Nw9ib8H?eOl4?2%( z@Vb`ee$x2ZdmaC@+Hon>LS{bZo<;vB@7XJj_;Iu~ycu4}v&fgRzPvZvwB6_2b7;Nd z>aQx*3!BfLuXO4+JW&a*)Pss&?+!Yij~B1f^R5S#pug_*{7O^wyIk1qyI$uJztSHD zgJIw=y+^0FYDC7xRwIav7h8T1SuYQQR=-#8M5)(?={%{bJ6p}*uUR9$^ z%dhpkq3HLZ;r9RK;#dD$$aspQLd9vW>gXBGExKTsYvAA+U{%W1n&#U|Q*})@eKYTv zOksKHw&i9xwLytDqNQJmMb^zhP%{|~w9_$_Y?1eDD62jCl$V7d*{q~ePVQ~RP@Fu( zp%kU#4X9tx6h-F$AJSmk75u$+rRB#$<@RG?YW6#w{xxv#875g-s$6XOjk<7Is;;zL z*XvdMpdK(iB(W|af&HK|^gXxI>O7Ra(7@S8rUYfZ^bLb%`52u~hj4UWb9l)SdoXD}a<^m;;h8$xkg+&JPBi8|cLI zNgf`@iIAdkqf+m=72yrMphZ$&8F&~uXnDSD`ef2H@VWy})Po@whLw{fONyt)6#c=8 z0e2T5w_XXvu-B+lr;_mWQbls)?vwm2PheiWRzDWo${I;(A5e{xa{OCzb{`#3F{M;PEXawr;AHR?s%3 ziUUP5|0ut2u2IJ8u6nKuHbsSTwzk@;`;8PjO8!h_3_PE}E~Dlo{kqDRH5WW+>tEK$B8^gE z>QzO|V<1?9RO@=PcF8dz;9gybG)ma@GoZLr5 z+1i>H)MVXbDDdZySVloDYMN&0mTIZ^HRe=9-JFqU1e>_mCYny(QW1O8!Vf zXrjDfB^4W{v=w27+Q{tX18i8RZ)qbfRJJwb)zAo2JI2Ti^|tOWkobirZI6HKlge9a zXts^lwNZLx1*s4;XlVxKb~@BU3w88lc8u4JQD$V1vZGvRi%WqG%1k?pCx<7Gr+{Z_ z%fu+=b`kmMc8S37EpnBg{M)Xi?`pv#HXV2;b?dCZ0OBIczB3++R$aZ4ZN zkvqJvq!1hl^I-vV)JIdnQObo=TRLW698Cwu!YRBL!|5#pwjp)-J!}je4Bc&Y-`mAs zRTIRGDCGuZUhfl!=&f8EKyi5oju{AKAm655VAUdbOw&PY1fOwvI6ewSi;uk1v1M0Z(F@ZmUPN z{NqO{=#0jdDDC%~LAQQAN;NzEdf;Rs|FxtnEqB9<@`;#BKr>rg9kx1dTq|mF$<>lP zdwlFF*evk~C3FDAgOogkBvPfmbczXgY7&CAV}%5ZWqtchME$X3X*+6nqCwr zxkPpO*t#@4fZW=|vX2l6X+1H6WEzi8omq;ZYDMF=hIN{nc3aa5x3ygHHZYwD+x)_8T)@`XBB`s$J$f>COh3*yNA$WIkB?Rgi(6gM$qnsJiN+ z0%JiXnhVX>)^{}M0{yc12bc(a8jPve2%I_j>$&pG!795r zt^n#2)7h`;#R>HCRXl7NO9q$?HgRc$(?iY(8|ldE4WSXd4Syecpk<5|`D=Iyun1Dw zt~#$YHM~KIw{@sdeWZu_wn58;9ZZGFj=G~mvBCnz_Dzkg8|Ykp-542AhDMW?Z(J6S zqVIP#Ry^V)@^ULo=DU;H^hDo}j84C|#7~m z4sGzdj)vYzr6o!0%DjfIn+QNIr#ML{dnT_?x%OdKre%(!^(I z2s+~X*q%iJ89`?IH<5AG5txj9QQOKz<&9G?0|wdlwn|1wJn7P&8K^lnN{7G2Ni zwJ^PiJso926Lsm49ok_gv|xg)jpHzRpgfDmHGn9zjq*l1)B+TQ9xF+YNkYgW)yCw0<-Vl>yK`VJjkws>wBUij%6o)^hzw?|3~H z(Q1~rf;FI1h&vJeYoNOB%r~*~?ww``${1Z!KT=@-$?wg}Dz>gZHLz6*)HOep;O5Ooq_V6GfK`04O z6S{=5NI?p+7nCXGtPEf4_Q;r9)JaubRwTmh=d2#lWivQU|B5A;92?Gene=oPPs(@p(%2?}?xTD39Vf z3CE34xW>Zi|%qt#R@w3J3KzAH#=pj8f=cVJPSH-E3tRCoGKG5i8fLP 
zk-EN5m+d>BzXT0&g&?4--_pPxoix%mc5Vk6ZX1448@UBS0?1zS4S4OYQ_3dZE85xI$*6e z*&Ak&1xIsqmhqc+)3>zkLWr3`cJ%Ej%C(D-5+lsG7PPR97MLAMY&U~dO)F)2r$#h<~FNvN>EVMV<^h@C(?bpa9TN_6Zw;`4YG`~X<6R$+Cawbo z!Zi+nWuf04D^dK3+No2ERcb+ajuaTpYpgg(3E2`dve?;-qa;lx&SQKg8L-+H9(9%| zGPH-4y?Ri;JPn$ka=lD)2>k$gGyyD(&3< z=0EZHq}B^o3Cf0eZD@jOqM0!!n=qzn3B&z;OtwKjW^x-Zp}MW?C?kNWk+!A)3=m9s z6l#e4-4wiBmBR(3Iy4yaBcFuI-%{UF11P63y`>_Q0Kh=NVGZhl;1>afrnW;KEnox# z0z0I@G$0m%OWwxK>I=g2+#7pADE~@-*l}eDtKUOhjsksH&Qdi+D6xYmwD=q?)}U;Z z;=>x5@G+r7sN@lz>!Xe6YL^(x(6M-=V%NTb?cKkx9pg_U$ z1mwE@0hUAsfEHRo?3Mv%Dn8+S3zpts8*izw?l^(u&J>Wx+4};GY9p8|%@78`GTg)u zhCb9mko*y^$GrL;USJZ4S2%_v9lSEY8}M&S1IIotxfQ1HOHm5!JU9dRCQi8Oni^Vd zI-)i>0x(Q(5(sY_=r7wV1*rhB3g#x$z_euMDI^4w00_WTp#j5?4egk>pa5_6jeJOv z1Wod`YDIcuFpTug)=@lV7!@YAr*`$8B%hUl50pf>@Igo zGVS-N<~k)eDA}ZBzpnwd?Ay^fVhbsA3w|ZRECOHTVQPS2@Und5#Z7~pcE=-FHu$6Sc9!m}wu?^N5SP6{FGumPcEX#IC2NyvuYw=6v08>5|5v&j*Vr*ovRvrBy%!L(xmwf?57pu8R zh4Anz9Lz^-A0hn8J+;Ambzk0gIoK-`(Z!$HI~Z~;Dx`VM#jFGl%eZ37-23rbaW zN-+1s4vfiFk9-2s$+%?FmQIS-(E*?b^PHuMeKY$k=^GfO!Ly$tVIszximjTvn~&61 zy*hcUb;NdJJ+O3^1Iqrh$RzE=r_qMt4O^Fc&z@XHp1nB&Mt?^+G&y%(0Q}RNAd1x1 zyQ+k>~1mx!EI_gD+Sv8MgM z)^dX@PHC**6Sn$NOPr&Iq$-@5J6Z2L7R{JEIw>HU*_y2bb|Fp$-GYA;F&yX?lvAw*suKRmG_d>kM>OVnBvFw>s0k~#y&_n(EEwE(UeR-pY&y0GaTM>ruBz~QGTI4sYh*GUyY)d2^<34vP)(Jd=b zE_CeSFp`RcBC^OG1SZIgQX5YLBBB;f~J8^8nYy0}FN?Y8(j z)v6m(1XEidr@>_1(;jLl35hZ^u(y9rp%>Q{e9MG(gbH2eaQffFV00f>!F z4spY3<355cIXI7y4|suQ8uM17m>u1{jWfn1@>rXu0v2Y&|Nl zs)yqV<5J+!r?|wNC@JEmVG1`3xhB+pn1Lgifdi37KUV^)oq@#V$jsnwI}PKbji%8% zQD7GBTvNNMh`)3*qf(HkURbhHDbe&c7fszn8NJTSYsOTnE^yuP0YWz2aRbDHZNJFuo-1hc#==y_4Wjm%sR z_dsYzWB&>_J6J1bJs-|)8Kb#(l_h2s53D^-t{G)QAlSHte5?aTdr;1u?2!bGLd(Q1~f2gs`=@))%Q#dpjXKK=aj7e6O8?YGb^hg}@=)8{W=eEB6O zD}QURf=FuJ9?`r>2`N`rc8+o3`SUMD_Gjyzq4&HHeZl^bMBZ!+2@FK$aDezPF5B@^ zotLWm0`);s;wj2(VR4U=1{ zIUJiTv-H-T{2t;U_`uvJSH>yq^9+3_3nK#*A^o5)5k=OA>xk`#0rH*lB#Mk7sV^uD z!z_hheUI{dDEK^l9Isy?_P&Ko0f9(zYZ!oN1T!pLm(f4MlMFDp44|VxCA&q?5Zs^` zwDu;I(Y=C(%QkX>as*n`0xsUEJ^fQjj)R*fpbSA5I7zB;yQHIBGjAJ)p!x88_n9v; zi3CsBOFDv-!VBcKy7>CQRlf>DMv-XTHE3HUXHo!fF zX)@8mSXV?Nv=L9H?-v>{TPb>j0#2b8J{J^v0XYltNO%B%9~pQ``nJvL6tPyQ(`;C9 zjRh!DQl=cNFt%+(v-F!vIE5H1duB-;4kJ^af$<#oLmzQjVJ@KoK(fOu_hchhN;CUn zJhL3l41MIO{YDWvWjh~xL$~D32zi8S!NeJ23-)1c6vf)Pu(_&DqqUEfcSO7O~x z@b;#|691NZhHJ+)MJ_2~e`oNNX@9ZLvv=(AKY{)pdz|d;H|d*@cz2UL%*{P_)}_$T z{N%2^@#JVBTmTrD1@xHSQCjL~F`T1yp}!;H;&us~FTJB66fqYEB4&1!aQa}0G=t&% zb~#+6okcxvoWO%wzImG6`0sEAb@NmQ0D!!UJ3oL-aDWUXL063U9Cl(3yK)GlW&v>s zww}R093CAB5AU1h(WK|`Y_Ek!li7wx=I=7&J4@qPAHvLz(rmt|Zd?k>csnH5AZ3|l z;UB}9u*~<5S?LSkVI1i6oWkQTAoxMN`P9~hE7+Rgj#wFhHx zF=6swKu^_K@dN6R-3nI!>p(WcInLh$!nh=+1m8Az0&z;Z?Ua}G z&V%=$iFp%Kl{>IcWVXoYCB5xg)JUR(3m4+}#f!yHxPt>k6F`kck)`liQ2q-E{x6X+_@bXotgRLiBOpU=lQG8^ zAu!}Hz%bY3LgJw~14kK<=1`6Z1e?QD8X^yL@ne}3<3yjgEgfw%OEYw)niwYs zmlW4U=yPubQ!C;tJe1tF5!NmPIA9*J^O{4n!$fb?56YR!qWA{-q=%ecil|QJwfBwP zf=S_Ue8NEMmgnSoe~nw0>?n4>Xf=#4&JqfgZ4sQuxj1i{Y_ zsMooa{TuD$PzvuSqtBrxN8?e=;iPw(CHL^#lXS;%J!9{pUojW2jrujwT1ypv^S?;g BU`YS~ literal 0 HcmV?d00001 diff --git a/vllm/model_executor/layers/__pycache__/vocab_parallel_embedding.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/vocab_parallel_embedding.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ec79664686c572495727d48edc896e21e373054 GIT binary patch literal 5061 zcmdT|TW=f372ch_@FJ3;WZJTv#MvZn61Is{H|R}G(>RIkB(N1DPW-TFyI66C)Kbe` zW_D;t0$reyi#+wQ=u1#oj{*93iURxEm-+|v#cr*BXO7AU0@Dp=YGzd^PO** z*6URR&kKM4d+)C+hVc)2Ir>!a@-9mDPY7;sGd2dMZw{ExOnPUrvminEcgq9nqSlJPFx=}{D!W(@#3KAH+8)fFAbLcWz@_5DRHW6@``Bj 
zD$0UrbzNTL^?Qzgnm5q0s9Tw6ts3ZQ@}+yWe}*qldQKhbE%O#Xjoy{320z1B?iqYV zSbe8|cHi<>(RP-vqHR^I;{6z3(xMqlggL}S+pmxsLP|<5vQ8>^wqFt zAO#@3RNhC) zS`cGnY_8yb&*JwSO3Q#9*xsu*Lhh+;;puffI;y8q-O`J0i^RJf#v`%zz{C>YVuD;i zMb=0xPQ}CDu8%Ug9^+*3dK_)74|mmen!L4k@rUbKq{N%Uu(K2PM7DlAjtA?74aJwD zGg2w0g}XwcOY1eWtQc$w&ZDHaHr&k{r087nQ`k5oXGKNXo)h|T}7heY5X(gUp zK{hcUZx_eJwmlJaWjY9+b+%Ou&s>NyE1P9go~G$by*S+pX;dHBC z8$gLNuiUHq6(B_wNKxu9bZw0lwqN57zIe~@>t7qT;WyyZ&HT(Q@+Ask3ew9{PYVAA zC!W1|Q&)q>b1zEfK`@F#x$E(0fRKkftyQZYo=ZF&DvYE%MTE%C_F9sx&6az^OHzf< z*$$O=M_}~@qzTe0xAcJ)D$5l$_=wCm|;nH|=< zdgJqKdLr4&HuQ3IKFkPaL6dD1@jTyoZ8mT=D1eF^MZ{yD?TB-Pd!|Aq=7Y&dgDl`` zcPKVIl5{4*3*AEqiQ~}z^rW&EoIFP=qh17nJ$WGS^*a$zVx+ul>8F?4^F?4fihB)l z8)Qg&D~{Ps<%w%5rfuep5wNj6T+lU3+4yz#0VIJ#a1yv<H&#Th?@&Uj_RrV33K~*h`jlQF5^wzKL zTfe5!b(99)snmhfMzE-ws?=Y?`yvXhX8zIIr(KuTDYPte%<@40{0*UhwFjOT7I&qj~&{|_4 zfw=r0#E((3pFvbz*Q%O?aqgk3p}K4`t4uAAtB1C2qn)8$qu^tVv4MtW?eX4=Plutk zshM*}du3kN7r`gV{t1jH6*b0xh|1XP>lD=o!mpC=upO4wKQ_KH$=~`Giv0(+O6d$p zJ7#^8o2v%3^qu?em>rlCFJYV7Ioc)pGM0r4Y=^k0S5cR365pp*n};g&olZL34Ujn2 zsN*7uACaJmC(@rb`z^F*#1LGwg~Ud7JJ0qmJi{8tobgzuCa)nk(aI;NKsgYY9a}y} z;np{IbO6Ex&U;$n`_=)C0mZqC^nw(~-Zl~#0%2$0ckqr7I$(G0**X`wHMYkL^cE)S zVzera?S1pEGj?-`uIh{C7L~dU3D>KlMZ_*gbj%`<)Eq9}2 zI8uRMEq(`Kw&T}G3UYLG`HR3p8bVG%v6{O@!wD1TC7|G)Q1ahk^^9_E)g%UCA2!HM zS%Xzs6D_v6cm5eBpDfT`q>xX*bAuqS2E=VgG1Y59@Wm*MCq1Pg;AscV=RUZ6<tHI5W{L8r&B}x*llCP2QNKh2%0QKF^G<(+GA%BkMygmuL zjsWMKgr1JSb(*AukoXZTmR*1-8wjr^BckyL5&_2^mo4@m1lzxv(|FW+sacXVql0qn zsZZWAa?2A0gvf^tDpAl6hlmGAM@f|X70{0|YD7aSpdmF)LqI-2L#mp382*B$95r}K zBX8W8+}gG4ABvEN{{r$(n>4|=cHP@X$0T)sqOaPE(_W-9@C9P|-XK&m>WIu6WphSy zGaL+KA=kXmkaH&k!-RWVQJCqxTYO4nUI;Q#pfU0HIl(}#-qoGY<`{aN|1HlTbvC>^ z+XBZ$j!(%3TF0Zj;QxV&T-lswo7^U~PUM^pQSs?Rbt)E$cF=-b0N%hV#?IJ1aKJJw z`HEu7>JxA|V;dZRX%655t^jG81C)<)fU$AE!fkFGOgVwGDPP1E8p`BLRC^DiU6mAD z^FS?v_YG?M!F13(o%+jC^r8%uPh(W^d0Z;80x^an!_vYhRF*oZkEhq)V^~JnxK^@( zT{dGnRoI6}pjMGUo6g?KG=_?>nx)QTd4gbvip~@CZ^!RUg*>ruer2}V=H7ZS_qA!9 zWnKo_86$+mu}I4%k%n*Y+NYyLf&X8YGQ~dtg^EAL$|tZ`W7(h3o>7t}4ff8QIPnx~ z>vDtU|ANHZBz{Ta9TLBSfUBfw{8R)byp->f&=%A}8&sp@C3Jj&l2KODD7j212@5`) z;v@r3mVuM5!yYlW@vzyFG^_nw?t-0xTo>Dt4{8VSPt87Wx}&5s`G15ZiC>++%b#Gl zzdRc`qz?u994HX~B7Nzv9skl_qcNI=j#Lz9^H=@Np&6lOivH{0VX|IAf7beMeTU(l zbTAktQ76=Y5Trw$oBh?N%`8^dByx5=n*8eTXU$%7_qxuNI^$?#9=V&DpXcSbi silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + def _forward(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + d = x.shape[-1] // 2 + return F.silu(x[..., :d]) * x[..., d:] + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + def _forward(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + d = x.shape[-1] // 2 + return F.gelu(x[..., :d]) * x[..., d:] + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class NewGELU(nn.Module): + + def _forward(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + c = math.sqrt(2.0 / math.pi) + return 0.5 * x * (1.0 + torch.tanh(c * + (x + 0.044715 * torch.pow(x, 3.0)))) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class FastGELU(nn.Module): + + def _forward(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * + (1.0 + 0.044715 * x * x))) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class ScaledActivation(nn.Module): + """An activation function with post-scale parameters. + + This is used for some quantization methods like AWQ. + """ + + def __init__( + self, + act_module: nn.Module, + intermediate_size: int, + input_is_parallel: bool = True, + params_dtype: Optional[torch.dtype] = None, + ): + super().__init__() + self.act = act_module + self.input_is_parallel = input_is_parallel + if input_is_parallel: + tp_size = get_tensor_model_parallel_world_size() + intermediate_size_per_partition = divide(intermediate_size, + tp_size) + else: + intermediate_size_per_partition = intermediate_size + if params_dtype is None: + params_dtype = torch.get_default_dtype() + self.scales = nn.Parameter( + torch.empty(intermediate_size_per_partition, dtype=params_dtype)) + set_weight_attrs(self.scales, {"weight_loader": self.weight_loader}) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.act(x) / self.scales + + def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): + param_data = param.data + if self.input_is_parallel: + tp_rank = get_tensor_model_parallel_rank() + shard_size = param_data.shape[0] + start_idx = tp_rank * shard_size + loaded_weight = loaded_weight.narrow(0, start_idx, shard_size) + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + +_ACTIVATION_REGISTRY = { + "gelu": nn.GELU(), + "gelu_fast": FastGELU(), + "gelu_new": NewGELU(), + "gelu_pytorch_tanh": nn.GELU(approximate="tanh"), + "relu": nn.ReLU(), +} + + +def get_act_fn( + act_fn_name: str, + quant_config: Optional[QuantizationConfig] = None, + intermediate_size: Optional[int] = None, + input_is_parallel: bool = True, + params_dtype: Optional[torch.dtype] = None, +) -> nn.Module: + """Get an activation function by name.""" + act_fn_name = act_fn_name.lower() + if act_fn_name not in _ACTIVATION_REGISTRY: + raise ValueError( + f"Activation function {act_fn_name!r} is not supported.") + + act_fn = _ACTIVATION_REGISTRY[act_fn_name] + if (quant_config is not None + and act_fn_name in quant_config.get_scaled_act_names()): + if intermediate_size is None: + raise ValueError("intermediate_size must be specified for scaled " + "activation functions.") + return ScaledActivation(act_fn, 
intermediate_size, input_is_parallel, + params_dtype) + return act_fn + + +# ↓ add for smoothquant +class DequantSiluAndMulQuant(nn.Module): + """An activation function for SwiGLU. + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[1] // 2. + Shapes: + x: (num_tokens, 2 * d) + return: (num_tokens, d) + """ + + # TODO(Zhang Ying): use_per_token_quant + def __init__(self, + gate_dequant_scale: float = 1.0, + up_dequant_scale: float = 1.0, + quant_scale: float = 1.0, + use_per_token_quant: bool = True) -> None: + super().__init__() + self.register_parameter( + "gate_dequant_scale", + torch.nn.Parameter( + torch.tensor(gate_dequant_scale,dtype=torch.float32,requires_grad=False)) + ) + self.register_parameter( + "up_dequant_scale", + torch.nn.Parameter( + torch.tensor(up_dequant_scale,dtype=torch.float32,requires_grad=False)) + ) + self.register_parameter( + "quant_scale", + torch.nn.Parameter( + torch.tensor(quant_scale, dtype=torch.float32,requires_grad=False)) + ) + self.use_per_token_quant = use_per_token_quant + + def _apply(self, fn): + super()._apply(fn) + self.gate_dequant_scale.data = self.gate_dequant_scale.cpu() + self.up_dequant_scale.data = self.up_dequant_scale.cpu() + self.quant_scale.data = self.quant_scale.cpu() + return self + + def to(self, *args, **kwargs): + super().to(*args, **kwargs) + self.gate_dequant_scale.data = self.gate_dequant_scale.to(*args, **kwargs) + self.gate_dequant_scale.data = self.gate_dequant_scale.to(torch.float32) + self.up_dequant_scale.data = self.up_dequant_scale.to(*args, **kwargs) + self.up_dequant_scale.data = self.up_dequant_scale.to(torch.float32) + self.quant_scale.data = self.quant_scale.to(*args, **kwargs) + self.quant_scale.data = self.quant_scale.to(torch.float32) + return self + + def forward(self, x: torch.Tensor) -> torch.Tensor: + num_tokens = x.numel() // x.shape[-1] + d = x.shape[-1] // 2 + out = torch.empty(*x.shape[:-1], d, dtype=torch.int8, device=x.device) + if self.use_per_token_quant: + scale = torch.empty(num_tokens, + dtype=torch.float32, + device=x.device) + # tmp is used in kernel func + tmp = torch.empty(num_tokens, + d, + dtype=torch.float32, + device=x.device) + ops.dequant_silu_and_mul_quant( + out, x, self.gate_dequant_scale.item(), self.up_dequant_scale.item(), + scale, tmp) + return out, scale + else: + ops.dequant_silu_and_mul_quant( + out, x, self.gate_dequant_scale.item(), self.up_dequant_scale.item(), + self.quant_scale.item()) + return out + diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py new file mode 100644 index 0000000..5c6231b --- /dev/null +++ b/vllm/model_executor/layers/attention.py @@ -0,0 +1,542 @@ +"""Multi-head attention.""" +import os +enable_infer_paged_attn = os.getenv("ENABLE_INFER_PAGED_ATTN",None) +from typing import List, Optional + +import importlib +import torch +import torch.nn as nn +from ixformer.contrib.xformers import ops as xops +from ixformer.contrib.xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, + LowerTriangularMaskWithTensorBias) + +from vllm._C import ops +from vllm._C import cache_ops +from vllm.model_executor.input_metadata import InputMetadata +## from vllm.model_executor.layers.triton_kernel.prefix_prefill import ( +## context_attention_fwd) +from vllm.utils import is_hip + +# _SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256] +# # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. 
+# _PARTITION_SIZE = 512 +_SUPPORTED_HEAD_SIZES = [64, 128, 256] +# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. +_PARTITION_SIZE = 256 + + +class PagedAttention(nn.Module): + """MHA/MQA/GQA layer with PagedAttention. + + This class takes query, key, and value tensors as input. The input tensors + can either contain prompt tokens or generation tokens. + The class does the following: + + 1. Reshape and store the input key and value tensors in the KV cache. + 2. Perform (multi-head/multi-query/grouped-query) attention using either + xformers or the PagedAttention custom op. + 3. Return the output tensor. + """ + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: Optional[int] = None, + alibi_slopes: Optional[List[float]] = None, + sliding_window: Optional[int] = None, + ) -> None: + super().__init__() + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + self.sliding_window = sliding_window + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.register_buffer("alibi_slopes", alibi_slopes, persistent=False) + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + + if self.head_size not in _SUPPORTED_HEAD_SIZES: + raise ValueError(f"head_size ({self.head_size}) is not supported. " + f"Supported head sizes: {_SUPPORTED_HEAD_SIZES}.") + + self.use_ref_attention = self.check_use_ref_attention() + + # TODO align vllm do not need those + self.attn_op = xops.fmha.flash.FwOp() + head_mapping = torch.repeat_interleave( + torch.arange(self.num_kv_heads, dtype=torch.int32), + self.num_queries_per_kv) + self.register_buffer("head_mapping", head_mapping, persistent=False) + + def check_use_ref_attention(self) -> bool: + if not is_hip(): + return False + # For ROCm, check whether flash attention is installed or not. + # if not, use_ref_attention needs to be True + return importlib.util.find_spec("flash_attn") is None + + def ref_masked_attention( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + ) -> torch.Tensor: + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + + seq_len, _, _ = query.shape + attn_mask = torch.triu(torch.ones(seq_len, + seq_len, + dtype=query.dtype, + device=query.device), + diagonal=1) + attn_mask = attn_mask * torch.finfo(query.dtype).min + + attn_weights = self.scale * torch.einsum("qhd,khd->hqk", query, + key).float() + attn_weights = attn_weights + attn_mask.float() + attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) + out = torch.einsum("hqk,khd->qhd", attn_weights, value) + return out + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: Optional[torch.Tensor], + value_cache: Optional[torch.Tensor], + input_metadata: InputMetadata, + ) -> torch.Tensor: + """PagedAttention forward pass. + + Args: + query: shape = [num_tokens, num_heads * head_size] + key: shape = [num_tokens, num_kv_heads * head_size] + value: shape = [num_tokens, num_kv_heads * head_size] + key_cache: shape = [num_blocks, num_kv_heads, head_size/x, + block_size, x] + value_cache: shape = [num_blocks, num_kv_heads, head_size, + block_size] + input_metadata: metadata for the inputs. 
+ cache_event: event to wait for the cache operations to finish. + Returns: + shape = [batch_size, seq_len, num_heads * head_size] + """ + num_tokens, hidden_size = query.shape + # Reshape the query, key, and value tensors. + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + slot_mapping = input_metadata.slot_mapping + + # Reshape the keys and values and store them in the cache. + # If key_cache and value_cache are not provided, the new key and value + # vectors will not be cached. This happens during the initial memory + # profiling run. + if key_cache is not None and value_cache is not None: + cache_ops.reshape_and_cache( + key, + value, + key_cache, + value_cache, + slot_mapping, + ) + + if input_metadata.is_prompt: + # normal attention + if (key_cache is None or value_cache is None + or input_metadata.block_tables.numel() == 0): + if input_metadata.attn_bias is None: + if self.alibi_slopes is None: + attn_bias = BlockDiagonalCausalMask.from_seqlens(input_metadata.prompt_lens) + if self.sliding_window is not None: + attn_bias = attn_bias.make_local_attention( + self.sliding_window) + input_metadata.attn_bias = attn_bias + else: + attn_bias = BlockDiagonalCausalMask.from_seqlens(input_metadata.prompt_lens) + input_metadata.attn_bias = attn_bias + + if self.use_ref_attention: + output = self.ref_masked_attention( + query, + key, + value, + ) + # Using view got RuntimeError: view size is not compatible with input tensor's size and stride + # (at least one dimension spans across two contiguous subspaces). Use reshape instead + return output.reshape(num_tokens, hidden_size) + + # TODO(woosuk): Too many view operations. Let's try to reduce + # them in the future for code readability. + query = query.unsqueeze(0) + key = key.unsqueeze(0) + value = value.unsqueeze(0) + + out = xops.memory_efficient_attention_forward( + query, + key, + value, + attn_bias=input_metadata.attn_bias, + p=0.0, + scale=self.scale, + op=self.attn_op, + alibi_slopes=self.alibi_slopes + ) + output = out.view_as(query) + else: + # prefix-enabled attention + output = torch.empty_like(query) + context_attention_fwd( + query, + key, + value, + output, + key_cache, + value_cache, + input_metadata.block_tables, # [BS, max_block_per_request] + input_metadata.start_loc, + input_metadata.prompt_lens, + input_metadata.context_lens, + input_metadata.max_seq_len, + getattr(self, "alibi_slopes", None), + ) + else: + # Decoding run. + output = _paged_attention( + query, + key_cache, + value_cache, + input_metadata, + self.head_mapping, # self.num_kv_heads + self.scale, + self.alibi_slopes, + ) + + # Reshape the output tensor. + return output.view(num_tokens, hidden_size) + # TODO align + """ + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: Optional[torch.Tensor], + value_cache: Optional[torch.Tensor], + input_metadata: InputMetadata, + ) -> torch.Tensor: + PagedAttention forward pass. + + Args: + query: shape = [batch_size, seq_len, num_heads * head_size] + key: shape = [batch_size, seq_len, num_kv_heads * head_size] + value: shape = [batch_size, seq_len, num_kv_heads * head_size] + key_cache: shape = [num_blocks, num_kv_heads, head_size/x, + block_size, x] + value_cache: shape = [num_blocks, num_kv_heads, head_size, + block_size] + input_metadata: metadata for the inputs. 
+ Returns: + shape = [batch_size, seq_len, num_heads * head_size] + + batch_size, seq_len, hidden_size = query.shape + # Reshape the query, key, and value tensors. + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + + # Reshape the keys and values and store them in the cache. + # If key_cache and value_cache are not provided, the new key and value + # vectors will not be cached. This happens during the initial memory + # profiling run. + if key_cache is not None and value_cache is not None: + cache_ops.reshape_and_cache( + key, + value, + key_cache, + value_cache, + input_metadata.slot_mapping.flatten(), + input_metadata.kv_cache_dtype, + ) + + if input_metadata.is_prompt: + # normal attention + if (key_cache is None or value_cache is None + or input_metadata.block_tables.numel() == 0): + if self.num_kv_heads != self.num_heads: + # As of Nov 2023, xformers only supports MHA. For MQA/GQA, + # project the key and value tensors to the desired number of + # heads. + # TODO(woosuk): Use MQA/GQA kernels for higher performance. + query = query.view(query.shape[0], self.num_kv_heads, + self.num_queries_per_kv, + query.shape[-1]) + key = key[:, :, + None, :].expand(key.shape[0], self.num_kv_heads, + self.num_queries_per_kv, + key.shape[-1]) + value = value[:, :, + None, :].expand(value.shape[0], + self.num_kv_heads, + self.num_queries_per_kv, + value.shape[-1]) + + # Set attention bias if not provided. This typically happens at + # the very attention layer of every iteration. + # FIXME(woosuk): This is a hack. + if input_metadata.attn_bias is None: + if self.alibi_slopes is None: + attn_bias = BlockDiagonalCausalMask.from_seqlens( + [seq_len] * batch_size) + if self.sliding_window is not None: + attn_bias = attn_bias.make_local_attention( + self.sliding_window) + input_metadata.attn_bias = attn_bias + else: + input_metadata.attn_bias = _make_alibi_bias( + self.alibi_slopes, self.num_kv_heads, batch_size, + seq_len, query.dtype) + + if self.use_ref_attention: + output = self.ref_masked_attention( + query, + key, + value, + ) + # Using view got RuntimeError: view size is not compatible with input tensor's size and stride + # (at least one dimension spans across two contiguous subspaces). Use reshape instead + return output.reshape(batch_size, seq_len, hidden_size) + + # TODO(woosuk): Too many view operations. Let's try to reduce + # them in the future for code readability. + if self.alibi_slopes is None: + query = query.unsqueeze(0) + key = key.unsqueeze(0) + value = value.unsqueeze(0) + else: + query = query.unflatten(0, (batch_size, seq_len)) + key = key.unflatten(0, (batch_size, seq_len)) + value = value.unflatten(0, (batch_size, seq_len)) + + out = xops.memory_efficient_attention_forward( + query, + key, + value, + attn_bias=input_metadata.attn_bias, + p=0.0, + scale=self.scale, + op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if + (is_hip()) else None, + ) + output = out.view_as(query) + else: + # prefix-enabled attention + output = torch.empty_like(query) + context_attention_fwd( + query, + key, + value, + output, + key_cache, + value_cache, + input_metadata.block_tables, # [BS, max_block_per_request] + input_metadata.start_loc, + input_metadata.prompt_lens, + input_metadata.context_lens, + input_metadata.max_seq_len, + getattr(self, "alibi_slopes", None), + ) + + else: + # Decoding run. 
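+            # NOTE: this disabled reference path (kept for alignment with
+            # upstream vLLM, see the "TODO align" marker above) passes
+            # self.num_kv_heads directly, whereas the active forward() above
+            # passes the precomputed head_mapping buffer expected by the
+            # local _paged_attention helper below.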
+ output = _paged_attention( + query, + key_cache, + value_cache, + input_metadata, + self.num_kv_heads, + self.scale, + self.alibi_slopes, + ) + + # Reshape the output tensor. + return output.view(batch_size, seq_len, hidden_size) + """ + + +def _make_alibi_bias( + alibi_slopes: torch.Tensor, + num_kv_heads: int, + batch_size: int, + seq_len: int, + dtype: torch.dtype, +) -> LowerTriangularMaskWithTensorBias: + bias = torch.arange(seq_len, dtype=dtype) + # NOTE(zhuohan): HF uses + # `bias = bias[None, :].repeat(prompt_len, 1)` + # here. We find that both biases give the same results, but + # the bias below more accurately follows the original ALiBi + # paper. + bias = bias[None, :] - bias[:, None] + + # When using custom attention bias, xformers requires the bias to + # be sliced from a tensor whose length is a multiple of 8. + padded_len = (seq_len + 7) // 8 * 8 + num_heads = alibi_slopes.shape[0] + bias = torch.empty( + batch_size, + num_heads, + seq_len, + padded_len, + device=alibi_slopes.device, + dtype=dtype, + )[:, :, :, :seq_len].copy_(bias) + bias.mul_(alibi_slopes[:, None, None]) + if num_heads != num_kv_heads: + bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) + attn_bias = LowerTriangularMaskWithTensorBias(bias) + return attn_bias + + +def _paged_attention( + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + input_metadata: InputMetadata, + head_mapping: torch.Tensor, # num_kv_heads: int, + scale: float, + alibi_slopes: Optional[torch.Tensor], + use_sqrt_alibi: bool = False +) -> torch.Tensor: + output = torch.empty_like(query) + + use_v2 = enable_infer_paged_attn is None and key_cache.dim() == 4 + if not use_v2: + block_size = value_cache.shape[3] + # Run PagedAttention V1. + ops.paged_attention_v1( + output, + query, + key_cache, + value_cache, + head_mapping, # num_kv_heads + scale, + input_metadata.block_tables, + input_metadata.context_lens, + block_size, + input_metadata.max_context_len, + alibi_slopes, + input_metadata.kv_cache_dtype, + ) + else: + # Run PagedAttention V2. 
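+        # The V2 kernel splits each sequence's context into partitions of
+        # _PARTITION_SIZE tokens, writes per-partition results into
+        # tmp_output together with per-partition softmax statistics
+        # (exp_sums, max_logits), and reduces them into the final output.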
+ block_size = value_cache.shape[2] + num_seqs, num_heads, head_size = query.shape + max_num_partitions = ( + (input_metadata.max_context_len + _PARTITION_SIZE - 1) // + _PARTITION_SIZE) + tmp_output = torch.empty( + size=(num_seqs, num_heads, max_num_partitions, head_size), + dtype=output.dtype, + device=output.device, + ) + exp_sums = torch.empty( + size=(num_seqs, num_heads, max_num_partitions), + dtype=torch.float32, + device=output.device, + ) + max_logits = torch.empty_like(exp_sums) + ops.paged_attention_v2( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + head_mapping, # num_kv_heads + scale, + input_metadata.block_tables, + input_metadata.context_lens, + block_size, + input_metadata.max_context_len, + alibi_slopes, + input_metadata.kv_cache_dtype, + ) + return output + + +# ↓ add for smoothquant +class DequantPagedAttention(PagedAttention): + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: Optional[int] = None, + alibi_slopes: Optional[List[float]] = None, + sliding_window: Optional[int] = None, + quant_kv_cache: bool = False, + kv_quant_params: torch.Tensor = None, + quant_scale: float = 1.0, + use_per_token_quant: bool = True, + ) -> None: + super().__init__(num_heads, + head_size, + scale, + num_kv_heads, + alibi_slopes, + sliding_window) + self.register_parameter( + "quant_scale", + torch.nn.Parameter( + torch.tensor(quant_scale, dtype=torch.float32,requires_grad=False)) + ) + self.use_per_token_quant = use_per_token_quant + + def _apply(self, fn): + super()._apply(fn) + self.quant_scale.data = self.quant_scale.cpu() + return self + + def to(self, *args, **kwargs): + super().to(*args, **kwargs) + self.quant_scale.data = self.quant_scale.to(*args, **kwargs) + self.quant_scale.data = self.quant_scale.to(torch.float32) + return self + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: Optional[torch.Tensor], + value_cache: Optional[torch.Tensor], + input_metadata: InputMetadata, + ) -> torch.Tensor: + out = super().forward( + query, + key, + value, + key_cache, + value_cache, + input_metadata, + ) + quant_out = torch.empty_like(out, dtype=torch.int8) + if self.use_per_token_quant: + scale = torch.empty(out.numel() // out.shape[-1], + dtype=torch.float32, + device=out.device) + ops.quant(quant_out, out, scale) + return quant_out, scale + else: + ops.quant(quant_out, out, self.quant_scale.item()) + return (quant_out, ) diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py new file mode 100644 index 0000000..6c4fd98 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -0,0 +1,5 @@ +from vllm.model_executor.layers.fused_moe.fused_moe import fused_moe + +__all__ = [ + "fused_moe", +] \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/fused_moe/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6c251771803e7efd3b97842f44a76656767a3130 GIT binary patch literal 285 zcmY+7u};H442JKLiWWrIZhe3ZoI$Jz@dPX^RTk@-GaO67jlgcJr*{Mtq z=g(avhqb3L-aTe%Zr+${Q+r2@^iDcuA_8KysaizS^uz2<+#S=s7FawJ Kxl~Cy``H7OgioIU literal 0 HcmV?d00001 diff --git a/vllm/model_executor/layers/fused_moe/__pycache__/fused_moe.cpython-310.pyc b/vllm/model_executor/layers/fused_moe/__pycache__/fused_moe.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..15dabadeddc6e7c025742286a672314f4c8e0be6
GIT binary patch
literal 10707
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
new file mode 100644
index 0000000..08e3c2d
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -0,0 +1,377 @@
+"""Fused MoE kernel."""
+import functools
+import json
+import os
+from typing import Any, Dict, Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+from vllm._C import ops
+from vllm.logger import init_logger
+from vllm.utils import is_hip
+
+logger = init_logger(__name__)
+
+
+@triton.jit
+def fused_moe_kernel(
+    # Pointers to matrices
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    topk_weights_ptr,
+    sorted_token_ids_ptr,
+    expert_ids_ptr,
+    num_tokens_post_padded_ptr,
+    # Matrix dimensions
+    N,
+    K,
+    EM,
+    num_valid_tokens,
+    # The stride variables represent how much to increase the ptr by when moving by 1
+    # element in a particular dimension. E.g. `stride_am` is how much to increase `a_ptr`
+    # by to get the element one row down (A has M rows).
+    stride_am,
+    stride_ak,
+    stride_be,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    # Meta-parameters
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    MUL_ROUTED_WEIGHT: tl.constexpr,
+    top_k: tl.constexpr,
+    compute_type: tl.constexpr,
+):
+    """
+    Implements the fused computation for a Mixture of Experts (MOE) using token and expert matrices.
+
+    Key Parameters:
+    - A: The input tensor representing tokens with shape (*, K), where '*' can be any shape representing batches and K is the feature dimension of each token.
+    - B: The stacked MOE weight tensor with shape (E, N, K), where E is the number of experts, K is the input feature dimension, and N is the output feature dimension.
+ - C: The output cache tensor with shape (M, topk, N), where M is the total number of tokens post padding, topk is the number of times each token is repeated, + and N is the output feature dimension. + - sorted_token_ids: A tensor containing the sorted indices of tokens, repeated topk times and arranged by the expert index they are assigned to. + - expert_ids: A tensor containing the indices of the expert for each block. It determines which expert matrix from B should be used for each block in A. + This kernel performs the multiplication of a token by its corresponding expert matrix as determined by `expert_ids`. The sorting of `sorted_token_ids` + by expert index and padding ensures divisibility by BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix multiplication across different blocks processed by the same expert. + """ + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + # ---------------------------------------------------------- + # Create pointers for the first blocks of A and B. + # We will advance this pointer as we move in the K direction + # and accumulate + # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers + # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers + num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr) + if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded: + return + offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_token = tl.load(sorted_token_ids_ptr + offs_token_id) + token_mask = offs_token < num_valid_tokens + + offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + + offs_k[None, :] * stride_ak) + + off_experts = tl.load(expert_ids_ptr + pid_m) + b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk + + offs_bn[None, :] * stride_bn) + + # ----------------------------------------------------------- + # Iterate to compute a block of the C matrix. + # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block + # of fp32 values for higher accuracy. + # `accumulator` will be converted back to fp16 after the loop. + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + # Load the next block of A and B, generate a mask by checking the K dimension. + a = tl.load(a_ptrs, + mask=token_mask[:, None] & + (offs_k[None, :] < K - k * BLOCK_SIZE_K), + other=0.0) + b = tl.load(b_ptrs, + mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, + other=0.0) + # We accumulate along the K dimension. + accumulator += tl.dot(a, b) + # Advance the ptrs to the next K block. 
+ a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + + if MUL_ROUTED_WEIGHT: + moe_weight = tl.load(topk_weights_ptr + offs_token, + mask=token_mask, + other=0) + accumulator = accumulator * moe_weight[:, None] + + accumulator = accumulator.to(compute_type) + # ----------------------------------------------------------- + # Write back the block of the output + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[ + None, :] + c_mask = token_mask[:, None] & (offs_cn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + + +def moe_align_block_size( + topk_ids: torch.Tensor, block_size: int, + num_experts: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Aligns the token distribution across experts to be compatible with block size for matrix multiplication. + + Parameters: + - topk_ids: A tensor of shape [total_tokens, top_k] representing the top-k expert indices for each token. + - block_size: The block size used in block matrix multiplication. + - num_experts: The total number of experts. + + Returns: + - sorted_token_ids: A tensor containing the sorted token indices according to their allocated expert. + - expert_ids: A tensor indicating the assigned expert index for each block. + - num_tokens_post_padded: The total number of tokens after padding, ensuring divisibility by block_size. + + This function pads the number of tokens that each expert needs to process so that it is divisible by block_size. + Padding ensures that during block matrix multiplication, the dimensions align correctly. + + Example: + Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]], block_size = 4, and num_experts = 4: + - We initially have 12 tokens (after repeating 'top_k' times) and 4 experts, with each expert needing to process 3 tokens. + - As block_size is 4, we pad 1 token for each expert. + - First, flatten topk_ids to [2, 3, 4, 1, 2, 4, 1, 3, 4, 1, 2, 3]. + - Then append padding tokens [12, 12, 12, 12] for each block. + - After sorting by expert index, we obtain token_ids [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12]. + Tokens 12 are non-existent (padding) and are ignored in the subsequent matrix multiplication. + - The padding ensures that the total number of tokens is now divisible by block_size for proper block matrix operations. 
+ """ + sorted_ids = torch.empty( + (topk_ids.numel() + num_experts * (block_size - 1), ), + dtype=torch.int32, + device=topk_ids.device) + expert_ids = torch.empty((topk_ids.numel() + num_experts, ), + dtype=torch.int32, + device=topk_ids.device) + sorted_ids.fill_(topk_ids.numel()) + num_tokens_post_pad = torch.empty((1), + dtype=torch.int32, + device=topk_ids.device) + ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids, + expert_ids, num_tokens_post_pad) + return sorted_ids, expert_ids, num_tokens_post_pad + + +def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, + topk_weights: torch.Tensor, topk_ids: torch.Tensor, + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_padded: torch.Tensor, + mul_routed_weight: bool, top_k: int, + config: Dict[str, Any]) -> None: + assert topk_weights.stride(1) == 1 + assert sorted_token_ids.stride(0) == 1 + + grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[ + 'BLOCK_SIZE_M']) * triton.cdiv(B.shape[1], META['BLOCK_SIZE_N']), ) + + fused_moe_kernel[grid]( + A, + B, + C, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + B.shape[1], + B.shape[2], + sorted_token_ids.shape[0], + topk_ids.numel(), + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(1), + C.stride(2), + MUL_ROUTED_WEIGHT=mul_routed_weight, + top_k=top_k, + compute_type=tl.bfloat16 if A.dtype == torch.bfloat16 else tl.float16, + **config, + ) + + +@functools.lru_cache +def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]: + """ + Return optimized configurations for the fused MoE kernel. + + The return value will be a dictionary that maps an irregular grid of batch sizes + to configurations of the fused_moe kernel. To evaluate the kernel on a given batch + size bs, the closest batch size in the grid should be picked and the associated + configuration chosen to invoke the kernel. + """ + + # First look up if an optimized configuration is available in the configs directory + device_name = torch.cuda.get_device_name().replace(" ", "_") + + config_file_path = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "configs", + f"E={E},N={N},device_name={device_name}.json") + if os.path.exists(config_file_path): + with open(config_file_path) as f: + logger.info( + f"Using configuration from {config_file_path} for MoE layer.") + # If a configuration has been found, return it + return {int(key): val for key, val in json.load(f).items()} + + # If no optimized configuration is available, we will use the default configuration + return None + + +def fused_moe( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + inplace: bool = False, + override_config: Optional[Dict[str, Any]] = None, +) -> torch.Tensor: + """ + This function computes a Mixture of Experts (MoE) layer using two sets of weights, w1 and w2, and top-k gating mechanism. + + Parameters: + - hidden_states (torch.Tensor): The input tensor to the MoE layer. + - w1 (torch.Tensor): The first set of expert weights. + - w2 (torch.Tensor): The second set of expert weights. + - gating_output (torch.Tensor): The output of the gating operation (before softmax). + - topk (int): The number of top-k experts to select. + - renormalize (bool): If True, renormalize the top-k weights to sum to 1. + - inplace (bool): If True, perform the operation in-place. Defaults to False. 
+ - override_config (Optional[Dict[str, Any]]): Optional override for the kernel configuration. + + Returns: + - torch.Tensor: The output tensor after applying the MoE layer. + """ + # Check constraints. + assert hidden_states.shape[0] == gating_output.shape[0], ( + "Number of tokens mismatch") + assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch" + assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" + assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" + assert w1.is_contiguous(), "Expert weights1 must be contiguous" + assert w2.is_contiguous(), "Expert weights2 must be contiguous" + assert hidden_states.dtype in [ + torch.float32, torch.float16, torch.bfloat16 + ] + M, _ = hidden_states.shape + E, N, _ = w1.shape + + if is_hip(): + # The MoE kernels are not yet supported on ROCm. + routing_weights = torch.softmax(gating_output, + dim=-1, + dtype=torch.float32) + topk_weights, topk_ids = torch.topk(routing_weights, topk, dim=-1) + else: + import vllm._moe_C as moe_kernels + + topk_weights = torch.empty(M, + topk, + dtype=torch.float32, + device=hidden_states.device) + topk_ids = torch.empty(M, + topk, + dtype=torch.int32, + device=hidden_states.device) + token_expert_indicies = torch.empty(M, + topk, + dtype=torch.int32, + device=hidden_states.device) + moe_kernels.topk_softmax( + topk_weights, + topk_ids, + token_expert_indicies, + gating_output.float(), # TODO(woosuk): Optimize this. + ) + del token_expert_indicies # Not used. Will be used in the future. + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + + if override_config: + config = override_config + else: + # First try to load optimal config from the file + configs = get_moe_configs(E, w2.shape[2]) + + if configs: + # If an optimal configuration map has been found, look up the optimal config + config = configs[min(configs.keys(), key=lambda x: abs(x - M))] + else: + # Else use the default config + config = { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32, + 'GROUP_SIZE_M': 8 + } + + if M <= E: + config = { + 'BLOCK_SIZE_M': 16, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 64, + 'GROUP_SIZE_M': 1 + } + + intermediate_cache1 = torch.empty((M, topk_ids.shape[1], N), + device=hidden_states.device, + dtype=hidden_states.dtype) + intermediate_cache2 = torch.empty((M * topk_ids.shape[1], N // 2), + device=hidden_states.device, + dtype=hidden_states.dtype) + intermediate_cache3 = torch.empty((M, topk_ids.shape[1], w2.shape[1]), + device=hidden_states.device, + dtype=hidden_states.dtype) + + sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size( + topk_ids, config['BLOCK_SIZE_M'], E) + + invoke_fused_moe_kernel(hidden_states, w1, intermediate_cache1, + topk_weights, topk_ids, sorted_token_ids, + expert_ids, num_tokens_post_padded, False, + topk_ids.shape[1], config) + + ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) + + invoke_fused_moe_kernel(intermediate_cache2, w2, intermediate_cache3, + topk_weights, topk_ids, sorted_token_ids, + expert_ids, num_tokens_post_padded, True, 1, + config) + + if inplace: + return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), + dim=1, + out=hidden_states) + return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), + dim=1) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py new file mode 100644 index 0000000..6ee0a5b --- /dev/null +++ b/vllm/model_executor/layers/layernorm.py @@ 
+"""Custom normalization layers."""
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from vllm._C import ops
+
+
+class RMSNorm(nn.Module):
+    """Root mean square normalization.
+
+    Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight.
+    Refer to https://arxiv.org/abs/1910.07467
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def _forward(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """PyTorch-native implementation equivalent to forward()."""
+        orig_dtype = x.dtype
+        x = x.to(torch.float32)
+        if residual is not None:
+            x = x + residual.to(torch.float32)
+            residual = x.to(orig_dtype)
+
+        variance = x.pow(2).mean(dim=-1, keepdim=True)
+        x = x * torch.rsqrt(variance + self.variance_epsilon)
+        x = x.to(orig_dtype) * self.weight
+        if residual is None:
+            return x
+        else:
+            return x, residual
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+        scale: float = 1.0,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if residual is not None:
+            ops.fused_add_rms_norm(
+                x,
+                residual,
+                self.weight.data,
+                self.variance_epsilon,
+                scale,
+            )
+            return x, residual
+        out = torch.empty_like(x)
+        ops.rms_norm(
+            out,
+            x,
+            self.weight.data,
+            self.variance_epsilon,
+        )
+        return out
+
+
+# ↓ add for smoothquant
+class RMSNormQuant(nn.Module):
+    """Root mean square normalization with int8 output quantization.
+
+    Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight,
+    then quantizes the result to int8.
+    Refer to https://arxiv.org/abs/1910.07467
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x, dtype=torch.int8)
+        ops.rms_norm_quant(
+            out,
+            x,
+            self.weight.data,
+            self.variance_epsilon,
+        )
+        return out
+
+
+class AddResidualRMSNormQuant(nn.Module):
+    """Fused residual add + RMS normalization + int8 quantization.
+
+    Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight,
+    after adding the residual, and quantizes the result to int8.
+    Refer to https://arxiv.org/abs/1910.07467
+    """
+
+    def __init__(self,
+                 hidden_size: int,
+                 eps: float = 1e-6) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        residual: torch.Tensor,
+        scale: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        out = torch.empty_like(x, dtype=torch.int8)
+        ops.fused_add_rms_norm_quant(out, x, residual, self.weight.data,
+                                     self.variance_epsilon)
+        return out, residual
+
+
+class DequantAddResidualRMSNormQuant(nn.Module):
+    """Fused dequantize + residual add + RMS normalization + int8 quantization.
+
+    Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight.
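+    The quantized GEMM output x is first dequantized (per-token when a scale
+    tensor is given, otherwise by the static dequant_scale), added to the
+    residual, normalized, and re-quantized to int8 in one fused kernel.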
+    Refer to https://arxiv.org/abs/1910.07467
+    """
+
+    # TODO(Zhang Ying): use_per_token_dequant
+    def __init__(self,
+                 hidden_size: int,
+                 dequant_scale: float = 1.0,
+                 use_per_token_dequant: bool = True,
+                 eps: float = 1e-6) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+        self.register_parameter(
+            "dequant_scale",
+            torch.nn.Parameter(
+                torch.tensor(dequant_scale,
+                             dtype=torch.float32,
+                             requires_grad=False)))
+        self.use_per_token_dequant = use_per_token_dequant
+
+    def _apply(self, fn):
+        super()._apply(fn)
+        self.dequant_scale.data = self.dequant_scale.cpu()
+        return self
+
+    def to(self, *args, **kwargs):
+        super().to(*args, **kwargs)
+        self.dequant_scale.data = self.dequant_scale.to(*args, **kwargs)
+        self.dequant_scale.data = self.dequant_scale.to(torch.float32)
+        return self
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        residual: torch.Tensor,
+        scale: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        out = torch.empty_like(x, dtype=torch.int8)
+        if self.use_per_token_dequant and scale is not None:
+            ops.dequant_fused_add_rms_norm_quant(
+                out, x, residual, self.weight.data, self.variance_epsilon,
+                scale, self.dequant_scale.item())
+        else:
+            ops.dequant_fused_add_rms_norm_quant(
+                out, x, residual, self.weight.data, self.variance_epsilon,
+                None, self.dequant_scale.item())
+        return out, residual
+
+
+class DequantAddResidual(nn.Module):
+    """Dequantizes x and adds it to the (unquantized) residual."""
+
+    def __init__(self,
+                 dequant_scale: float = 1.0,
+                 use_per_token_dequant: bool = True) -> None:
+        super().__init__()
+        self.register_parameter(
+            "dequant_scale",
+            torch.nn.Parameter(
+                torch.tensor(dequant_scale,
+                             dtype=torch.float32,
+                             requires_grad=False)))
+        self.use_per_token_dequant = use_per_token_dequant
+
+    def _apply(self, fn):
+        super()._apply(fn)
+        self.dequant_scale.data = self.dequant_scale.cpu()
+        return self
+
+    def to(self, *args, **kwargs):
+        super().to(*args, **kwargs)
+        self.dequant_scale.data = self.dequant_scale.to(*args, **kwargs)
+        self.dequant_scale.data = self.dequant_scale.to(torch.float32)
+        return self
+
+    def forward(self,
+                x: torch.Tensor,
+                residual: torch.Tensor,
+                scale: Optional[torch.Tensor] = None) -> torch.Tensor:
+        out = torch.empty_like(residual)
+        if self.use_per_token_dequant and scale is not None:
+            ops.dequant_add_residual(out, x, residual, scale,
+                                     self.dequant_scale.item())
+        else:
+            ops.dequant_add_residual(out, x, residual, None,
+                                     self.dequant_scale.item())
+        return out
+
+
+class AddResidual(DequantAddResidual):
+    """Plain residual add; keeps the DequantAddResidual interface."""
+
+    def __init__(self,
+                 dequant_scale: float = 1.0,
+                 use_per_token_dequant: bool = True):
+        super().__init__(dequant_scale, use_per_token_dequant)
+
+    def forward(self,
+                x: torch.Tensor,
+                residual: torch.Tensor,
+                scale: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return x + residual
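+
+
+# Usage sketch (hypothetical wiring, not part of this file's API): a
+# smoothquant decoder layer is expected to chain these modules roughly as
+#     x_i8 = RMSNormQuant(hidden_size)(x)       # fp16 -> int8 activations
+#     y = quant_linear(x_i8)                    # int8 GEMM
+#     x_i8, residual = DequantAddResidualRMSNormQuant(hidden_size)(
+#         y, residual, scale)                   # dequant + add + renorm + quant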
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
new file mode 100644
index 0000000..915e04d
--- /dev/null
+++ b/vllm/model_executor/layers/linear.py
@@ -0,0 +1,754 @@
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+import ixformer.functions as F
+from torch.nn.parameter import Parameter
+
+from vllm.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.model_executor.parallel_utils.communication_op import (
+    tensor_model_parallel_all_reduce, tensor_model_parallel_all_gather)
+from vllm.model_executor.parallel_utils.utils import (
+    divide, split_tensor_along_last_dim)
+from vllm.model_executor.utils import set_weight_attrs
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def adjust_marlin_shard(param, shard_size, shard_offset):
+    marlin_tile_size = getattr(param, "marlin_tile_size", None)
+    if marlin_tile_size is None:
+        return shard_size, shard_offset
+
+    return shard_size * marlin_tile_size, shard_offset * marlin_tile_size
+
+
+class LinearMethodBase(ABC):
+    """Base class for different (maybe quantized) linear methods."""
+
+    @abstractmethod
+    def create_weights(self, input_size_per_partition: int,
+                       output_size_per_partition: int, input_size: int,
+                       output_size: int,
+                       params_dtype: torch.dtype) -> Dict[str, Any]:
+        """Create weights for a linear layer."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def apply_weights(self,
+                      weights: Dict[str, torch.Tensor],
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Apply the weights to the input tensor."""
+        raise NotImplementedError
+
+
+class UnquantizedLinearMethod(LinearMethodBase):
+    """Linear method without quantization.
+
+    Args:
+        separate_bias_add: If true, add bias separately after matrix
+                           multiplication.
+    """
+
+    def __init__(self, separate_bias_add: bool = True):
+        self.separate_bias_add = separate_bias_add
+
+    def create_weights(self, input_size_per_partition: int,
+                       output_size_per_partition: int, input_size: int,
+                       output_size: int,
+                       params_dtype: torch.dtype) -> Dict[str, Any]:
+        weight = Parameter(torch.empty(output_size_per_partition,
+                                       input_size_per_partition,
+                                       dtype=params_dtype),
+                           requires_grad=False)
+        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
+        return {"weight": weight}
+
+    def apply_weights(self,
+                      weights: Dict[str, torch.Tensor],
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        weight = weights["weight"]
+        if self.separate_bias_add:
+            if bias is not None:
+                return F.linear(x, weight) + bias
+            return F.linear(x, weight)
+        return F.linear(x, weight, bias)
+
+
+class ReplicatedLinear(torch.nn.Module):
+    """Replicated linear layer.
+
+    Args:
+        input_size: input dimension of the linear layer.
+        output_size: output dimension of the linear layer.
+        bias: If true, add bias.
+        skip_bias_add: If true, skip adding bias but instead return it.
+        params_dtype: Data type for the parameters.
+        linear_method: (Maybe quantized) linear method.
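+
+    forward() returns an (output, output_bias) tuple; output_bias is None
+    unless skip_bias_add is set.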
+ """ + + def __init__( + self, + input_size: int, + output_size: int, + bias: bool = True, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + self.skip_bias_add = skip_bias_add + if params_dtype is None: + params_dtype = torch.get_default_dtype() + self.params_dtype = params_dtype + if linear_method is None: + linear_method = UnquantizedLinearMethod() + self.linear_method = linear_method + self.linear_weights = self.linear_method.create_weights( + self.input_size, self.output_size, self.input_size, + self.output_size, self.params_dtype) + for name, weight in self.linear_weights.items(): + if isinstance(weight, torch.Tensor): + self.register_parameter(name, weight) + if bias: + self.bias = Parameter( + torch.empty(self.output_size, dtype=self.params_dtype)) + set_weight_attrs(self.bias, {"output_dim": 0}) + else: + self.register_parameter("bias", None) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + bias = self.bias if not self.skip_bias_add else None + output = self.linear_method.apply_weights(self.linear_weights, x, bias) + output_bias = self.bias if self.skip_bias_add else None + return output, output_bias + + +class ColumnParallelLinear(torch.nn.Module): + """Linear layer with column parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its second dimension as A = [A_1, ..., A_p]. + + Args: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + bias: If true, add bias. + gather_output: If true, call all-gather on output and make Y available + to all GPUs, otherwise, every GPU will have its output + which is Y_i = XA_i + skip_bias_add: This was added to enable performance optimizations where + bias can be fused with other element-wise operations. we + skip adding bias but instead return it. + params_dtype: Data type for the parameters. + linear_method: (Maybe quantized) linear method. + """ + + def __init__( + self, + input_size: int, + output_size: int, + bias: bool = True, + gather_output: bool = False, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + self.gather_output = gather_output + # Divide the weight matrix along the last dimension. 
+        tp_size = get_tensor_model_parallel_world_size()
+        self.output_size_per_partition = divide(output_size, tp_size)
+        self.skip_bias_add = skip_bias_add
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.params_dtype = params_dtype
+        if linear_method is None:
+            linear_method = UnquantizedLinearMethod()
+        self.linear_method = linear_method
+        self.linear_weights = self.linear_method.create_weights(
+            self.input_size, self.output_size_per_partition, self.input_size,
+            self.output_size, self.params_dtype)
+        for name, weight in self.linear_weights.items():
+            if isinstance(weight, torch.Tensor):
+                self.register_parameter(name, weight)
+                set_weight_attrs(weight, {"weight_loader": self.weight_loader})
+        if bias:
+            self.bias = Parameter(
+                torch.empty(self.output_size_per_partition,
+                            dtype=params_dtype))
+            set_weight_attrs(self.bias, {
+                "output_dim": 0,
+                "weight_loader": self.weight_loader,
+            })
+        else:
+            self.register_parameter("bias", None)
+
+    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
+        tp_rank = get_tensor_model_parallel_rank()
+        output_dim = getattr(param, "output_dim", None)
+        param_data = param.data
+        if output_dim is not None:
+            shard_size = param_data.shape[output_dim]
+            start_idx = tp_rank * shard_size
+            loaded_weight = loaded_weight.narrow(output_dim, start_idx,
+                                                 shard_size)
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+    def forward(self, input_):
+        bias = self.bias if not self.skip_bias_add else None
+
+        # Matrix multiply.
+        output_parallel = self.linear_method.apply_weights(
+            self.linear_weights, input_, bias)
+        if self.gather_output:
+            # All-gather across the partitions.
+            output = tensor_model_parallel_all_gather(output_parallel)
+        else:
+            output = output_parallel
+        output_bias = self.bias if self.skip_bias_add else None
+        return output, output_bias
+
+
+class MergedColumnParallelLinear(ColumnParallelLinear):
+    """Packed linear layers with column parallelism.
+
+    Similar to ColumnParallelLinear, but the weight matrix is concatenated
+    along the output dimension. When the weight matrix is loaded, the
+    different partitions are sharded separately.
+
+    Args:
+        input_size: input dimension of the linear layer.
+        output_sizes: list of output dimensions of the linear layer.
+        bias: If true, add bias.
+        gather_output: If true, call all-gather on output and make the output
+                       available to all GPUs, otherwise, every GPU will have
+                       its own output.
+        skip_bias_add: This was added to enable performance optimizations where
+                       bias can be fused with other element-wise operations. we
+                       skip adding bias but instead return it.
+        params_dtype: Data type for the parameters.
+        linear_method: (Maybe quantized) linear method.
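+
+    Example: a LLaMA-style fused gate/up projection can pass
+    output_sizes=[intermediate_size, intermediate_size] so both projections
+    run as one GEMM while their checkpoint tensors load shard by shard.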
+ """ + + def __init__( + self, + input_size: int, + output_sizes: List[int], + bias: bool = True, + gather_output: bool = False, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + linear_method: Optional[LinearMethodBase] = None, + ): + self.output_sizes = output_sizes + tp_size = get_tensor_model_parallel_world_size() + assert all(output_size % tp_size == 0 for output_size in output_sizes) + super().__init__(input_size, sum(output_sizes), bias, gather_output, + skip_bias_add, params_dtype, linear_method) + + def weight_loader(self, + param: Parameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[int] = None): + param_data = param.data + output_dim = getattr(param, "output_dim", None) + if loaded_shard_id is None: + # Loaded weight is already packed. + if output_dim is None: + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + return + current_shard_offset = 0 + shard_offsets = [] + for i, output_size in enumerate(self.output_sizes): + shard_offsets.append((i, current_shard_offset, output_size)) + current_shard_offset += output_size + packed_dim = getattr(param, "packed_dim", None) + for shard_id, shard_offset, shard_size in shard_offsets: + # If quantized, we need to adjust the offset and size to account + # for the packing. + if packed_dim == output_dim: + shard_size = shard_size // param.pack_factor + shard_offset = shard_offset // param.pack_factor + + # If marlin, we need to adjust the offset and size to account for the tiling. + shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset) + + loaded_weight_shard = loaded_weight.narrow( + output_dim, shard_offset, shard_size) + self.weight_loader(param, loaded_weight_shard, shard_id) + return + + assert loaded_shard_id < len(self.output_sizes) + tp_rank = get_tensor_model_parallel_rank() + tp_size = get_tensor_model_parallel_world_size() + if output_dim is not None: + shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size + shard_size = self.output_sizes[loaded_shard_id] // tp_size + # If quantized, we need to adjust the offset and size to account + # for the packing. + packed_dim = getattr(param, "packed_dim", None) + if packed_dim == output_dim: + shard_size = shard_size // param.pack_factor + shard_offset = shard_offset // param.pack_factor + + # If marlin, we need to adjust the offset and size to account for the tiling. + shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset) + + param_data = param_data.narrow(output_dim, shard_offset, + shard_size) + start_idx = tp_rank * shard_size + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) + else: + ignore_warning = getattr(param, "ignore_warning", False) + if not ignore_warning: + logger.warning( + "Loading a weight without `output_dim` attribute in " + "MergedColumnParallelLinear, assume the weight is " + "the same for all partitions.") + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + +class QKVParallelLinear(ColumnParallelLinear): + """Linear layers for the attention's QKV transformation. + + Linear layers for the linear transformation of the query, key, and value + vectors in the attention layer. The weight matrix is concatenated along + the output dimension. The layer is parallelized along the head dimension. 
+    When the number of key/value heads is smaller than the number of query
+    heads (e.g., multi-query/grouped-query attention), the key/value head may
+    be replicated while the query heads are partitioned.
+
+    Args:
+        hidden_size: input hidden state size of the transformer.
+        head_size: size of each attention head.
+        total_num_heads: total number of attention query heads.
+        total_num_kv_heads: total number of attention key/value heads. If
+                            None, assume total_num_kv_heads = total_num_heads.
+        bias: If true, add bias.
+        skip_bias_add: This was added to enable performance optimizations where
+                       bias can be fused with other element-wise operations. we
+                       skip adding bias but instead return it.
+        params_dtype: Data type for the parameters.
+        linear_method: (Maybe quantized) linear method.
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        head_size: int,
+        total_num_heads: int,
+        total_num_kv_heads: Optional[int] = None,
+        bias: bool = True,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        self.hidden_size = hidden_size
+        self.head_size = head_size
+        self.total_num_heads = total_num_heads
+        if total_num_kv_heads is None:
+            total_num_kv_heads = total_num_heads
+        self.total_num_kv_heads = total_num_kv_heads
+        # Divide the weight matrix along the last dimension.
+        tp_size = get_tensor_model_parallel_world_size()
+        self.num_heads = divide(self.total_num_heads, tp_size)
+        if tp_size >= self.total_num_kv_heads:
+            self.num_kv_heads = 1
+            self.num_kv_head_replicas = divide(tp_size,
+                                               self.total_num_kv_heads)
+        else:
+            self.num_kv_heads = divide(self.total_num_kv_heads, tp_size)
+            self.num_kv_head_replicas = 1
+        input_size = self.hidden_size
+        output_size = (self.num_heads +
+                       2 * self.num_kv_heads) * tp_size * self.head_size
+        super().__init__(input_size, output_size, bias, False, skip_bias_add,
+                         params_dtype, linear_method)
+
+    def weight_loader(self,
+                      param: Parameter,
+                      loaded_weight: torch.Tensor,
+                      loaded_shard_id: Optional[str] = None):
+        param_data = param.data
+        output_dim = getattr(param, "output_dim", None)
+
+        if loaded_shard_id is None:
+            # Loaded weight is already packed.
+            if output_dim is None:
+                assert param_data.shape == loaded_weight.shape
+                param_data.copy_(loaded_weight)
+                return
+            shard_offsets = [
+                # (shard_id, shard_offset, shard_size)
+                ("q", 0, self.total_num_heads * self.head_size),
+                ("k", self.total_num_heads * self.head_size,
+                 self.total_num_kv_heads * self.head_size),
+                ("v", (self.total_num_heads + self.total_num_kv_heads) *
+                 self.head_size, self.total_num_kv_heads * self.head_size),
+            ]
+            packed_dim = getattr(param, "packed_dim", None)
+            for shard_id, shard_offset, shard_size in shard_offsets:
+                # If quantized, we need to adjust the offset and size to
+                # account for the packing.
+                if packed_dim == output_dim:
+                    shard_size = shard_size // param.pack_factor
+                    shard_offset = shard_offset // param.pack_factor
+
+                    # If marlin, we need to adjust the offset and size to
+                    # account for the tiling.
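+                    # (adjust_marlin_shard is a no-op unless the param
+                    # carries a marlin_tile_size attribute.)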
+                    shard_size, shard_offset = adjust_marlin_shard(
+                        param, shard_size, shard_offset)
+
+                loaded_weight_shard = loaded_weight.narrow(
+                    output_dim, shard_offset, shard_size)
+                self.weight_loader(param, loaded_weight_shard, shard_id)
+            return
+
+        tp_rank = get_tensor_model_parallel_rank()
+        assert loaded_shard_id in ["q", "k", "v"]
+        if output_dim is not None:
+            if loaded_shard_id == "q":
+                shard_offset = 0
+                shard_size = self.num_heads * self.head_size
+            elif loaded_shard_id == "k":
+                shard_offset = self.num_heads * self.head_size
+                shard_size = self.num_kv_heads * self.head_size
+            elif loaded_shard_id == "v":
+                shard_offset = (self.num_heads +
+                                self.num_kv_heads) * self.head_size
+                shard_size = self.num_kv_heads * self.head_size
+            # If quantized, we need to adjust the offset and size to account
+            # for the packing.
+            packed_dim = getattr(param, "packed_dim", None)
+            if packed_dim == output_dim:
+                shard_size = shard_size // param.pack_factor
+                shard_offset = shard_offset // param.pack_factor
+
+                # If marlin, we need to adjust the offset and size to account
+                # for the tiling.
+                shard_size, shard_offset = adjust_marlin_shard(
+                    param, shard_size, shard_offset)
+
+            param_data = param_data.narrow(output_dim, shard_offset,
+                                           shard_size)
+            if loaded_shard_id == "q":
+                shard_id = tp_rank
+            else:
+                shard_id = tp_rank // self.num_kv_head_replicas
+            start_idx = shard_id * shard_size
+            loaded_weight = loaded_weight.narrow(output_dim, start_idx,
+                                                 shard_size)
+        else:
+            ignore_warning = getattr(param, "ignore_warning", False)
+            if not ignore_warning:
+                logger.warning(
+                    "Loading a weight without `output_dim` attribute in "
+                    "QKVParallelLinear, assume the weight is the same "
+                    "for all partitions.")
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+
+class RowParallelLinear(torch.nn.Module):
+    """Linear layer with row parallelism.
+
+    The linear layer is defined as Y = XA + b. A is parallelized along
+    its first dimension and X along its second dimension as:
+               -   -
+              | A_1 |
+              | .   |
+          A = | .   |        X = [X_1, ..., X_p]
+              | .   |
+              | A_p |
+               -   -
+    Arguments:
+        input_size: first dimension of matrix A.
+        output_size: second dimension of matrix A.
+        bias: If true, add bias. Note that bias is not parallelized.
+        input_is_parallel: If true, we assume that the input is already
+                           split across the GPUs and we do not split
+                           again.
+        skip_bias_add: This was added to enable performance optimization where
+                       bias can be fused with other element-wise operations.
+                       We skip adding bias but instead return it.
+        params_dtype: Data type for the parameters.
+        linear_method: (Maybe quantized) linear method.
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        bias: bool = True,
+        input_is_parallel: bool = True,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        reduce_results: bool = True,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        # Keep input parameters
+        self.input_size = input_size
+        self.output_size = output_size
+        self.input_is_parallel = input_is_parallel
+        self.reduce_results = reduce_results
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.params_dtype = params_dtype
+
+        # Divide the weight matrix along the last dimension.
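+        # Each rank holds input_size // tp_size of the input features, i.e.
+        # one block of rows of A in Y = XA.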
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.input_size_per_partition = divide(input_size, self.tp_size)
+        self.skip_bias_add = skip_bias_add
+        if linear_method is None:
+            linear_method = UnquantizedLinearMethod()
+        self.linear_method = linear_method
+        self.linear_weights = self.linear_method.create_weights(
+            self.input_size_per_partition, self.output_size, self.input_size,
+            self.output_size, self.params_dtype)
+        for name, weight in self.linear_weights.items():
+            if isinstance(weight, torch.Tensor):
+                self.register_parameter(name, weight)
+                set_weight_attrs(weight, {"weight_loader": self.weight_loader})
+
+        if not reduce_results and (bias and not skip_bias_add):
+            raise ValueError("When reduce_results=False, adding bias to the "
+                             "results can lead to incorrect results")
+
+        if bias:
+            self.bias = Parameter(
+                torch.empty(self.output_size, dtype=params_dtype))
+            set_weight_attrs(self.bias, {
+                "output_dim": 0,
+                "weight_loader": self.weight_loader,
+            })
+        else:
+            self.register_parameter("bias", None)
+
+    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
+        tp_rank = get_tensor_model_parallel_rank()
+        input_dim = getattr(param, "input_dim", None)
+        param_data = param.data
+        if input_dim is not None:
+            shard_size = param_data.shape[input_dim]
+            start_idx = tp_rank * shard_size
+            loaded_weight = loaded_weight.narrow(input_dim, start_idx,
+                                                 shard_size)
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+    def forward(self, input_):
+        # Set up backprop all-reduce.
+        if self.input_is_parallel:
+            input_parallel = input_
+        else:
+            tp_rank = get_tensor_model_parallel_rank()
+            splitted_input = split_tensor_along_last_dim(
+                input_, num_partitions=self.tp_size)
+            input_parallel = splitted_input[tp_rank].contiguous()
+
+        # Matrix multiply.
+        output_parallel = self.linear_method.apply_weights(
+            self.linear_weights, input_parallel)
+        if self.reduce_results and self.tp_size > 1:
+            output_ = tensor_model_parallel_all_reduce(output_parallel)
+        else:
+            output_ = output_parallel
+
+        if not self.skip_bias_add:
+            output = output_ + self.bias if self.bias is not None else output_
+            output_bias = None
+        else:
+            output = output_
+            output_bias = self.bias
+        return output, output_bias
+
+
+# ↓ add for smoothquant
+class QuantMergedColumnParallelLinear(MergedColumnParallelLinear):
+
+    def __init__(
+        self,
+        input_size: int,
+        output_sizes: List[int],
+        bias: bool = True,
+        gather_output: bool = False,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        linear_method: Optional[LinearMethodBase] = None,
+        dequant_scale: float = 1.0,
+    ):
+        super().__init__(input_size, output_sizes, bias, gather_output,
+                         skip_bias_add, params_dtype, linear_method)
+        self.register_parameter(
+            "dequant_scale",
+            torch.nn.Parameter(
+                torch.tensor(dequant_scale,
+                             dtype=torch.float32,
+                             requires_grad=False)))
+
+    def _apply(self, fn):
+        super()._apply(fn)
+        self.dequant_scale.data = self.dequant_scale.cpu()
+        return self
+
+    def to(self, *args, **kwargs):
+        super().to(*args, **kwargs)
+        self.dequant_scale.data = self.dequant_scale.to(*args, **kwargs)
+        self.dequant_scale.data = self.dequant_scale.to(torch.float32)
+        return self
+
+    def forward(self, input_):
+        bias = self.bias if not self.skip_bias_add else None
+
+        # Matrix multiply.
+        output_parallel = self.linear_method.apply_weights(
+            self.linear_weights, input_, bias, scale=None, dequant_scale=1.0)
+        if self.gather_output:
+            # All-gather across the partitions.
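+            # Gathers every rank's output shard so the caller sees the full
+            # merged output size.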
+            output = tensor_model_parallel_all_gather(output_parallel)
+        else:
+            output = output_parallel
+        output_bias = self.bias if self.skip_bias_add else None
+        return output, output_bias
+
+
+class QuantQKVParallelLinear(QKVParallelLinear):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        head_size: int,
+        total_num_heads: int,
+        total_num_kv_heads: Optional[int] = None,
+        bias: bool = True,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        linear_method: Optional[LinearMethodBase] = None,
+        q_dequant_scale: float = 1.0,
+        k_dequant_scale: float = 1.0,
+        v_dequant_scale: float = 1.0,
+    ):
+        super().__init__(hidden_size, head_size, total_num_heads,
+                         total_num_kv_heads, bias, skip_bias_add,
+                         params_dtype, linear_method)
+        self.register_parameter(
+            "q_dequant_scale",
+            torch.nn.Parameter(
+                torch.tensor(q_dequant_scale,
+                             dtype=torch.float32,
+                             requires_grad=False)))
+        self.register_parameter(
+            "k_dequant_scale",
+            torch.nn.Parameter(
+                torch.tensor(k_dequant_scale,
+                             dtype=torch.float32,
+                             requires_grad=False)))
+        self.register_parameter(
+            "v_dequant_scale",
+            torch.nn.Parameter(
+                torch.tensor(v_dequant_scale,
+                             dtype=torch.float32,
+                             requires_grad=False)))
+
+    def _apply(self, fn):
+        super()._apply(fn)
+        self.q_dequant_scale.data = self.q_dequant_scale.cpu()
+        self.k_dequant_scale.data = self.k_dequant_scale.cpu()
+        self.v_dequant_scale.data = self.v_dequant_scale.cpu()
+        return self
+
+    def to(self, *args, **kwargs):
+        super().to(*args, **kwargs)
+        self.q_dequant_scale.data = self.q_dequant_scale.to(*args, **kwargs)
+        self.q_dequant_scale.data = self.q_dequant_scale.to(torch.float32)
+        self.k_dequant_scale.data = self.k_dequant_scale.to(*args, **kwargs)
+        self.k_dequant_scale.data = self.k_dequant_scale.to(torch.float32)
+        self.v_dequant_scale.data = self.v_dequant_scale.to(*args, **kwargs)
+        self.v_dequant_scale.data = self.v_dequant_scale.to(torch.float32)
+        return self
+
+    def forward(self, input_):
+        bias = self.bias if not self.skip_bias_add else None
+
+        # Matrix multiply.
+        output_parallel = self.linear_method.apply_weights(
+            self.linear_weights, input_, bias, scale=None, dequant_scale=1.0)
+        if self.gather_output:
+            # All-gather across the partitions.
+            output = tensor_model_parallel_all_gather(output_parallel)
+        else:
+            output = output_parallel
+        output_bias = self.bias if self.skip_bias_add else None
+        return output, output_bias
+
+
+class QuantRowParallelLinear(RowParallelLinear):
+
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        bias: bool = True,
+        input_is_parallel: bool = True,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        reduce_results: bool = True,
+        linear_method: Optional[LinearMethodBase] = None,
+        dequant_scale: float = 1.0,
+    ):
+        super().__init__(input_size, output_size, bias, input_is_parallel,
+                         skip_bias_add, params_dtype, reduce_results,
+                         linear_method)
+        self.register_parameter(
+            "dequant_scale",
+            torch.nn.Parameter(
+                torch.tensor(dequant_scale,
+                             dtype=torch.float32,
+                             requires_grad=False)))
+
+    def _apply(self, fn):
+        super()._apply(fn)
+        self.dequant_scale.data = self.dequant_scale.cpu()
+        return self
+
+    def to(self, *args, **kwargs):
+        super().to(*args, **kwargs)
+        self.dequant_scale.data = self.dequant_scale.to(*args, **kwargs)
+        self.dequant_scale.data = self.dequant_scale.to(torch.float32)
+        return self
+
+    def forward(self, input_, scale=None):
+        # Set up backprop all-reduce.
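+        # scale, when given, carries per-token activation scales from the
+        # preceding quantized op; it is forwarded to the int8 GEMM below
+        # (an assumed smoothquant extension of apply_weights).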
+        if self.input_is_parallel:
+            input_parallel = input_
+        else:
+            tp_rank = get_tensor_model_parallel_rank()
+            splitted_input = split_tensor_along_last_dim(
+                input_, num_partitions=self.tp_size)
+            input_parallel = splitted_input[tp_rank].contiguous()
+
+        # Matrix multiply.
+        output_parallel = self.linear_method.apply_weights(
+            self.linear_weights,
+            input_parallel,
+            self.bias,
+            scale=scale,
+            dequant_scale=self.dequant_scale.item(),
+            is_row=True)
+        if self.reduce_results and self.tp_size > 1:
+            output_ = tensor_model_parallel_all_reduce(output_parallel)
+        else:
+            output_ = output_parallel
+
+        if not self.skip_bias_add:
+            output = output_ + self.bias if self.bias is not None else output_
+            output_bias = None
+        else:
+            output = output_
+            output_bias = self.bias
+        return output, output_bias
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
new file mode 100644
index 0000000..caf7283
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -0,0 +1,28 @@
+from typing import Type
+
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+from vllm.model_executor.layers.quantization.awq import AWQConfig
+from vllm.model_executor.layers.quantization.gptq import GPTQConfig
+from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
+from vllm.model_executor.layers.quantization.marlin import MarlinConfig
+from vllm.model_executor.layers.quantization.smoothquant import SmoothQuantConfig
+
+_QUANTIZATION_CONFIG_REGISTRY = {
+    "awq": AWQConfig,
+    "gptq": GPTQConfig,
+    "squeezellm": SqueezeLLMConfig,
+    "marlin": MarlinConfig,
+    "smoothquant": SmoothQuantConfig,
+}
+
+
+def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
+    if quantization not in _QUANTIZATION_CONFIG_REGISTRY:
+        raise ValueError(f"Invalid quantization method: {quantization}")
+    return _QUANTIZATION_CONFIG_REGISTRY[quantization]
+
+
+__all__ = [
+    "QuantizationConfig",
+    "get_quantization_config",
+]
diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py
new file mode 100644
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/awq.py
+from typing import Any, Dict, List, Optional
+
+import torch
+from torch.nn.parameter import Parameter
+
+from vllm._C import ops
+from vllm.model_executor.layers.linear import (LinearMethodBase,
+                                               set_weight_attrs)
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+
+
+class AWQConfig(QuantizationConfig):
+    """Config class for AWQ.
+
+    Reference: https://arxiv.org/abs/2306.00978
+    """
+
+    def __init__(
+        self,
+        weight_bits: int,
+        group_size: int,
+        zero_point: bool,
+    ) -> None:
+        self.weight_bits = weight_bits
+        self.group_size = group_size
+        self.zero_point = zero_point
+
+        if self.weight_bits != 4:
+            raise ValueError(
+                "Currently, only 4-bit weight quantization is supported for "
+                f"AWQ, but got {self.weight_bits} bits.")
+        self.pack_factor = 32 // self.weight_bits
+
+    def __repr__(self) -> str:
+        return (f"AWQConfig(weight_bits={self.weight_bits}, "
+                f"group_size={self.group_size}, "
+                f"zero_point={self.zero_point})")
+
+    def get_name(self) -> str:
+        return "awq"
+
+    def get_supported_act_dtypes(self) -> List[torch.dtype]:
+        return [torch.half]
+
+    def get_min_capability(self) -> int:
+        # The AWQ kernel only supports Turing or newer GPUs.
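+        # (Compute capability 75 == Turing, e.g. T4 / RTX 20-series.)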
+        return 75
+
+    @staticmethod
+    def get_config_filenames() -> List[str]:
+        return [
+            "quant_config.json",  # E.g., casperhansen/vicuna-7b-v1.5-awq
+            "quantize_config.json",  # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq
+        ]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "AWQConfig":
+        weight_bits = cls.get_from_keys(config, ["w_bit", "bits"])
+        group_size = cls.get_from_keys(config, ["q_group_size", "group_size"])
+        zero_point = cls.get_from_keys(config, ["zero_point"])
+        return cls(weight_bits, group_size, zero_point)
+
+    def get_linear_method(self) -> "AWQLinearMethod":
+        return AWQLinearMethod(self)
+
+    def get_scaled_act_names(self) -> List[str]:
+        return ["gelu", "gelu_fast", "gelu_new", "gelu_pytorch_tanh"]
+
+
+class AWQLinearMethod(LinearMethodBase):
+    """Linear method for AWQ.
+
+    Args:
+        quant_config: The AWQ quantization config.
+    """
+
+    def __init__(self, quant_config: AWQConfig):
+        self.quant_config = quant_config
+
+    def create_weights(self, input_size_per_partition: int,
+                       output_size_per_partition: int, input_size: int,
+                       output_size: int,
+                       params_dtype: torch.dtype) -> Dict[str, Any]:
+        if input_size_per_partition % self.quant_config.group_size != 0:
+            raise ValueError(
+                "The input size is not aligned with the quantized "
+                "weight shape. This can be caused by too large "
+                "tensor parallel size.")
+        if output_size_per_partition % self.quant_config.pack_factor != 0:
+            raise ValueError(
+                "The output size is not aligned with the quantized "
+                "weight shape. This can be caused by too large "
+                "tensor parallel size.")
+
+        qweight = Parameter(
+            torch.empty(
+                input_size_per_partition,
+                output_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            qweight, {
+                "input_dim": 0,
+                "output_dim": 1,
+                "packed_dim": 1,
+                "pack_factor": self.quant_config.pack_factor,
+            })
+        qzeros = Parameter(
+            torch.empty(
+                input_size_per_partition // self.quant_config.group_size,
+                output_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            qzeros, {
+                "input_dim": 0,
+                "output_dim": 1,
+                "packed_dim": 1,
+                "pack_factor": self.quant_config.pack_factor,
+            })
+        scales = Parameter(
+            torch.empty(
+                input_size_per_partition // self.quant_config.group_size,
+                output_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(scales, {
+            "input_dim": 0,
+            "output_dim": 1,
+        })
+        return {
+            "qweight": qweight,
+            "qzeros": qzeros,
+            "scales": scales,
+        }
+
+    def apply_weights(self,
+                      weights: Dict[str, Any],
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        qweight = weights["qweight"]
+        scales = weights["scales"]
+        qzeros = weights["qzeros"]
+        pack_factor = self.quant_config.pack_factor
+        out_shape = (x.shape[:-1] + (qweight.shape[-1] * pack_factor, ))
+        reshaped_x = x.reshape(-1, x.shape[-1])
+
+        out = ops.awq_gemm(reshaped_x, qweight, scales, qzeros, pack_factor)
+        # TODO align
+        """
+        # num_tokens >= threshold
+        FP16_MATMUL_HEURISTIC_CONDITION = x.shape[:-1].numel() >= 256
+
+        if FP16_MATMUL_HEURISTIC_CONDITION:
+            out = ops.awq_dequantize(qweight, scales, qzeros, 0, 0, 0)
+            out = torch.matmul(reshaped_x, out)
+        else:
+            out = ops.awq_gemm(reshaped_x, qweight, scales, qzeros,
+                               pack_factor)
+        """
+        if bias is not None:
+            out = out + bias
+        return out.reshape(out_shape)
diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
new file mode 100644
index 0000000..6115e7c
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -0,0 +1,64 @@
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List
+
+import torch
+
+from vllm.model_executor.layers.linear import LinearMethodBase
+
+
+class QuantizationConfig(ABC):
+    """Base class for quantization configs."""
+
+    @abstractmethod
+    def get_name(self) -> str:
+        """Name of the quantization method."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_supported_act_dtypes(self) -> List[torch.dtype]:
+        """List of supported activation dtypes."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_min_capability(self) -> int:
+        """Minimum GPU capability to support the quantization method.
+
+        E.g., 70 for Volta, 75 for Turing, 80 for Ampere.
+        This requirement is due to the custom CUDA kernels used by the
+        quantization method.
+        """
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def get_config_filenames() -> List[str]:
+        """List of filenames to search for in the model directory."""
+        raise NotImplementedError
+
+    @classmethod
+    @abstractmethod
+    def from_config(cls, config: Dict[str, Any]) -> "QuantizationConfig":
+        """Create a config class from the model's quantization config."""
+        raise NotImplementedError
+
+    @staticmethod
+    def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any:
+        """Get a value from the model's quantization config."""
+        for key in keys:
+            if key in config:
+                return config[key]
+        raise ValueError(f"Cannot find any of {keys} in the model's "
+                         "quantization config.")
+
+    @abstractmethod
+    def get_linear_method(self) -> LinearMethodBase:
+        """Get the linear method to use for the quantized linear layer."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_scaled_act_names(self) -> List[str]:
+        """Returns the activation function names that should be post-scaled.
+
+        For now, this is only used by AWQ.
+        """
+        raise NotImplementedError
diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py
new file mode 100644
index 0000000..9d4f8da
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/gptq.py
@@ -0,0 +1,218 @@
+import enum
+from enum import Enum
+from typing import Any, Dict, List, Optional
+from fractions import Fraction
+
+import torch
+from torch.nn.parameter import Parameter
+
+from vllm._C import ops
+from vllm.model_executor.layers.linear import (LinearMethodBase,
+                                               set_weight_attrs)
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+
+
+class GPTQConfig(QuantizationConfig):
+    """Config class for GPTQ.
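+
+    Several low-bit weights are packed into each 32-bit storage word, so
+    pack_factor below is the fraction 32 / weight_bits.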
+
+    Reference: https://arxiv.org/abs/2210.17323
+    """
+
+    def __init__(
+        self,
+        weight_bits: int,
+        group_size: int,
+        desc_act: bool,
+    ) -> None:
+        self.weight_bits = weight_bits
+        self.group_size = group_size
+        self.desc_act = desc_act
+        self.pack_factor = Fraction(32, self.weight_bits)
+        if self.weight_bits not in [2, 3, 4, 8]:
+            raise ValueError(
+                "Currently, only 2/3/4/8-bit weight quantization is "
+                f"supported for GPTQ, but got {self.weight_bits} bits.")
+
+    def __repr__(self) -> str:
+        return (f"GPTQConfig(weight_bits={self.weight_bits}, "
+                f"group_size={self.group_size}, "
+                f"desc_act={self.desc_act})")
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "gptq"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.half]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        # Need to figure it out
+        return 60
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["quantize_config.json"]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "GPTQConfig":
+        weight_bits = cls.get_from_keys(config, ["bits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        desc_act = cls.get_from_keys(config, ["desc_act"])
+        return cls(weight_bits, group_size, desc_act)
+
+    def get_linear_method(self) -> "GPTQLinearMethod":
+        return GPTQLinearMethod(self)
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class ExllamaState(Enum):
+
+    UNUSED = enum.auto()
+    UNINITIALIZED = enum.auto()
+    READY = enum.auto()
+
+
+class GPTQLinearMethod(LinearMethodBase):
+    """Linear method for GPTQ.
+
+    Args:
+        quant_config: The GPTQ quantization config.
+    """
+
+    def __init__(self, quant_config: GPTQConfig):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        input_size_per_partition: int,
+        output_size_per_partition: int,
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+    ) -> Dict[str, Any]:
+        del output_size  # Unused.
+        if input_size_per_partition % self.quant_config.group_size != 0:
+            raise ValueError(
+                "The input size is not aligned with the quantized "
+                "weight shape. This can be caused by too large "
+                "tensor parallel size.")
+        if (output_size_per_partition %
+                self.quant_config.pack_factor.numerator != 0):
+            raise ValueError(
+                "The output size is not aligned with the quantized "
+                "weight shape. This can be caused by too large "
+                "tensor parallel size.")
+
+        if self.quant_config.group_size != -1:
+            group_size = self.quant_config.group_size
+        else:
+            group_size = input_size
+        exllama_state = ExllamaState.UNINITIALIZED
+        scale_and_zero_size = input_size // group_size
+        scale_and_zero_input_dim = None
+        if (input_size != input_size_per_partition
+                and self.quant_config.group_size != -1):
+            # For act-order models, we cannot use Exllama for row parallel layer
+            if self.quant_config.desc_act:
+                exllama_state = ExllamaState.UNUSED
+                raise NotImplementedError()
+            else:
+                # we need to partition qzeros and scales for exllama kernel
+                scale_and_zero_size = input_size_per_partition // group_size
+                scale_and_zero_input_dim = 0
+
+        qweight = Parameter(
+            torch.empty(
+                input_size_per_partition // self.quant_config.pack_factor,
+                output_size_per_partition,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            qweight, {
+                "input_dim": 0,
+                "output_dim": 1,
+                "packed_dim": 0,
+                "pack_factor": self.quant_config.pack_factor,
+            })
+        g_idx = Parameter(
+            torch.tensor(
+                [
+                    i // self.quant_config.group_size
+                    for i in range(input_size_per_partition)
+                ],
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        # Ignore warning from fused linear layers such as QKVParallelLinear.
+        set_weight_attrs(g_idx, {"input_dim": 0, "ignore_warning": True})
+        qzeros = Parameter(
+            torch.empty(
+                scale_and_zero_size,
+                output_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            qzeros, {
+                "input_dim": scale_and_zero_input_dim,
+                "output_dim": 1,
+                "packed_dim": 1,
+                "pack_factor": self.quant_config.pack_factor,
+            })
+        scales = Parameter(
+            torch.empty(
+                scale_and_zero_size,
+                output_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(scales, {
+            "input_dim": scale_and_zero_input_dim,
+            "output_dim": 1,
+        })
+        return {
+            "qweight": qweight,
+            "g_idx": g_idx,
+            "qzeros": qzeros,
+            "scales": scales,
+            "exllama_state": exllama_state,
+        }
+
+    def apply_weights(self,
+                      weights: Dict[str, Any],
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        qweight = weights["qweight"]
+        out_shape = x.shape[:-1] + (qweight.shape[-1], )
+        reshaped_x = x.reshape(-1, x.shape[-1])
+        # exllama needs to shuffle the weight after the weight is loaded
+        # here we do the shuffle on first forward pass
+        if weights["exllama_state"] == ExllamaState.UNINITIALIZED:
+            if self.quant_config.desc_act:
+                weights["g_idx"] = torch.argsort(weights["g_idx"]).to(
+                    torch.int)
+            else:
+                weights["g_idx"] = None
+            # TODO align
+            """
+            weights["g_idx"] = torch.empty((1, 1), device="meta")
+            """
+            weights["exllama_state"] = ExllamaState.READY
+            ops.gptq_shuffle(weights["qweight"], weights["g_idx"],
+                             self.quant_config.weight_bits)
+        output = ops.gptq_gemm(reshaped_x, weights["qweight"],
+                               weights["qzeros"], weights["scales"],
+                               weights["g_idx"],
+                               weights["exllama_state"] == ExllamaState.READY,
+                               self.quant_config.weight_bits)
+        if bias is not None:
+            output = output + bias
+        return output.reshape(out_shape)
diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py
new file mode 100644
index 0000000..7566d78
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/marlin.py
@@ -0,0 +1,210 @@
+from typing import Any, Dict, List, Optional
+
+import torch
+from torch.nn.parameter import Parameter
+
+from vllm._C import ops
diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py
new file mode 100644
index 0000000..7566d78
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/marlin.py
@@ -0,0 +1,210 @@
+from typing import Any, Dict, List, Optional
+
+import torch
+from torch.nn.parameter import Parameter
+
+from vllm._C import ops
+from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+
+
+class MarlinConfig(QuantizationConfig):
+    """Config class for Marlin.
+
+    Reference: https://github.com/IST-DASLab/marlin/tree/master
+    """
+
+    def __init__(
+        self,
+        group_size: int,
+    ) -> None:
+        # Group size for the quantization.
+        self.group_size = group_size
+        if self.group_size != 128 and self.group_size != -1:
+            raise ValueError(
+                "Currently, only group sizes 128 and -1 (channelwise) are "
+                f"supported for Marlin, but got group_size of {self.group_size}")
+
+        # 4 bits packed into a 32-bit datatype.
+        self.pack_factor = 32 // 4
+
+        # Tile size used by marlin kernels.
+        self.tile_size = 16
+
+        # Min out_features dim.
+        self.min_n_threads = 64
+
+        # Min in_features dim.
+        self.min_k_threads = 128
+
+        # Max parallel problems to solve at once (improves large batch performance).
+        self.max_parallel = 16
+
+        # Permutation length used by the marlin kernels.
+        self.perm_len = 1024
+
+    def __repr__(self) -> str:
+        return f"MarlinConfig(group_size={self.group_size})"
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "marlin"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.half]
+
+    @classmethod
+    # Need to figure it out
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["quantize_config.json"]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "MarlinConfig":
+        group_size = cls.get_from_keys(config, ["group_size"])
+        return cls(group_size)
+
+    def get_linear_method(self) -> "MarlinLinearMethod":
+        return MarlinLinearMethod(self)
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class MarlinLinearMethod(LinearMethodBase):
+    """Linear method for Marlin.
+
+    Args:
+        quant_config: The Marlin quantization config.
+    """
+
+    def __init__(self, quant_config: MarlinConfig):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        input_size_per_partition: int,
+        output_size_per_partition: int,
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+    ) -> Dict[str, Any]:
+        del output_size  # Unused.
+
+        if params_dtype != torch.float16:
+            raise ValueError(
+                f"The params dtype must be float16, but got {params_dtype}")
+
+        # Validate output_size_per_partition.
+        if output_size_per_partition % self.quant_config.min_n_threads != 0:
+            raise ValueError(
+                f"Weight output_size_per_partition = {output_size_per_partition} is not divisible by min_n_threads = {self.quant_config.min_n_threads}."
+            )
+        if output_size_per_partition % self.quant_config.pack_factor != 0:
+            raise ValueError(
+                f"Weight output_size_per_partition = {output_size_per_partition} is not divisible by pack_factor = {self.quant_config.pack_factor}."
+            )
+
+        # Validate input_size_per_partition.
+        if input_size_per_partition % self.quant_config.min_k_threads != 0:
+            raise ValueError(
+                f"Weight input_size_per_partition = {input_size_per_partition} is not divisible by min_k_threads = {self.quant_config.min_k_threads}."
+            )
+        if self.quant_config.group_size != -1 and input_size_per_partition % self.quant_config.group_size != 0:
+            raise ValueError(
+                f"Weight input_size_per_partition = {input_size_per_partition} is not divisible by group_size = {self.quant_config.group_size}."
+            )
+
+        # Check that we have at least 4 tiles horizontally in the shard.
+        num_tiles_per_perm = self.quant_config.perm_len // (
+            self.quant_config.tile_size**2)
+        if output_size_per_partition % num_tiles_per_perm != 0:
+            raise ValueError(
+                "Each permutation group must reside on the same GPU")
+
+        # Quantized 4-bit weights packed into int32.
+        qweight = Parameter(
+            torch.empty(
+                input_size_per_partition // self.quant_config.tile_size,
+                output_size_per_partition * self.quant_config.tile_size //
+                self.quant_config.pack_factor,
+                device="cuda",
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            qweight,
+            {
+                "input_dim": 0,
+                "output_dim": 1,
+                "packed_dim": 1,
+                "pack_factor": self.quant_config.pack_factor,
+                "marlin_tile_size": self.quant_config.tile_size,
+            },
+        )
+
+        # Determine if channelwise or not.
+        input_groups = 1 if self.quant_config.group_size == -1 else input_size_per_partition // self.quant_config.group_size
+
+        scales = Parameter(
+            torch.empty(
+                input_groups,
+                output_size_per_partition,
+                device="cuda",
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            scales,
+            {
+                "input_dim": None if input_groups == 1 else 0,
+                "output_dim": 1,
+            },
+        )
+
+        # Allocate workspace (used for the internal locking mechanism).
+        max_workspace_size = (
+            output_size_per_partition //
+            self.quant_config.min_n_threads) * self.quant_config.max_parallel
+        workspace = Parameter(torch.zeros(max_workspace_size,
+                                          device="cuda",
+                                          dtype=torch.int),
+                              requires_grad=False)
+
+        return {
+            "B": qweight,
+            "s": scales,
+            "workspace": workspace,
+        }
+
+    def apply_weights(
+        self,
+        weights: Dict[str, Any],
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        qweight = weights["B"]
+        scales = weights["s"]
+        workspace = weights["workspace"]
+
+        x_2d = x.view(-1, x.shape[-1])
+
+        size_m = x_2d.shape[0]
+        size_k = x_2d.shape[1]
+        size_n = scales.shape[1]
+
+        output_2d = ops.marlin_gemm(x_2d, qweight, scales, workspace, size_m,
+                                    size_n, size_k)
+
+        output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], ))
+
+        if bias is not None:
+            output.add_(bias)  # In-place add
+
+        return output
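As a quick, illustrative check of the divisibility constraints above (sizes are hypothetical, not from the patch):

    # Illustrative Marlin shard arithmetic; mirrors create_weights() above.
    tile_size, pack_factor = 16, 32 // 4    # 16x16 tiles, 8 4-bit values per int32
    min_n_threads, min_k_threads, max_parallel = 64, 128, 16
    K, N, group_size = 4096, 4096, 128

    assert K % min_k_threads == 0 and N % min_n_threads == 0
    qweight_shape = (K // tile_size, N * tile_size // pack_factor)  # (256, 8192), int32
    scales_shape = (K // group_size, N)                             # (32, 4096), fp16
    workspace_len = (N // min_n_threads) * max_parallel             # 1024 int32 zeros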
(f"SmoothQuantConfig(weight_bits={self.weight_bits}, " + f"quant_type={self.quant_type})") + + def get_name(self) -> str: + return "smoothquant" + + def get_supported_act_dtypes(self) -> List[torch.dtype]: + return [torch.half, torch.float] + + def get_min_capability(self) -> int: + return 70 + + @staticmethod + def get_config_filenames() -> List[str]: + """List of filenames to search for in the model directory.""" + return [ + "quant_config.json", + "quantize_config.json", + ] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "SmoothQuantConfig": + weight_bits = cls.get_from_keys(config, ["w_bit", "bits"]) + quant_type = cls.get_from_keys(config, ["quant_type", "q_type"]) + return cls(weight_bits, quant_type) + + def get_linear_method(self) -> "SmoothLinearMethod": + return SmoothLinearMethod(world_size=get_tensor_model_parallel_world_size()) + + def get_scaled_act_names(self) -> List[str]: + return [] + + +class SmoothLinearMethod(LinearMethodBase): + def __init__(self, world_size, *args, **kwargs): + super().__init__(*args, **kwargs) + self.apply_dequant_after_row = world_size > 1 + self.dtpye = None + + def create_weights( + self, + input_size_per_partition: int, + output_size_per_partition: int, + input_size: int, + output_size: int, + params_dtype: torch.dtype, + ) -> Dict[str, Any]: + weight = Parameter(torch.empty(output_size_per_partition, + input_size_per_partition, + dtype=torch.int8), + requires_grad=False) + set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + self.dtpye = params_dtype + return {"weight": weight} + + def apply_weights( + self, + weights: Dict[str, torch.Tensor], + x: torch.Tensor, + bias: Optional[torch.Tensor], + scale: Optional[torch.Tensor] = None, + dequant_scale: float = 1.0, + is_row: bool = False, + ) -> torch.Tensor: + x_shape = x.shape + x = x.view(-1, x_shape[-1]) + weight = weights["weight"] + y = torch.empty((x.shape[0], weight.shape[0]),dtype=torch.int32,device=x.device) + ops.linear_a8_w8_o32_(x, weight, y) + y = y.view(*x_shape[:-1], -1) + if is_row and self.apply_dequant_after_row: + # when tp > 1, duquant first(To improve accuracy?) + out = torch.empty_like(y, dtype=self.dtpye) + ops.dequant(out, y, scale, dequant_scale) + y = out + return y diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py new file mode 100644 index 0000000..9244e88 --- /dev/null +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -0,0 +1,129 @@ +from typing import Any, Dict, List, Optional + +import torch +from torch.nn.parameter import Parameter + +from vllm._C import ops +from vllm.model_executor.layers.linear import (LinearMethodBase, + set_weight_attrs) +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.utils import is_hip + + +class SqueezeLLMConfig(QuantizationConfig): + """Config class for SqueezeLLM. 
+ + Reference: https://arxiv.org/pdf/2306.07629 + """ + + def __init__( + self, + weight_bits: int, + ) -> None: + self.weight_bits = weight_bits + + if self.weight_bits != 4: + raise ValueError( + "Currently, only 4-bit weight quantization is supported for " + f"SqueezeLLM, but got {self.weight_bits} bits.") + + self.pack_factor = 32 // self.weight_bits + + def __repr__(self) -> str: + return f"SqueezeLLMConfig(weight_bits={self.weight_bits})" + + def get_name(self) -> str: + return "squeezellm" + + def get_supported_act_dtypes(self) -> List[torch.dtype]: + return [torch.half] + + def get_min_capability(self) -> int: + return 70 + + @staticmethod + def get_config_filenames() -> List[str]: + return ["quant_config.json"] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "SqueezeLLMConfig": + weight_bits = cls.get_from_keys(config, ["wbits"]) + return cls(weight_bits) + + def get_linear_method(self) -> "SqueezeLLMLinearMethod": + return SqueezeLLMLinearMethod(self) + + def get_scaled_act_names(self) -> List[str]: + return [] + + +class SqueezeLLMLinearMethod(LinearMethodBase): + """Linear method for SqueezeLLM. + + Args: + quant_config: The SqueezeLLM quantization config. + """ + + def __init__(self, quant_config: SqueezeLLMConfig): + self.quant_config = quant_config + + def create_weights(self, input_size_per_partition: int, + output_size_per_partition: int, input_size: int, + output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + if input_size_per_partition % self.quant_config.pack_factor != 0: + raise ValueError( + "The input size is not aligned with the quantized " + "weight shape. This can be caused by too large " + "tensor parallel size.") + qweight = Parameter( + torch.empty( + input_size_per_partition // self.quant_config.pack_factor, + output_size_per_partition, + dtype=torch.int32, + ), + requires_grad=False, + ) + set_weight_attrs( + qweight, { + "input_dim": 0, + "output_dim": 1, + "packed_dim": 0, + "pack_factor": self.quant_config.pack_factor, + }) + lookup_table = Parameter( + torch.empty( + output_size, + self.quant_config.weight_bits**2, + dtype=params_dtype, + ), + requires_grad=False, + ) + set_weight_attrs(lookup_table, { + "output_dim": 0, + }) + return { + "qweight": qweight, + "lookup_table": lookup_table, + } + + def apply_weights(self, + weights: Dict[str, Any], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + qweight = weights["qweight"] + lookup_table = weights["lookup_table"] + out_shape = x.shape[:-1] + (qweight.shape[-1], ) + reshaped_x = x.reshape(-1, x.shape[-1]) + if is_hip(): + out_f = torch.zeros(out_shape, dtype=torch.float) + ops.squeezellm_gemm(reshaped_x, qweight, out_f, lookup_table) + out = out_f.to(dtype=torch.float16) + else: + # NOTE: The output tensor should be zero-initialized. 
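+            # (The kernel accumulates partial sums directly into `out`, so
+            # starting from anything other than zeros would corrupt the
+            # result.)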
+            out = torch.zeros(out_shape, dtype=torch.float16)
+            ops.squeezellm_gemm(reshaped_x, qweight, out, lookup_table)
+
+        if bias is not None:
+            out = out + bias
+        return out.reshape(out_shape)
diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py
new file mode 100644
index 0000000..3e1cfc7
--- /dev/null
+++ b/vllm/model_executor/layers/rejection_sampler.py
@@ -0,0 +1,392 @@
+from typing import Tuple, Optional
+from functools import cached_property
+
+import torch
+import torch.nn as nn
+import torch.jit
+
+
+class RejectionSampler(nn.Module):
+    """Apply modified rejection sampling as described in "Accelerating Large
+    Language Model Decoding with Speculative Sampling"
+    https://arxiv.org/pdf/2302.01318.pdf.
+    """
+
+    def __init__(self, strict_mode: bool = False):
+        """Create a rejection sampler.
+
+        Args:
+            strict_mode: Whether or not to perform shape/device/dtype checks
+                during sampling. This catches correctness issues but adds
+                nontrivial latency.
+        """
+        super().__init__()
+        self.probs_dtype = torch.float32
+        self.token_id_dtype = torch.int64
+        self._strict_mode = strict_mode
+
+        # NOTE: A "bonus token" is accepted iff all proposal tokens are
+        # accepted. There is always only one possible bonus token. We store
+        # this value in a variable for readability.
+        self._num_bonus_tokens = 1
+
+        self.num_accepted_tokens: Optional[torch.Tensor] = None
+        self.num_emitted_tokens: Optional[torch.Tensor] = None
+        self.num_draft_tokens: int = 0
+
+    def init_gpu_tensors(self, rank: int) -> None:
+        assert self.num_accepted_tokens is None
+        device = f"cuda:{rank}"
+        self.num_accepted_tokens = torch.tensor(0,
+                                                dtype=torch.long,
+                                                device=device)
+        self.num_emitted_tokens = torch.tensor(0,
+                                               dtype=torch.long,
+                                               device=device)
+
+    def forward(
+        self,
+        target_probs: torch.Tensor,
+        bonus_token_ids: torch.Tensor,
+        draft_probs: torch.Tensor,
+        draft_token_ids: torch.Tensor,
+    ) -> torch.Tensor:
+        """Sample token ids using rejection sampling. This accepts or rejects
+        tokens proposed by the draft model using the probability of each token
+        according to the draft and target models.
+
+        In the worst case where all draft tokens are rejected, it is
+        guaranteed one correct token will be emitted.
+
+        In the case where all draft tokens are accepted, a bonus token will be
+        accepted as it's cheap to have the target model score this speculative
+        sequence.
+
+        Args:
+            target_probs: The probability distribution over token ids given
+                context according to the target model.
+                shape = [batch_size, num_speculative_tokens, vocab_size]
+
+            bonus_token_ids: The "bonus" token ids that are accepted iff all
+                speculative tokens in a sequence are accepted.
+                shape = [batch_size, num_bonus_tokens]
+
+            draft_probs: The probability distribution over token ids given
+                context according to the draft model.
+                shape = [batch_size, num_speculative_tokens, vocab_size]
+
+            draft_token_ids: The token ids that were sampled from the draft
+                probabilities.
+                shape = [batch_size, num_speculative_tokens]
+
+        Returns:
+            output_token_ids: The token ids sampled via rejection sampling,
+                or -1 if unable to sample a token because the previous token
+                was rejected.
+                shape = [batch_size, num_speculative_tokens + num_bonus_tokens]
+        """
+        # Only perform shape/dtype/device checking in strict mode, as it adds
+        # overhead.
+        if self._strict_mode:
+            self._raise_if_incorrect_shape(target_probs, bonus_token_ids,
+                                           draft_probs, draft_token_ids)
+            self._raise_if_incorrect_dtype(target_probs, bonus_token_ids,
+                                           draft_probs, draft_token_ids)
+            self._raise_if_inconsistent_device(target_probs, bonus_token_ids,
+                                               draft_probs, draft_token_ids)
+            self._raise_if_out_of_bounds_vocab(target_probs.shape[-1],
+                                               bonus_token_ids,
+                                               draft_token_ids)
+
+        accepted, recovered_token_ids = self._batch_modified_rejection_sampling(
+            target_probs,
+            draft_probs,
+            draft_token_ids,
+        )
+
+        output_token_ids = self._create_output(
+            accepted,
+            recovered_token_ids,
+            draft_token_ids,
+            bonus_token_ids,
+        )
+        return output_token_ids
+
+    def _batch_modified_rejection_sampling(
+        self,
+        target_probs: torch.Tensor,  # [batch_size, k, vocab_size]
+        draft_probs: torch.Tensor,  # [batch_size, k, vocab_size]
+        draft_token_ids: torch.Tensor,  # [batch_size, k]
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Perform modified rejection sampling on each sequence.
+
+        Returns:
+            A tuple of two tensors:
+            0: A bool tensor of which tokens in each sequence are accepted.
+                shape = [batch_size, k]
+            1: Token ids sampled from a recovered distribution, to be used
+                when a token is rejected.
+                shape = [batch_size, k]
+        """
+
+        batch_size, k, vocab_size = draft_probs.shape
+
+        # shape [batch_size, k]
+        accepted = self._get_accepted(target_probs, draft_probs,
+                                      draft_token_ids)
+
+        recovered_probs = self._get_recovered_probs(
+            target_probs, draft_probs).reshape(batch_size * k, vocab_size)
+
+        recovered_token_ids = _multinomial(recovered_probs,
+                                           num_samples=1).reshape(
+                                               batch_size, k)
+        return accepted, recovered_token_ids
+
+    def _get_accepted(
+        self,
+        target_probs: torch.Tensor,  # [batch_size, k, vocab_size]
+        draft_probs: torch.Tensor,  # [batch_size, k, vocab_size]
+        draft_token_ids: torch.Tensor,  # [batch_size, k]
+    ) -> torch.Tensor:
+        r"""Create bool matrix over the proposed draft tokens. If
+        True, then a token can be accepted, else it should be
+        rejected.
+
+        Given :math:`q(\hat{x}_{n+1}|x_1, \dots, x_n)`, the probability of
+        :math:`\hat{x}_{n+1}` given context :math:`x_1, \dots, x_n` according
+        to the target model, and :math:`p(\hat{x}_{n+1}|x_1, \dots, x_n)`, the
+        same conditional probability according to the draft model, the token
+        is accepted with probability:
+
+        .. math::
+            \min\left(1, \frac{q(\hat{x}_{n+1}|x_1, \dots, x_n)}
+                           {p(\hat{x}_{n+1}|x_1, \dots, x_n)}\right)
+
+        This implementation does not apply causality. When using the output,
+        if a token is rejected, subsequent tokens should not be used.
+
+        Returns a bool tensor of shape [batch_size, k] specifying which tokens
+        are accepted.
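+
+        Example (illustrative numbers): if the target model assigns the
+        sampled draft token probability q = 0.2 while the draft model
+        proposed it with probability p = 0.8, the token survives the
+        uniform test below with probability min(1, 0.2 / 0.8) = 0.25.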
+ """ + batch_size, k, _ = draft_probs.shape + batch_indices = torch.arange(batch_size, + device=target_probs.device)[:, None] + probs_indicies = torch.arange(k, device=target_probs.device) + + # shape [batch_size, k] + selected_draft_probs = draft_probs[batch_indices, probs_indicies, + draft_token_ids] + + # shape [batch_size, k] + selected_target_probs = target_probs[batch_indices, probs_indicies, + draft_token_ids] + + uniform_rand = torch.rand(batch_size, + k, + dtype=self.probs_dtype, + device=target_probs.device) + capped_ratio = torch.minimum( + selected_target_probs / selected_draft_probs, + torch.full((1, ), 1, device=target_probs.device)) + accepted = uniform_rand < capped_ratio + + return accepted + + def _get_recovered_probs( + self, + target_probs: torch.Tensor, # [k, vocab_size] + draft_probs: torch.Tensor, # [k, vocab_size] + ) -> torch.Tensor: + r"""Create a probability distribution for each proposed token which can + be sampled if the proposed token is rejected. + + When this routine is applied sequentially, the true distribution of the + target model is recovered (within hardware numerics). + + The probability distribution used in this rejection case is constructed + as follows. Given :math:`q(x|x_1, \dots, x_n)`, the probability of + :math:`x` given context :math:`x_1, \dots, x_n` according to the target + model and :math:`p(x|x_1, \dots, x_n)`, the same conditional probability + according to the draft model: + + .. math:: + x_{n+1} \sim (q(x|x_1, \dots, x_n) - p(x|x_1, \dots, x_n))_+ + + where :math:`(f(x))_+` is defined as: + + .. math:: + (f(x))_+ = \frac{\max(0, f(x))}{\sum_x \max(0, f(x))} + + See https://github.com/vllm-project/vllm/pull/2336 for a visualization + of the draft, target, and recovered probability distributions. + + Returns a tensor of shape [batch_size, k, vocab_size]. + + Note: This batches operations on GPU and thus constructs the recovered + distribution for all tokens, even if they are accepted. This causes + division-by-zero errors, so we use self._smallest_positive_value to + avoid that. This introduces some drift to the distribution. + """ + _, k, _ = draft_probs.shape + + # shape [batch_size, k, vocab_size] + difference = target_probs - draft_probs + + # TODO(cade): Can we use logprobs instead of probs, and avoid the + # division-by-zero errors without introducing distribution drift? + + # shape [batch_size, k, vocab_size] + f = torch.clamp(difference, min=self._smallest_positive_value) + + # shape [batch_size, k, vocab_size] + recovered_probs = f / torch.sum(f, dim=-1).reshape(-1, k, 1) + + return recovered_probs + + @cached_property + def _smallest_positive_value(self) -> float: + """Return the smallest positive value representable by the probs dtype. + This value is used when constructing a distribution from which to sample + recovered tokens in the first rejection case. + + See _get_recovered_probs for more details + + Note that this isn't actually the smallest positive value representable + by float32, but the smallest positive normal value. + See https://en.wikipedia.org/wiki/Subnormal_number for more information. + """ + return torch.finfo(self.probs_dtype).tiny + + def _create_output( + self, + accepted: torch.Tensor, # [batch_size, k] + recovered_token_ids: torch.Tensor, # [batch_size, k] + draft_token_ids: torch.Tensor, # [batch_size, k] + bonus_token_ids: torch.Tensor, # [batch_size] + ) -> torch.Tensor: + """Format output. Returns a matrix of token ids. 
When + a token is rejected via rejection sampling, all subsequent + token ids are set to -1 for the sequence. + + shape = [batch_size, k + num_bonus_tokens] + """ + bonus_token_ids = bonus_token_ids.squeeze() + batch_size, k = recovered_token_ids.shape + + # Determine the index of the first False value for each row. + limits = (accepted == 0).max(1).indices + limits[~(accepted == 0).any(1)] = k + + # Create masks using the indices. + indices = torch.arange(k, device=accepted.device).unsqueeze(0) + accepted_mask = indices < limits.unsqueeze(1) + after_false_mask = indices == limits.unsqueeze(1) + + # Create an extended output tensor + output_with_bonus_tokens = -torch.ones( + (batch_size, k + self._num_bonus_tokens), + dtype=self.token_id_dtype, + device=accepted.device) + output = output_with_bonus_tokens[:, :k] + + # Fill in the first k columns of the output tensor using masks and data + # tensors. + output[:, :k] = torch.where(accepted_mask, draft_token_ids, + -torch.ones_like(draft_token_ids)) + + # Fill the last column. + # We check output directly as accepted may have True values inconsistent + # with causal acceptance. + output_with_bonus_tokens[:, -1] = torch.where(output[:, -1] != -1, + bonus_token_ids, -1) + + # Fill the recovered token ids. + output.mul_(~after_false_mask).add_( + recovered_token_ids.mul(after_false_mask)) + + self.num_accepted_tokens += accepted.sum() + self.num_emitted_tokens += (output_with_bonus_tokens != -1).sum() + self.num_draft_tokens += batch_size * k + + return output_with_bonus_tokens + + def _raise_if_incorrect_shape( + self, + target_probs: torch.Tensor, + bonus_token_ids: torch.Tensor, + draft_probs: torch.Tensor, + draft_token_ids: torch.Tensor, + ) -> None: + (target_batch_size, num_target_probs, + target_vocab_size) = target_probs.shape + bonus_batch_size, num_bonus_tokens = bonus_token_ids.shape + draft_batch_size, num_draft_probs, draft_vocab_size = draft_probs.shape + draft_token_ids_batch_size, num_draft_token_ids = draft_token_ids.shape + + assert draft_batch_size == target_batch_size + assert num_draft_probs == num_target_probs + assert (draft_vocab_size == target_vocab_size + ), f"{draft_vocab_size=} {target_vocab_size=}" + + assert draft_token_ids_batch_size == draft_batch_size + assert num_draft_token_ids == num_draft_probs + + assert bonus_batch_size == target_batch_size + assert num_bonus_tokens == self._num_bonus_tokens + + def _raise_if_incorrect_dtype( + self, + target_probs: torch.Tensor, + bonus_token_ids: torch.Tensor, + draft_probs: torch.Tensor, + draft_token_ids: torch.Tensor, + ) -> None: + assert all(probs.dtype == self.probs_dtype + for probs in [target_probs, draft_probs]) + assert all(token_ids.dtype == self.token_id_dtype + for token_ids in [bonus_token_ids, draft_token_ids]) + + def _raise_if_inconsistent_device( + self, + target_probs: torch.Tensor, + bonus_token_ids: torch.Tensor, + draft_probs: torch.Tensor, + draft_token_ids: torch.Tensor, + ) -> None: + devices = [ + t.device for t in + [target_probs, bonus_token_ids, draft_probs, draft_token_ids] + ] + assert all([devices[0] == device for device in devices]) + + def _raise_if_out_of_bounds_vocab( + self, + vocab_size: int, + bonus_token_ids: torch.Tensor, + draft_token_ids: torch.Tensor, + ) -> None: + assert torch.all(bonus_token_ids < vocab_size) + assert torch.all(bonus_token_ids >= 0) + assert torch.all(draft_token_ids < vocab_size) + assert torch.all(draft_token_ids >= 0) + + +# torch.multinomial forces a GPU<->CPU sync. 
+# Therefore, we use an optimized implementation instead that skips the sync. +# Note that we always sample with replacement. +# probs will be modified in place, but this is fine, as we pass +# in a copy already. +@torch.jit.script +def _multinomial( + probs: torch.Tensor, + num_samples: int, +) -> torch.Tensor: + if num_samples > 1: + # This is equivalent to torch.repeat_interleaved (which also + # forces a GPU<->CPU sync). + probs = probs[:, None, :].expand(probs.shape[0], num_samples, + probs.shape[1]).contiguous().view( + -1, probs.shape[1]) + q = torch.empty_like(probs).exponential_(1.0) + return probs.div_(q).argmax(dim=1).view(-1, num_samples) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py new file mode 100644 index 0000000..933b24e --- /dev/null +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -0,0 +1,562 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Rotary Positional Embeddings.""" +import math +from typing import Any, Dict, Optional, Tuple, Union + +import torch +import torch.nn as nn + +from vllm._C import ops + + +def _rotate_neox(x: torch.Tensor) -> torch.Tensor: + x1 = x[..., :x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + +def _rotate_gptj(x: torch.Tensor) -> torch.Tensor: + x1 = x[..., ::2] + x2 = x[..., 1::2] + x = torch.stack((-x2, x1), dim=-1) + return x.flatten(-2) + + +class RotaryEmbedding(nn.Module): + """Original rotary positional embedding.""" + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: int, + is_neox_style: bool, + ) -> None: + super().__init__() + self.head_size = head_size + self.rotary_dim = rotary_dim + self.max_position_embeddings = max_position_embeddings + self.base = base + self.is_neox_style = is_neox_style + + cache = self._compute_cos_sin_cache() + cache = cache.to(torch.get_default_dtype()) + self.register_buffer("cos_sin_cache", cache, persistent=False) + + def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor: + """Compute the inverse frequency.""" + # NOTE(woosuk): The HF implementation uses `torch.arange(...).float()`. + # However, we use `torch.arange(..., dtype=torch.float)` instead to + # avoid numerical issues with large base values (e.g., 10000000). + # This may cause a slight numerical difference between the HF + # implementation and ours. 
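+        # The schedule below is the standard RoPE inverse-frequency formula:
+        # inv_freq[j] = base ** (-2j / rotary_dim) for
+        # j = 0, 1, ..., rotary_dim // 2 - 1.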
+ # NOTE(woosuk): To exactly match the HF implementation, we need to + # use CPU to compute the cache and then move it to GPU. However, we + # create the cache on GPU for faster initialization. This may cause + # a slight numerical difference between the HF implementation and ours. + inv_freq = 1.0 / (base**(torch.arange( + 0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim)) + return inv_freq + + def _compute_cos_sin_cache(self) -> torch.Tensor: + """Compute the cos and sin cache.""" + inv_freq = self._compute_inv_freq(self.base) + t = torch.arange(self.max_position_embeddings, dtype=torch.float) + + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = freqs.cos() + sin = freqs.sin() + cache = torch.cat((cos, sin), dim=-1) + return cache + + def _forward( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """PyTorch-native implementation equivalent to forward().""" + query = query.view(*query.shape[:-1], -1, self.head_size) + key = key.view(*key.shape[:-1], -1, self.head_size) + + query_rot = query[..., :self.rotary_dim] + key_rot = key[..., :self.rotary_dim] + if self.rotary_dim < self.head_size: + query_pass = query[..., self.rotary_dim:] + key_pass = key[..., self.rotary_dim:] + + cos_sin = self.cos_sin_cache[positions] + cos, sin = cos_sin.chunk(2, dim=-1) + if self.is_neox_style: + # NOTE(woosuk): Here we assume that the positions tensor has the + # shape [batch_size, seq_len]. + cos = cos.repeat(1, 1, 2).unsqueeze(-2) + sin = sin.repeat(1, 1, 2).unsqueeze(-2) + else: + cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2) + sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2) + + rotate_fn = _rotate_neox if self.is_neox_style else _rotate_gptj + query_rot = query_rot * cos + rotate_fn(query_rot) * sin + key_rot = key_rot * cos + rotate_fn(key_rot) * sin + + if self.rotary_dim < self.head_size: + query = torch.cat((query_rot, query_pass), dim=-1) + key = torch.cat((key_rot, key_pass), dim=-1) + else: + query = query_rot + key = key_rot + query = query.flatten(-2) + key = key.flatten(-2) + return query, key + + def forward( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + # ops.rotary_embedding() is an in-place operation that + # updates the query and key tensors. + ops.rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, self.is_neox_style) + return query, key + + +class LinearScalingRotaryEmbedding(RotaryEmbedding): + """RotaryEmbedding extended with linear scaling. + + Credits to the Reddit user /u/kaiokendev + """ + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: int, + is_neox_style: bool, + scaling_factor: float, + ) -> None: + self.scaling_factor = scaling_factor + super().__init__(head_size, rotary_dim, max_position_embeddings, base, + is_neox_style) + + def _compute_cos_sin_cache(self) -> torch.Tensor: + inv_freq = self._compute_inv_freq(self.base) + # NOTE(woosuk): self.max_position_embeddings is the original + # maximum length before applying the rope scaling. + # Thus, the maximum length after applying the rope scaling is + # self.max_position_embeddings * self.scaling_factor. 
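+        # For example, with max_position_embeddings=2048 and
+        # scaling_factor=4.0, positions 0..8191 are mapped to fractional
+        # positions t / 4.0 inside the original [0, 2048) range.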
+ max_len = self.max_position_embeddings * self.scaling_factor + t = torch.arange(max_len, dtype=torch.float) + t = t / self.scaling_factor + + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = freqs.cos() + sin = freqs.sin() + cache = torch.cat((cos, sin), dim=-1) + return cache + + +class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding): + """RotaryEmbedding extended with Dynamic NTK scaling. + + Credits to the Reddit users /u/bloc97 and /u/emozilla + """ + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: int, + is_neox_style: bool, + scaling_factor: float, + ) -> None: + self.scaling_factor = scaling_factor + super().__init__(head_size, rotary_dim, max_position_embeddings, base, + is_neox_style) + + def _compute_cos_sin_cache(self) -> torch.Tensor: + # NOTE(woosuk): self.max_position_embeddings is the original + # maximum length before applying the rope scaling. + # Thus, the maximum length after applying the rope scaling is + # self.max_position_embeddings * self.scaling_factor. + max_len = self.max_position_embeddings * self.scaling_factor + base = self.base * ( + (self.scaling_factor * max_len / self.max_position_embeddings) - + (self.scaling_factor - 1))**(self.rotary_dim / + (self.rotary_dim - 2)) + inv_freq = self._compute_inv_freq(base) + t = torch.arange(max_len, dtype=torch.float) + + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = freqs.cos() + sin = freqs.sin() + cache = torch.cat((cos, sin), dim=-1) + return cache + + +# Inverse dim formula to find dim based on number of rotations +def _yarn_find_correction_dim(num_rotations: int, + dim: int, + base: float = 10000, + max_position_embeddings: int = 2048) -> float: + return (dim * math.log(max_position_embeddings / + (num_rotations * 2 * math.pi))) / (2 * + math.log(base)) + + +# Find dim range bounds based on rotations +def _yarn_find_correction_range(low_rot: int, + high_rot: int, + dim: int, + base: float = 10000, + max_position_embeddings: int = 2048) -> int: + low = math.floor( + _yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)) + high = math.ceil( + _yarn_find_correction_dim(high_rot, dim, base, + max_position_embeddings)) + return max(low, 0), min(high, dim - 1) # Clamp values just in case + + +def _yarn_linear_ramp_mask(low: float, high: float, dim: int, + dtype: torch.dtype) -> torch.Tensor: + if low == high: + high += 0.001 # Prevent singularity + + linear_func = (torch.arange(dim, dtype=dtype) - low) / (high - low) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + +def _yarn_get_mscale(scale: float = 1) -> float: + if scale <= 1: + return 1.0 + return 0.1 * math.log(scale) + 1.0 + + +class YaRNScalingRotaryEmbedding(RotaryEmbedding): + """RotaryEmbedding extended with YaRN method. + + Credits to Peng et al. 
github.com/jquesnelle/yarn + """ + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: int, + is_neox_style: bool, + scaling_factor: float, + *, + extrapolation_factor: float = 1, + attn_factor: float = 1, + beta_fast: float = 32, + beta_slow: float = 1, + ) -> None: + self.scaling_factor = scaling_factor + self.extrapolation_factor = extrapolation_factor + self.attn_factor = attn_factor + self.beta_fast = beta_fast + self.beta_slow = beta_slow + # Get n-d magnitude scaling corrected for interpolation + self.mscale = float( + _yarn_get_mscale(self.scaling_factor) * attn_factor) + super().__init__(head_size, rotary_dim, max_position_embeddings, base, + is_neox_style) + + def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor: + pos_freqs = self.base**( + torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / + self.rotary_dim) + inv_freq_extrapolation = 1.0 / pos_freqs + inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs) + + low, high = _yarn_find_correction_range(self.beta_fast, self.beta_slow, + self.rotary_dim, self.base, + self.max_position_embeddings) + # Get n-d rotational scaling corrected for extrapolation + inv_freq_mask = (1 - _yarn_linear_ramp_mask( + low, high, self.rotary_dim // 2, + dtype=torch.float)) * self.extrapolation_factor + inv_freq = inv_freq_interpolation * ( + 1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask + return inv_freq + + def _compute_cos_sin_cache(self) -> torch.Tensor: + inv_freq = self._compute_inv_freq(self.scaling_factor) + t = torch.arange(self.max_position_embeddings * self.scaling_factor, + dtype=torch.float32) + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = (freqs.cos() * self.mscale) + sin = (freqs.sin() * self.mscale) + cache = torch.cat((cos, sin), dim=-1) + return cache + + +_ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {} + + +def get_rope( + head_size: int, + rotary_dim: int, + max_position: int, + base: int, + is_neox_style: bool = True, + rope_scaling: Optional[Dict[str, Any]] = None, +) -> RotaryEmbedding: + key = (head_size, rotary_dim, max_position, base, is_neox_style, + tuple(rope_scaling.items()) if rope_scaling is not None else None) + if key in _ROPE_DICT: + return _ROPE_DICT[key] + + if rope_scaling is None: + rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base, + is_neox_style) + else: + scaling_type = rope_scaling[ + "type"] if "type" in rope_scaling else rope_scaling["rope_type"] + scaling_factor = rope_scaling["factor"] + if scaling_type == "llama3": + dtype = torch.get_default_dtype() + low_freq_factor = rope_scaling["low_freq_factor"] + high_freq_factor = rope_scaling["high_freq_factor"] + original_max_position = rope_scaling[ + "original_max_position_embeddings"] + rotary_emb = Llama3RotaryEmbedding(head_size, rotary_dim, + max_position, base, + is_neox_style, dtype, + scaling_factor, low_freq_factor, + high_freq_factor, + original_max_position) + elif scaling_type == "linear": + rotary_emb = LinearScalingRotaryEmbedding(head_size, rotary_dim, + max_position, base, + is_neox_style, + scaling_factor) + elif scaling_type == "dynamic": + rotary_emb = DynamicNTKScalingRotaryEmbedding( + head_size, rotary_dim, max_position, base, is_neox_style, + scaling_factor) + elif scaling_type == "yarn": + original_max_position = rope_scaling[ + "original_max_position_embeddings"] + extra_kwargs = { + k: v + for k, v in rope_scaling.items() + if k in ("extrapolation_factor", "attn_factor", "beta_fast", + "beta_slow") + } + rotary_emb = 
YaRNScalingRotaryEmbedding(head_size, rotary_dim,
+                                                    original_max_position,
+                                                    base, is_neox_style,
+                                                    scaling_factor,
+                                                    **extra_kwargs)
+        else:
+            raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+    _ROPE_DICT[key] = rotary_emb
+    return rotary_emb
+
+
+# ↓ add for smoothquant
+class DequantRotaryEmbedding(RotaryEmbedding):
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        q_dequant_scale: float,
+        k_dequant_scale: float,
+        v_dequant_scale: float,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        # ops.dequant_rotary_embedding() is an in-place operation that
+        # updates the query and key tensors.
+        query_dequant = torch.empty_like(query,
+                                         dtype=self.cos_sin_cache.dtype)
+        key_dequant = torch.empty_like(key, dtype=self.cos_sin_cache.dtype)
+        value_dequant = torch.empty_like(value,
+                                         dtype=self.cos_sin_cache.dtype)
+
+        ops.dequant(value_dequant, value, None, v_dequant_scale)
+        ops.dequant_rotary_embedding(
+            positions,
+            query,
+            key,
+            self.head_size,
+            self.cos_sin_cache,
+            query_dequant,
+            key_dequant,
+            q_dequant_scale,
+            k_dequant_scale,
+            self.is_neox_style,
+        )
+        return query_dequant, key_dequant, value_dequant
+
+
+class Llama3RotaryEmbedding(RotaryEmbedding):
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+        scaling_factor: float,
+        low_freq_factor: float,
+        high_freq_factor: float,
+        orig_max_position: int,
+    ) -> None:
+        self.scaling_factor = scaling_factor
+        self.low_freq_factor = low_freq_factor
+        self.high_freq_factor = high_freq_factor
+        self.orig_max_position = orig_max_position
+        super().__init__(head_size, rotary_dim, max_position_embeddings,
+                         base, is_neox_style)
+
+    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+        inv_freqs = super()._compute_inv_freq(base)
+        low_freq_wavelen = self.orig_max_position / self.low_freq_factor
+        high_freq_wavelen = self.orig_max_position / self.high_freq_factor
+
+        wave_len = 2 * math.pi / inv_freqs
+        if self.low_freq_factor != self.high_freq_factor:
+            smooth = (self.orig_max_position / wave_len -
+                      self.low_freq_factor) / (self.high_freq_factor -
+                                               self.low_freq_factor)
+        else:
+            smooth = 0
+        new_freqs = torch.where(
+            wave_len < high_freq_wavelen,
+            inv_freqs,
+            torch.where(
+                wave_len > low_freq_wavelen,
+                inv_freqs / self.scaling_factor,
+                (1 - smooth) * inv_freqs / self.scaling_factor +
+                smooth * inv_freqs,
+            ),
+        )
+        return new_freqs
+
+
+class DequantLinearScalingRotaryEmbedding(LinearScalingRotaryEmbedding,
+                                          DequantRotaryEmbedding):
+
+    def __init__(self, *args, **kwargs):
+        LinearScalingRotaryEmbedding.__init__(self, *args, **kwargs)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        q_dequant_scale: float,
+        k_dequant_scale: float,
+        v_dequant_scale: float,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        return DequantRotaryEmbedding.forward(self, positions, query, key,
+                                              value, q_dequant_scale,
+                                              k_dequant_scale,
+                                              v_dequant_scale)
+
+
+class DequantDynamicNTKScalingRotaryEmbedding(DynamicNTKScalingRotaryEmbedding,
+                                              DequantRotaryEmbedding):
+
+    def __init__(self, *args, **kwargs):
+        DynamicNTKScalingRotaryEmbedding.__init__(self, *args, **kwargs)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        q_dequant_scale: float,
+        k_dequant_scale: float,
+        v_dequant_scale: float,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        return DequantRotaryEmbedding.forward(self, positions, query, key,
+                                              value, q_dequant_scale,
+                                              k_dequant_scale,
+                                              v_dequant_scale)
+
+
+class DequantYaRNScalingRotaryEmbedding(YaRNScalingRotaryEmbedding,
+                                        DequantRotaryEmbedding):
+
+    def __init__(self, *args, **kwargs):
+        YaRNScalingRotaryEmbedding.__init__(self, *args, **kwargs)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        q_dequant_scale: float,
+        k_dequant_scale: float,
+        v_dequant_scale: float,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        return DequantRotaryEmbedding.forward(self, positions, query, key,
+                                              value, q_dequant_scale,
+                                              k_dequant_scale,
+                                              v_dequant_scale)
+
+
+_DEQUANT_ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {}
+
+
+def get_dequant_rope(
+    head_size: int,
+    rotary_dim: int,
+    max_position: int,
+    base: int,
+    is_neox_style: bool = True,
+    rope_scaling: Optional[Dict[str, Any]] = None,
+) -> RotaryEmbedding:
+    key = (head_size, rotary_dim, max_position, base, is_neox_style,
+           tuple(rope_scaling.items()) if rope_scaling is not None else None)
+    if key in _DEQUANT_ROPE_DICT:
+        return _DEQUANT_ROPE_DICT[key]
+
+    if rope_scaling is None:
+        rotary_emb = DequantRotaryEmbedding(head_size, rotary_dim,
+                                            max_position, base,
+                                            is_neox_style)
+    else:
+        scaling_type = rope_scaling["type"]
+        scaling_factor = rope_scaling["factor"]
+        if scaling_type == "linear":
+            rotary_emb = DequantLinearScalingRotaryEmbedding(
+                head_size, rotary_dim, max_position, base, is_neox_style,
+                scaling_factor)
+        elif scaling_type == "dynamic":
+            rotary_emb = DequantDynamicNTKScalingRotaryEmbedding(
+                head_size, rotary_dim, max_position, base, is_neox_style,
+                scaling_factor)
+        elif scaling_type == "yarn":
+            original_max_position = rope_scaling[
+                "original_max_position_embeddings"]
+            extra_kwargs = {
+                k: v
+                for k, v in rope_scaling.items()
+                if k in ("extrapolation_factor", "attn_factor", "beta_fast",
+                         "beta_slow")
+            }
+            rotary_emb = DequantYaRNScalingRotaryEmbedding(
+                head_size, rotary_dim, original_max_position, base,
+                is_neox_style, scaling_factor, **extra_kwargs)
+        else:
+            raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+    _DEQUANT_ROPE_DICT[key] = rotary_emb
+    return rotary_emb
\ No newline at end of file
diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py
new file mode 100644
index 0000000..48db579
--- /dev/null
+++ b/vllm/model_executor/layers/sampler.py
@@ -0,0 +1,598 @@
+"""A layer that samples the next tokens from the model's outputs."""
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+
+from vllm.model_executor.parallel_utils.communication_op import (
+    tensor_model_parallel_gather, tensor_model_parallel_all_gather)
+from vllm.model_executor.sampling_metadata import SamplingMetadata, SamplingTensors
+from vllm.sampling_params import SamplingParams, SamplingType
+from vllm.sequence import (PromptLogprobs, SampleLogprobs, SamplerOutput,
+                           SequenceData, SequenceGroupOutput, SequenceOutput)
+from vllm.utils import is_neuron
+import ixformer.functions as ixf_F
+
+
+class Sampler(nn.Module):
+    """Samples the next tokens from the model's outputs.
+
+    This layer does the following:
+    1. Discard the hidden states that are not used for sampling (i.e., all
+        tokens except the final one in each prompt).
+    2. Compute the logits for the next tokens.
+    3. Apply presence, frequency and repetition penalties.
+    4. Apply temperature scaling.
+    5. Apply top-p and top-k truncation.
+    6. Sample the next tokens.
+    Here, each sequence group within the batch can have different sampling
+    parameters (e.g., sampling method, temperature, top-p, top-k, etc.).
+ """ + + def __init__(self, + vocab_size: int, + org_vocab_size: Optional[int] = None) -> None: + super().__init__() + self.vocab_size = vocab_size + # Transformers-neuronx generate outputs as logits directly. + self.logits_as_hidden_states = is_neuron() + # original vocabulary size (without LoRA). + self.org_vocab_size = org_vocab_size or vocab_size + + def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor, + embedding_bias: Optional[torch.Tensor], + logits_scale = None) -> torch.Tensor: + # Get the logits for the next tokens. + if logits_scale is None: + logits = ixf_F.linear(hidden_states, embedding) + else: + logits = ixf_F.linear(hidden_states / logits_scale, embedding) + # TODO align + """ + logits = torch.matmul(hidden_states, embedding.t()) + """ + if embedding_bias is not None: + logits += embedding_bias + logits = tensor_model_parallel_all_gather(logits) + # TODO align + """ + logits = tensor_model_parallel_gather(logits) + """ + # Remove paddings in vocab (if any). + if logits is not None: + logits = logits[:, :self.org_vocab_size] + return logits + + def forward( + self, + embedding: torch.Tensor, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + embedding_bias: Optional[torch.Tensor] = None, + logits_scale = None, + ) -> Optional[SamplerOutput]: + # Get the hidden states that we use for sampling. + if self.logits_as_hidden_states: + logits = hidden_states + else: + hidden_states = _prune_hidden_states(hidden_states, + sampling_metadata) + + # Get the logits for the next tokens. + logits = self._get_logits(hidden_states, embedding, embedding_bias, logits_scale) + + # Only perform sampling in the driver worker. + # Note: `_get_logits` is still distributed across TP workers because + # the `embedding` weight is distributed across TP workers. + # TODO(zhuohan): Change the get_logits part to a separate stage. + if not sampling_metadata.perform_sampling: + return None + + assert logits is not None + _, vocab_size = logits.shape + + # Apply logits processors (if any). + logits = _apply_logits_processors(logits, sampling_metadata) + + # Prepare sampling tensors with pinned memory to avoid blocking. + (sampling_tensors, do_penalties, do_top_p_top_k, + do_min_p) = SamplingTensors.from_sampling_metadata( + sampling_metadata, vocab_size, logits.device, logits.dtype) + + # Apply presence and frequency penalties. + if do_penalties: + logits = _apply_penalties(logits, sampling_tensors.prompt_tokens, + sampling_tensors.output_tokens, + sampling_tensors.presence_penalties, + sampling_tensors.frequency_penalties, + sampling_tensors.repetition_penalties) + + # Apply temperature scaling. + # Use in-place division to avoid creating a new tensor. + logits.div_(sampling_tensors.temperatures.unsqueeze_(dim=1)) + + if do_top_p_top_k: + logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, + sampling_tensors.top_ks) + + if do_min_p: + logits = _apply_min_p(logits, sampling_tensors.min_ps) + + # We use float32 for probabilities and log probabilities. + # Compute the probabilities. + probs = torch.softmax(logits, dim=-1, dtype=torch.float) + # Compute the log probabilities. + # Use log_softmax to ensure numerical stability. + logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float) + + # Sample the next tokens. + sample_results = _sample(probs, logprobs, sampling_metadata) + # Get the logprobs query results. 
+ prompt_logprobs, sample_logprobs = _get_logprobs( + logprobs, sampling_metadata, sample_results) + return _build_sampler_output(sample_results, sampling_metadata, + prompt_logprobs, sample_logprobs) + + +def _prune_hidden_states( + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, +) -> torch.Tensor: + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + return hidden_states.index_select(0, + sampling_metadata.selected_token_indices) + + +def _get_bin_counts_and_mask( + tokens: torch.Tensor, + vocab_size: int, + num_seqs: int, +) -> Tuple[torch.Tensor, torch.Tensor]: + # Compute the bin counts for the tokens. + # vocab_size + 1 for padding. + bin_counts = torch.zeros((num_seqs, vocab_size + 1), + dtype=torch.long, + device=tokens.device) + bin_counts.scatter_add_(1, tokens, torch.ones_like(tokens)) + bin_counts = bin_counts[:, :vocab_size] + mask = bin_counts > 0 + + return bin_counts, mask + + +def _apply_logits_processors( + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, +) -> torch.Tensor: + logits_row_idx = 0 + found_logits_processors = False + for seq_ids, sampling_params in sampling_metadata.seq_groups: + logits_processors = sampling_params.logits_processors + if logits_processors: + found_logits_processors = True + for seq_id in seq_ids: + logits_row = logits[logits_row_idx] + token_ids = sampling_metadata.seq_data[seq_id].output_token_ids + for logits_processor in logits_processors: + logits_row = logits_processor(token_ids, logits_row) + logits[logits_row_idx] = logits_row + logits_row_idx += 1 + else: + logits_row_idx += len(seq_ids) + if found_logits_processors: + assert logits_row_idx == logits.shape[0] + return logits + + +def _apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor, + output_tokens_tensor: torch.Tensor, + presence_penalties: torch.Tensor, + frequency_penalties: torch.Tensor, + repetition_penalties: torch.Tensor) -> torch.Tensor: + num_seqs, vocab_size = logits.shape + _, prompt_mask = _get_bin_counts_and_mask(prompt_tokens_tensor, vocab_size, + num_seqs) + output_bin_counts, output_mask = _get_bin_counts_and_mask( + output_tokens_tensor, vocab_size, num_seqs) + + repetition_penalties = repetition_penalties[:, None].repeat(1, vocab_size) + repetition_penalties[~(prompt_mask | output_mask)] = 1.0 + logits = torch.where(logits > 0, logits / repetition_penalties, + logits * repetition_penalties) + + # We follow the definition in OpenAI API. + # Refer to https://platform.openai.com/docs/api-reference/parameter-details + logits -= frequency_penalties.unsqueeze_(dim=1) * output_bin_counts + logits -= presence_penalties.unsqueeze_(dim=1) * output_mask + return logits + + +def _apply_top_k_top_p( + logits: torch.Tensor, + p: torch.Tensor, + k: torch.Tensor, +) -> torch.Tensor: + logits_sort, logits_idx = logits.sort(dim=-1, descending=False) + + # Apply top-k. + top_k_mask = logits_sort.size(1) - k.to(torch.long) + # Get all the top_k values. + top_k_mask = logits_sort.gather(1, top_k_mask.unsqueeze(dim=1)) + top_k_mask = logits_sort < top_k_mask + logits_sort.masked_fill_(top_k_mask, -float("inf")) + + # Apply top-p. + probs_sort = logits_sort.softmax(dim=-1) + probs_sum = probs_sort.cumsum(dim=-1) + top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1) + # at least one + top_p_mask[:, -1] = False + logits_sort.masked_fill_(top_p_mask, -float("inf")) + + # Re-sort the probabilities. 
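+    # logits_idx_inv below is the inverse permutation of logits_idx:
+    # scattering the positions 0..vocab-1 back through logits_idx lets the
+    # final gather undo the earlier ascending sort.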
+ src = torch.arange(logits_idx.shape[-1], + device=logits_idx.device).expand_as(logits_idx) + logits_idx_inv = torch.empty_like(logits_idx).scatter_(dim=-1, + index=logits_idx, + src=src) + logits = torch.gather(logits_sort, dim=-1, index=logits_idx_inv) + return logits + + +def _apply_min_p( + logits: torch.Tensor, + min_p: torch.Tensor, +) -> torch.Tensor: + """ + Adapted from + https://github.com/oobabooga/text-generation-webui/blob/3146124ec01f02c8fb1650a6517cf1b60b537aaf/modules/sampler_hijack.py#L16C17-L16C17 + """ + probs = torch.softmax(logits, dim=-1) + top_probs, _ = probs.max(dim=-1, keepdim=True) + scaled_min_p = min_p.unsqueeze_(dim=1) * top_probs + tokens_to_remove = probs < scaled_min_p + logits = logits.masked_fill_(tokens_to_remove, -float("inf")) + + return logits + + +def _greedy_sample( + selected_seq_groups: List[Tuple[List[int], SamplingParams]], + samples: torch.Tensor, +) -> List[Tuple[List[int], List[int]]]: + samples = samples.tolist() + sample_idx = 0 + results = [] + for seq_group in selected_seq_groups: + seq_ids, _ = seq_group + num_parent_seqs = len(seq_ids) + assert num_parent_seqs == 1, ( + "Greedy sampling should have only one seq.") + parent_ids = list(range(num_parent_seqs)) + next_token_ids = [samples[sample_idx]] + results.append((next_token_ids, parent_ids)) + sample_idx += num_parent_seqs + return results + + +def _random_sample( + selected_seq_groups: List[Tuple[List[int], SamplingParams]], + is_prompts: List[bool], + random_samples: torch.Tensor, +) -> List[Tuple[List[int], List[int]]]: + # Find the maximum best_of value of the prompt phase requests. + random_samples = random_samples.cpu() + sample_idx = 0 + results = [] + for seq_group, is_prompt in zip(selected_seq_groups, is_prompts): + seq_ids, sampling_params = seq_group + num_parent_seqs = len(seq_ids) + if is_prompt: + # Prompt phase. + parent_ids = [0] * sampling_params.best_of + next_token_ids = random_samples[ + sample_idx, :sampling_params.best_of].tolist() + else: + # Generation phase. + parent_ids = list(range(num_parent_seqs)) + next_token_ids = random_samples[sample_idx:sample_idx + + num_parent_seqs, 0].tolist() + results.append((next_token_ids, parent_ids)) + sample_idx += num_parent_seqs + return results + + +def _beam_search_sample( + selected_seq_groups: List[Tuple[List[int], SamplingParams]], + is_prompts: List[bool], + seq_data: Dict[int, SequenceData], + logprobs: torch.Tensor, +) -> List[Tuple[List[int], List[int]]]: + # We sample 2 * beam_width candidates to make sure that with high + # probability we can get `beam_width` candidates in addition to + # the finished sequences for the next iteration. See + # https://github.com/tensorflow/tensor2tensor/blob/bafdc1b67730430d38d6ab802cbd51f9d053ba2e/tensor2tensor/utils/beam_search.py#L557-L563 + # for details. See also HF reference: + # https://github.com/huggingface/transformers/blob/a4dd53d88e4852f023332d284ff07a01afcd5681/src/transformers/generation/utils.py#L3063-L3065 + # + # NOTE: Beam search is not vectorized, so its speed can be slower than + # other sampling methods. + sample_idx = 0 + results = [] + for seq_group, is_prompt in zip(selected_seq_groups, is_prompts): + seq_ids, sampling_params = seq_group + num_parent_seqs = len(seq_ids) + beam_width = sampling_params.best_of + seq_group_logprobs = logprobs[sample_idx:sample_idx + num_parent_seqs] + if is_prompt: + # Prompt phase. 
+            assert num_parent_seqs == 1, (
+                "Prompt input should have only one seq.")
+            parent_ids = [0] * (2 * beam_width)
+            _, next_token_ids = torch.topk(seq_group_logprobs[0],
+                                           2 * beam_width)
+            next_token_ids = next_token_ids.tolist()
+        else:
+            # Generation phase.
+            cumulative_logprobs = [
+                seq_data[seq_id].cumulative_logprob for seq_id in seq_ids
+            ]
+            cumulative_logprobs = torch.tensor(
+                cumulative_logprobs,
+                dtype=torch.float,
+                device=seq_group_logprobs.device)
+            seq_group_logprobs = (seq_group_logprobs +
+                                  cumulative_logprobs.unsqueeze(dim=1))
+            _, topk_ids = torch.topk(seq_group_logprobs.flatten(),
+                                     2 * beam_width)
+            topk_ids = topk_ids.tolist()
+            vocab_size = seq_group_logprobs.size(-1)
+            parent_ids = [i // vocab_size for i in topk_ids]
+            next_token_ids = [i % vocab_size for i in topk_ids]
+        results.append((next_token_ids, parent_ids))
+        sample_idx += num_parent_seqs
+    assert sample_idx == logprobs.size(0)
+    return results
+
+
+# torch.multinomial forces a GPU<->CPU sync.
+# Therefore, we use an optimized implementation instead.
+# Note that we always sample with replacement.
+# probs will be modified in place, but this is fine, as we pass
+# in a copy already.
+def _multinomial(
+    probs: torch.Tensor,
+    num_samples: int,
+    seq_groups: Optional[List[Tuple[List[int], SamplingParams]]] = None,
+    generators: Optional[List[torch.Generator]] = None,
+) -> torch.Tensor:
+    if num_samples > 1:
+        # This is equivalent to torch.repeat_interleave (which also
+        # forces a GPU<->CPU sync).
+        # This allows us to do sampling with replacement by creating
+        # num_samples copies of each row in the tensor, and then
+        # batch sampling the resulting tensor.
+        probs = probs[:, None, :].expand(probs.shape[0], num_samples,
+                                         probs.shape[1]).contiguous().view(
+                                             -1, probs.shape[1])
+    q = torch.empty_like(probs)
+    if seq_groups is None:
+        q.exponential_()
+    else:
+        sample_idx = 0
+        for (seq_ids, _), generator in zip(seq_groups, generators):
+            next_sample_idx = sample_idx + len(seq_ids) * num_samples
+            q[sample_idx:next_sample_idx].exponential_(generator=generator)
+            sample_idx = next_sample_idx
+    # Dividing by Exp(1) noise and taking the argmax draws index i with
+    # probability probs[i] / probs.sum() (the exponential-clocks form of
+    # the Gumbel-max trick), without a host sync.
+    return probs.div_(q).argmax(dim=1).view(-1, num_samples)
+
+
+def _sample(
+    probs: torch.Tensor,
+    logprobs: torch.Tensor,
+    sampling_metadata: SamplingMetadata,
+) -> List[Tuple[List[int], List[int]]]:
+    categorized_seq_group_ids = {t: [] for t in SamplingType}
+    categorized_sample_indices = sampling_metadata.categorized_sample_indices
+    for i, seq_group in enumerate(sampling_metadata.seq_groups):
+        _, sampling_params = seq_group
+        sampling_type = sampling_params.sampling_type
+        categorized_seq_group_ids[sampling_type].append(i)
+
+    sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {}
+    sample_metadata = {}
+    multinomial_samples = {}
+
+    # Counterintuitively, having two loops here is actually faster.
+    # The first loop can run without waiting on GPU<->CPU sync.
+
+
+def _sample(
+    probs: torch.Tensor,
+    logprobs: torch.Tensor,
+    sampling_metadata: SamplingMetadata,
+) -> List[Tuple[List[int], List[int]]]:
+    categorized_seq_group_ids = {t: [] for t in SamplingType}
+    categorized_sample_indices = sampling_metadata.categorized_sample_indices
+    for i, seq_group in enumerate(sampling_metadata.seq_groups):
+        _, sampling_params = seq_group
+        sampling_type = sampling_params.sampling_type
+        categorized_seq_group_ids[sampling_type].append(i)
+
+    sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {}
+    sample_metadata = {}
+    multinomial_samples = {}
+
+    # Counterintuitively, having two loops here is actually faster.
+    # The first loop can run without waiting on GPU<->CPU sync.
+    for sampling_type in SamplingType:
+        sample_indices = categorized_sample_indices[sampling_type]
+        num_tokens = len(sample_indices)
+        if num_tokens == 0:
+            continue
+        seq_group_ids = categorized_seq_group_ids[sampling_type]
+        seq_groups = [sampling_metadata.seq_groups[i] for i in seq_group_ids]
+        is_prompts = [i < sampling_metadata.num_prompts for i in seq_group_ids]
+        sample_metadata[sampling_type] = (seq_group_ids, seq_groups,
+                                          is_prompts, sample_indices)
+        if sampling_type == SamplingType.GREEDY:
+            greedy_samples = torch.argmax(logprobs[sample_indices.long()],
+                                          dim=-1)
+        elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED):
+            max_best_of = 1
+            for seq_group, is_prompt in zip(seq_groups, is_prompts):
+                if is_prompt:
+                    _, sampling_params = seq_group
+                    max_best_of = max(max_best_of, sampling_params.best_of)
+            seeded_args = {} if sampling_type == SamplingType.RANDOM else {
+                "seq_groups": seq_groups,
+                "generators": sampling_metadata.generators,
+            }
+            multinomial_samples[sampling_type] = _multinomial(
+                probs[sample_indices.long()], max_best_of, **seeded_args)
+        elif sampling_type == SamplingType.BEAM:
+            beam_search_logprobs = logprobs[sample_indices]
+        else:
+            raise ValueError(f"Unsupported sampling type: {sampling_type}")
+
+    # GPU<->CPU sync happens in the loop below.
+
+    for sampling_type in SamplingType:
+        if sampling_type not in sample_metadata:
+            continue
+        seq_group_ids, seq_groups, is_prompts, sample_indices = sample_metadata[
+            sampling_type]
+        if sampling_type == SamplingType.GREEDY:
+            sample_results = _greedy_sample(seq_groups, greedy_samples)
+        elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED):
+            sample_results = _random_sample(seq_groups, is_prompts,
+                                            multinomial_samples[sampling_type])
+        elif sampling_type == SamplingType.BEAM:
+            sample_results = _beam_search_sample(seq_groups, is_prompts,
+                                                 sampling_metadata.seq_data,
+                                                 beam_search_logprobs)
+        sample_results_dict.update(zip(seq_group_ids, sample_results))
+
+    sample_results = [
+        sample_results_dict[i]
+        for i in range(len(sampling_metadata.seq_groups))
+    ]
+    return sample_results
+
+
+def _get_logprobs(
+    logprobs: torch.Tensor,
+    sampling_metadata: SamplingMetadata,
+    sample_results: List[Tuple[List[int], List[int]]],
+) -> Tuple[List[Optional[List[Optional[Dict[int, float]]]]], List[List[Dict[
+        int, float]]]]:
+    # Prepare query indices
+    batched_logprobs_query_seq_indices: List[int] = []
+    batched_logprobs_query_token_indices: List[int] = []
+    largest_num_logprobs = 0
+    sample_idx = 0
+    for i, (seq_group, sample_result) in enumerate(
+            zip(sampling_metadata.seq_groups, sample_results)):
+        seq_ids, sampling_params = seq_group
+        next_token_ids, parent_ids = sample_result
+        num_parent_seqs = len(seq_ids)
+        if (i < sampling_metadata.num_prompts
+                and sampling_params.prompt_logprobs is not None):
+            largest_num_logprobs = max(largest_num_logprobs,
+                                       sampling_params.prompt_logprobs)
+            prompt_len = sampling_metadata.prompt_lens[i]
+            prompt_tokens = sampling_metadata.seq_data[
+                seq_ids[0]].prompt_token_ids
+            batched_logprobs_query_seq_indices.extend(
+                sample_idx + j for j in range(prompt_len - 1))
+            batched_logprobs_query_token_indices.extend(
+                token_id for token_id in prompt_tokens[1:])
+            sample_idx += prompt_len - 1
+        batched_logprobs_query_seq_indices.extend(
+            [sample_idx + parent_id for parent_id in parent_ids])
+        batched_logprobs_query_token_indices.extend(next_token_ids)
+        if sampling_params.logprobs is not None:
+            largest_num_logprobs = max(largest_num_logprobs,
sampling_params.logprobs) + sample_idx += num_parent_seqs + assert sample_idx == logprobs.size(0) + + # Batched query for logprobs of selected token + batched_logprobs_query_result = logprobs[[ + batched_logprobs_query_seq_indices, + batched_logprobs_query_token_indices + ]] + + # Batched query for logprobs of topk tokens + if largest_num_logprobs > 0: + top_logprobs, top_token_ids = torch.topk(logprobs, + largest_num_logprobs, + dim=-1) + top_logprobs = top_logprobs.cpu() + top_token_ids = top_token_ids.cpu() + else: + top_logprobs, top_token_ids = None, None + + batched_logprobs_query_result = batched_logprobs_query_result.cpu() + + # Gather results + result_prompt_logprobs: List[Optional[PromptLogprobs]] = [] + result_sample_logprobs: List[SampleLogprobs] = [] + sample_idx = 0 + query_result_idx = 0 + for i, (seq_group, sample_result) in enumerate( + zip(sampling_metadata.seq_groups, sample_results)): + seq_ids, sampling_params = seq_group + next_token_ids, parent_ids = sample_result + + # Prompt logprobs + if (i < sampling_metadata.num_prompts + and sampling_params.prompt_logprobs is not None): + num_logprobs = sampling_params.prompt_logprobs + prompt_len = sampling_metadata.prompt_lens[i] + prompt_tokens = sampling_metadata.seq_data[ + seq_ids[0]].prompt_token_ids + group_prompt_logprobs: PromptLogprobs = [None] + for token_id in prompt_tokens[1:]: + prompt_logprobs_dict = { + token_id: + batched_logprobs_query_result[query_result_idx].item() + } + if num_logprobs > 0: + prompt_logprobs_dict.update( + zip(top_token_ids[sample_idx, :num_logprobs].tolist(), + top_logprobs[sample_idx, :num_logprobs].tolist())) + group_prompt_logprobs.append(prompt_logprobs_dict) + sample_idx += 1 + query_result_idx += 1 + result_prompt_logprobs.append(group_prompt_logprobs) + else: + result_prompt_logprobs.append(None) + + # Sample logprobs + num_logprobs = sampling_params.logprobs + if num_logprobs is None: + num_logprobs = 0 + group_sample_logprobs: SampleLogprobs = [] + for next_token_id, parent_id in zip(next_token_ids, parent_ids): + sample_logprobs_dict = { + next_token_id: + batched_logprobs_query_result[query_result_idx].item() + } + query_result_idx += 1 + if num_logprobs > 0: + sample_logprobs_dict.update( + zip( + top_token_ids[sample_idx + + parent_id, :num_logprobs].tolist(), + top_logprobs[sample_idx + + parent_id, :num_logprobs].tolist())) + group_sample_logprobs.append(sample_logprobs_dict) + result_sample_logprobs.append(group_sample_logprobs) + sample_idx += len(seq_ids) + + return result_prompt_logprobs, result_sample_logprobs + + +def _build_sampler_output( + sample_results: List[Tuple[List[int], List[int]]], + sampling_metadata: SamplingMetadata, + prompt_logprobs: List[Optional[PromptLogprobs]], + sample_logprobs: List[SampleLogprobs], +) -> SamplerOutput: + sampler_output = [] + for (seq_group, sample_result, group_prompt_logprobs, + group_sample_logprobs) in zip(sampling_metadata.seq_groups, + sample_results, prompt_logprobs, + sample_logprobs): + seq_ids, _ = seq_group + next_token_ids, parent_ids = sample_result + seq_outputs = [] + for parent_id, next_token_id, logprobs in zip(parent_ids, + next_token_ids, + group_sample_logprobs): + seq_outputs.append( + SequenceOutput(seq_ids[parent_id], next_token_id, logprobs)) + sampler_output.append( + SequenceGroupOutput(seq_outputs, group_prompt_logprobs)) + return sampler_output diff --git a/vllm/model_executor/layers/triton_kernel/__init__.py b/vllm/model_executor/layers/triton_kernel/__init__.py new file mode 100644 index 
0000000..e69de29
diff --git a/vllm/model_executor/layers/triton_kernel/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/triton_kernel/__pycache__/__init__.cpython-310.pyc
new file mode 100644
(GIT binary patch for compiled __pycache__ bytecode omitted)
diff --git a/vllm/model_executor/layers/triton_kernel/__pycache__/prefix_prefill.cpython-310.pyc b/vllm/model_executor/layers/triton_kernel/__pycache__/prefix_prefill.cpython-310.pyc
new file mode 100644
(GIT binary patch for compiled __pycache__ bytecode omitted)
diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
new file mode 100644
--- /dev/null
+++ b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
+import torch
+import triton
+import triton.language as tl
+
+if triton.__version__ >= "2.1.0":
+
+    @triton.jit
+    def _fwd_kernel(
+        Q,
+        K,
+        V,
+        K_cache,
+        V_cache,
+        B_Loc,
+        sm_scale,
+        B_Start_Loc,
+        B_Seqlen,
+        B_Ctxlen,
+        block_size,
+        x,
+        Out,
+        stride_b_loc_b,
+        stride_b_loc_s,
+        stride_qbs,
+        stride_qh,
+        stride_qd,
+ stride_kbs, + stride_kh, + stride_kd, + stride_vbs, + stride_vh, + stride_vd, + stride_obs, + stride_oh, + stride_od, + stride_k_cache_bs, + stride_k_cache_h, + stride_k_cache_d, + stride_k_cache_bl, + stride_k_cache_x, + stride_v_cache_bs, + stride_v_cache_h, + stride_v_cache_d, + stride_v_cache_bl, + num_queries_per_kv: int, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, + ): + cur_batch = tl.program_id(0) + cur_head = tl.program_id(1) + start_m = tl.program_id(2) + + cur_kv_head = cur_head // num_queries_per_kv + + cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch) + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) + + block_start_loc = BLOCK_M * start_m + + # initialize offsets + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + off_q = ( + (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + + cur_head * stride_qh + offs_d[None, :] * stride_qd) + + q = tl.load( + Q + off_q, + mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len, + other=0.0) + + # # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + + for start_n in range(0, cur_batch_ctx_len, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + bn = tl.load(B_Loc + cur_batch * stride_b_loc_b + + ((start_n + offs_n) // block_size) * stride_b_loc_s, + mask=(start_n + offs_n) < cur_batch_ctx_len, + other=0) + off_k = (bn[None, :] * stride_k_cache_bs + + cur_kv_head * stride_k_cache_h + + (offs_d[:, None] // x) * stride_k_cache_d + + ((start_n + offs_n[None, :]) % block_size) * + stride_k_cache_bl + + (offs_d[:, None] % x) * stride_k_cache_x) + off_v = ( + bn[:, None] * stride_v_cache_bs + + cur_kv_head * stride_v_cache_h + + offs_d[None, :] * stride_v_cache_d + + (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl) + k = tl.load(K_cache + off_k, + mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len, + other=0.0) + + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k) + qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk, + float("-inf")) + qk *= sm_scale + + # -- compute m_ij, p, l_ij + m_ij = tl.max(qk, 1) + p = tl.exp(qk - m_ij[:, None]) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + m_i_new = tl.maximum(m_i, m_ij) + alpha = tl.exp(m_i - m_i_new) + beta = tl.exp(m_ij - m_i_new) + l_i_new = alpha * l_i + beta * l_ij + # -- update output accumulator -- + # scale p + p_scale = beta / l_i_new + p = p * p_scale[:, None] + # scale acc + acc_scale = l_i / l_i_new * alpha + acc = acc * acc_scale[:, None] + # update acc + v = tl.load(V_cache + off_v, + mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len, + other=0.0) + + p = p.to(v.dtype) + acc += tl.dot(p, v) + # # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + + off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + + offs_d[:, None] * stride_kd) + off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + + offs_d[None, :] * stride_vd) + k_ptrs = K + off_k + v_ptrs = V + off_v + + block_mask = tl.where( + block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0) + + for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + k = tl.load(k_ptrs + + 
(cur_batch_in_all_start_index + start_n) * stride_kbs, + mask=(start_n + offs_n[None, :]) < + cur_batch_seq_len - cur_batch_ctx_len, + other=0.0) + + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k) + qk *= sm_scale + qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, + float("-inf")) + + # -- compute m_ij, p, l_ij + m_ij = tl.max(qk, 1) + p = tl.exp(qk - m_ij[:, None]) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + m_i_new = tl.maximum(m_i, m_ij) + alpha = tl.exp(m_i - m_i_new) + beta = tl.exp(m_ij - m_i_new) + l_i_new = alpha * l_i + beta * l_ij + # -- update output accumulator -- + # scale p + p_scale = beta / l_i_new + p = p * p_scale[:, None] + # scale acc + acc_scale = l_i / l_i_new * alpha + acc = acc * acc_scale[:, None] + # update acc + v = tl.load(v_ptrs + + (cur_batch_in_all_start_index + start_n) * stride_vbs, + mask=(start_n + offs_n[:, None]) < + cur_batch_seq_len - cur_batch_ctx_len, + other=0.0) + + p = p.to(v.dtype) + acc += tl.dot(p, v) + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + # initialize pointers to output + off_o = ( + (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + + cur_head * stride_oh + offs_d[None, :] * stride_od) + out_ptrs = Out + off_o + tl.store(out_ptrs, + acc, + mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len) + return + + @triton.jit + def _fwd_kernel_flash_attn_v2( + Q, + K, + V, + K_cache, + V_cache, + B_Loc, + sm_scale, + B_Start_Loc, + B_Seqlen, + B_Ctxlen, + block_size, + x, + Out, + stride_b_loc_b, + stride_b_loc_s, + stride_qbs, + stride_qh, + stride_qd, + stride_kbs, + stride_kh, + stride_kd, + stride_vbs, + stride_vh, + stride_vd, + stride_obs, + stride_oh, + stride_od, + stride_k_cache_bs, + stride_k_cache_h, + stride_k_cache_d, + stride_k_cache_bl, + stride_k_cache_x, + stride_v_cache_bs, + stride_v_cache_h, + stride_v_cache_d, + stride_v_cache_bl, + num_queries_per_kv: int, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, + ): + cur_batch = tl.program_id(0) + cur_head = tl.program_id(1) + start_m = tl.program_id(2) + + cur_kv_head = cur_head // num_queries_per_kv + + cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch) + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) + + block_start_loc = BLOCK_M * start_m + + # initialize offsets + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + off_q = ( + (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + + cur_head * stride_qh + offs_d[None, :] * stride_qd) + + q = tl.load( + Q + off_q, + mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len, + other=0.0) + + # # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + + for start_n in range(0, cur_batch_ctx_len, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + bn = tl.load(B_Loc + cur_batch * stride_b_loc_b + + ((start_n + offs_n) // block_size) * stride_b_loc_s, + mask=(start_n + offs_n) < cur_batch_ctx_len, + other=0) + off_k = (bn[None, :] * stride_k_cache_bs + + cur_kv_head * stride_k_cache_h + + (offs_d[:, None] // x) * stride_k_cache_d + + ((start_n + offs_n[None, :]) % block_size) * + stride_k_cache_bl + + (offs_d[:, None] % x) * stride_k_cache_x) + off_v = ( + bn[:, None] * stride_v_cache_bs + + 
cur_kv_head * stride_v_cache_h + + offs_d[None, :] * stride_v_cache_d + + (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl) + k = tl.load(K_cache + off_k, + mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len, + other=0.0) + + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k) + qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk, + float("-inf")) + qk *= sm_scale + + # -- compute m_ij, p, l_ij + m_ij = tl.max(qk, 1) + m_i_new = tl.maximum(m_i, m_ij) + p = tl.math.exp(qk - m_i_new[:, None]) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + + alpha = tl.math.exp(m_i - m_i_new) + l_i_new = alpha * l_i + l_ij + # -- update output accumulator -- + # scale p + # scale acc + acc_scale = alpha + # acc_scale = l_i / l_i_new * alpha + acc = acc * acc_scale[:, None] + # update acc + v = tl.load(V_cache + off_v, + mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len, + other=0.0) + + p = p.to(v.dtype) + acc += tl.dot(p, v) + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + + off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + + offs_d[:, None] * stride_kd) + off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + + offs_d[None, :] * stride_vd) + k_ptrs = K + off_k + v_ptrs = V + off_v + + block_mask = tl.where( + block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0) + + for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + k = tl.load(k_ptrs + + (cur_batch_in_all_start_index + start_n) * stride_kbs, + mask=(start_n + offs_n[None, :]) < + cur_batch_seq_len - cur_batch_ctx_len, + other=0.0) + + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k) + qk *= sm_scale + qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, + float("-inf")) + + # -- compute m_ij, p, l_ij + m_ij = tl.max(qk, 1) + m_i_new = tl.maximum(m_i, m_ij) + p = tl.math.exp(qk - m_i_new[:, None]) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + + alpha = tl.math.exp(m_i - m_i_new) + l_i_new = alpha * l_i + l_ij + # -- update output accumulator -- + # scale p + # scale acc + acc_scale = alpha + # acc_scale = l_i / l_i_new * alpha + acc = acc * acc_scale[:, None] + # update acc + v = tl.load(v_ptrs + + (cur_batch_in_all_start_index + start_n) * stride_vbs, + mask=(start_n + offs_n[:, None]) < + cur_batch_seq_len - cur_batch_ctx_len, + other=0.0) + + p = p.to(v.dtype) + acc += tl.dot(p, v) + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + + # acc /= l_i[:, None] + # initialize pointers to output + off_o = ( + (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + + cur_head * stride_oh + offs_d[None, :] * stride_od) + out_ptrs = Out + off_o + tl.store(out_ptrs, + acc, + mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len) + return + + @triton.jit + def _fwd_kernel_alibi( + Q, + K, + V, + K_cache, + V_cache, + B_Loc, + sm_scale, + B_Start_Loc, + B_Seqlen, + B_Ctxlen, + Alibi_slopes, + block_size, + x, + Out, + stride_b_loc_b, + stride_b_loc_s, + stride_qbs, + stride_qh, + stride_qd, + stride_kbs, + stride_kh, + stride_kd, + stride_vbs, + stride_vh, + stride_vd, + stride_obs, + stride_oh, + stride_od, + stride_k_cache_bs, + stride_k_cache_h, + stride_k_cache_d, + stride_k_cache_bl, + stride_k_cache_x, + stride_v_cache_bs, + stride_v_cache_h, + stride_v_cache_d, + stride_v_cache_bl, + num_queries_per_kv: int, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, + ): + # attn_bias[] + 
cur_batch = tl.program_id(0) + cur_head = tl.program_id(1) + start_m = tl.program_id(2) + + cur_kv_head = cur_head // num_queries_per_kv + + # cur_batch_seq_len: the length of prompts + # cur_batch_ctx_len: the length of prefix + # cur_batch_in_all_start_index: the start id of the dim=0 + cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch) + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) + + block_start_loc = BLOCK_M * start_m + + # initialize offsets + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + off_q = ( + (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + + cur_head * stride_qh + offs_d[None, :] * stride_qd) + + q = tl.load( + Q + off_q, + mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len, + other=0.0) + + # # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + + alibi_slope = tl.load(Alibi_slopes + cur_head) + alibi_start_q = tl.arange( + 0, BLOCK_M) + block_start_loc + cur_batch_ctx_len + alibi_start_k = 0 + for start_n in range(0, cur_batch_ctx_len, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + bn = tl.load(B_Loc + cur_batch * stride_b_loc_b + + ((start_n + offs_n) // block_size) * stride_b_loc_s, + mask=(start_n + offs_n) < cur_batch_ctx_len, + other=0) + off_k = (bn[None, :] * stride_k_cache_bs + + cur_kv_head * stride_k_cache_h + + (offs_d[:, None] // x) * stride_k_cache_d + + ((start_n + offs_n[None, :]) % block_size) * + stride_k_cache_bl + + (offs_d[:, None] % x) * stride_k_cache_x) + off_v = ( + bn[:, None] * stride_v_cache_bs + + cur_kv_head * stride_v_cache_h + + offs_d[None, :] * stride_v_cache_d + + (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl) + k = tl.load(K_cache + off_k, + mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len, + other=0.0) + + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k) + qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk, + float("-inf")) + qk *= sm_scale + + # load alibi + alibi = (tl.arange(0, BLOCK_N)[None, :] + alibi_start_k - + alibi_start_q[:, None]) * alibi_slope + alibi = tl.where( + (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len), + alibi, float("-inf")) + qk += alibi + alibi_start_k += BLOCK_N + + # -- compute m_ij, p, l_ij + m_ij = tl.max(qk, 1) + m_i_new = tl.maximum(m_i, m_ij) + p = tl.math.exp(qk - m_i_new[:, None]) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + + alpha = tl.math.exp(m_i - m_i_new) + l_i_new = alpha * l_i + l_ij + # -- update output accumulator -- + # scale p + # scale acc + acc_scale = alpha + # acc_scale = l_i / l_i_new * alpha + acc = acc * acc_scale[:, None] + # update acc + v = tl.load(V_cache + off_v, + mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len, + other=0.0) + + p = p.to(v.dtype) + acc += tl.dot(p, v, allow_tf32=False) + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + + off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + + offs_d[:, None] * stride_kd) + off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + + offs_d[None, :] * stride_vd) + k_ptrs = K + off_k + v_ptrs = V + off_v + + block_mask = tl.where( + block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0) + + # init alibi + alibi_slope = tl.load(Alibi_slopes + cur_head) + alibi_start_q = 
tl.arange(
+            0, BLOCK_M) + block_start_loc + cur_batch_ctx_len
+        alibi_start_k = cur_batch_ctx_len
+        # calc q[BLOCK_M, BLOCK_DMODEL] mul k[prefix_len:, BLOCK_DMODEL]
+        for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N):
+            start_n = tl.multiple_of(start_n, BLOCK_N)
+            # -- compute qk ----
+            k = tl.load(k_ptrs +
+                        (cur_batch_in_all_start_index + start_n) * stride_kbs,
+                        mask=(start_n + offs_n[None, :]) <
+                        cur_batch_seq_len - cur_batch_ctx_len,
+                        other=0.0)
+
+            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+            qk += tl.dot(q, k, allow_tf32=False)
+            qk *= sm_scale
+            qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk,
+                          float("-inf"))
+
+            # load alibi
+            alibi = (tl.arange(0, BLOCK_N)[None, :] + alibi_start_k -
+                     alibi_start_q[:, None]) * alibi_slope
+            alibi = tl.where(
+                (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len),
+                alibi, float("-inf"))
+            qk += alibi
+            alibi_start_k += BLOCK_N
+
+            # -- compute m_ij, p, l_ij
+            m_ij = tl.max(qk, 1)
+            m_i_new = tl.maximum(m_i, m_ij)
+            p = tl.math.exp(qk - m_i_new[:, None])
+            l_ij = tl.sum(p, 1)
+            # -- update m_i and l_i
+            alpha = tl.math.exp(m_i - m_i_new)
+            l_i_new = alpha * l_i + l_ij
+            # -- rescale the accumulator by alpha only (flash-attention v2
+            # style); the final division by l_i happens after the loop.
+            acc_scale = alpha
+            acc = acc * acc_scale[:, None]
+            # update acc
+            v = tl.load(v_ptrs +
+                        (cur_batch_in_all_start_index + start_n) * stride_vbs,
+                        mask=(start_n + offs_n[:, None]) <
+                        cur_batch_seq_len - cur_batch_ctx_len,
+                        other=0.0)
+
+            p = p.to(v.dtype)
+            acc += tl.dot(p, v, allow_tf32=False)
+            # update m_i and l_i
+            l_i = l_i_new
+            m_i = m_i_new
+
+        acc = acc / l_i[:, None]
+
+        # initialize pointers to output
+        off_o = (
+            (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
+            cur_head * stride_oh + offs_d[None, :] * stride_od)
+        out_ptrs = Out + off_o
+        tl.store(out_ptrs,
+                 acc,
+                 mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len)
+        return
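# Illustrative sketch, not part of this patch: the running-max (online)
# softmax update the kernels above perform, in plain PyTorch for a single
# query row. Processing the scores block by block with an alpha-rescaled
# accumulator and a final division by l gives the same result as one
# full softmax over all keys.
import torch

torch.manual_seed(0)
scores = torch.randn(256)       # attention scores (qk) for one query
values = torch.randn(256, 8)    # one value vector per key
BLOCK = 64

m = torch.tensor(float("-inf"))  # running max (m_i)
l = torch.tensor(0.0)            # running normalizer (l_i)
acc = torch.zeros(8)             # unnormalized output accumulator

for s in range(0, 256, BLOCK):
    qk = scores[s:s + BLOCK]
    m_new = torch.maximum(m, qk.max())
    alpha = torch.exp(m - m_new)          # rescale old contributions
    p = torch.exp(qk - m_new)
    l = alpha * l + p.sum()
    acc = acc * alpha + p @ values[s:s + BLOCK]
    m = m_new

out = acc / l                             # final acc / l_i
ref = torch.softmax(scores, 0) @ values
assert torch.allclose(out, ref, atol=1e-5)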
+
+    @torch.inference_mode()
+    def context_attention_fwd(q,
+                              k,
+                              v,
+                              o,
+                              k_cache,
+                              v_cache,
+                              b_loc,
+                              b_start_loc,
+                              b_seq_len,
+                              b_ctx_len,
+                              max_input_len,
+                              alibi_slopes=None):
+
+        cap = torch.cuda.get_device_capability()
+        BLOCK = 128 if cap[0] >= 8 else 64
+        # shape constraints
+        Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
+        assert Lq == Lk and Lk == Lv
+        assert Lk in {16, 32, 64, 128}
+
+        sm_scale = 1.0 / (Lq**0.5)
+        batch, head = b_seq_len.shape[0], q.shape[1]
+        num_queries_per_kv = q.shape[1] // k.shape[1]
+
+        # One program per (batch, head, query block).
+        grid = (batch, head, triton.cdiv(max_input_len, BLOCK))
+
+        num_warps = 8
+        if alibi_slopes is not None:
+            _fwd_kernel_alibi[grid](
+                q,
+                k,
+                v,
+                k_cache,
+                v_cache,
+                b_loc,
+                sm_scale,
+                b_start_loc,
+                b_seq_len,
+                b_ctx_len,
+                alibi_slopes,
+                v_cache.shape[3],
+                8,
+                o,
+                b_loc.stride(0),
+                b_loc.stride(1),
+                q.stride(0),
+                q.stride(1),
+                q.stride(2),
+                k.stride(0),
+                k.stride(1),
+                k.stride(2),
+                v.stride(0),
+                v.stride(1),
+                v.stride(2),
+                o.stride(0),
+                o.stride(1),
+                o.stride(2),
+                k_cache.stride(0),
+                k_cache.stride(1),
+                k_cache.stride(2),
+                k_cache.stride(3),
+                # [num_blocks, num_kv_heads, head_size/x, block_size, x]
+                k_cache.stride(4),
+                v_cache.stride(0),
+                v_cache.stride(1),
+                v_cache.stride(2),
+                # [num_blocks, num_kv_heads, head_size, block_size]
+                v_cache.stride(3),
+                num_queries_per_kv=num_queries_per_kv,
+                BLOCK_M=BLOCK,
+                BLOCK_DMODEL=Lk,
+                BLOCK_N=BLOCK,
+                num_warps=num_warps,
+                num_stages=1,
+            )
+            return
+
+        _fwd_kernel[grid](
+            q,
+            k,
+            v,
+            k_cache,
+            v_cache,
+            b_loc,
+            sm_scale,
+            b_start_loc,
+            b_seq_len,
+            b_ctx_len,
+            v_cache.shape[3],
+            8,
+            o,
+            b_loc.stride(0),
+            b_loc.stride(1),
+            q.stride(0),
+            q.stride(1),
+            q.stride(2),
+            k.stride(0),
+            k.stride(1),
+            k.stride(2),
+            v.stride(0),
+            v.stride(1),
+            v.stride(2),
+            o.stride(0),
+            o.stride(1),
+            o.stride(2),
+            k_cache.stride(0),
+            k_cache.stride(1),
+            k_cache.stride(2),
+            k_cache.stride(3),
+            # [num_blocks, num_kv_heads, head_size/x, block_size, x]
+            k_cache.stride(4),
+            v_cache.stride(0),
+            v_cache.stride(1),
+            v_cache.stride(2),
+            # [num_blocks, num_kv_heads, head_size, block_size]
+            v_cache.stride(3),
+            num_queries_per_kv=num_queries_per_kv,
+            BLOCK_M=BLOCK,
+            BLOCK_DMODEL=Lk,
+            BLOCK_N=BLOCK,
+            num_warps=num_warps,
+            num_stages=1,
+        )
+        return
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
new file mode 100644
index 0000000..6d13cf8
--- /dev/null
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -0,0 +1,151 @@
+from typing import Optional, Sequence
+
+import torch
+import torch.nn.functional as F
+from torch.nn.parameter import Parameter
+
+from vllm.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from vllm.model_executor.parallel_utils.utils import divide
+from vllm.model_executor.parallel_utils.communication_op import (
+    tensor_model_parallel_all_reduce)
+from vllm.model_executor.utils import set_weight_attrs
+
+DEFAULT_VOCAB_PADDING_SIZE = 64
+
+
+def pad_vocab_size(vocab_size: int,
+                   pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int:
+    """Pad the vocab size up to a multiple of pad_to."""
+    return ((vocab_size + pad_to - 1) // pad_to) * pad_to
+
+
+def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size: int,
+                                              rank: int) -> Sequence[int]:
+    index_f = rank * per_partition_vocab_size
+    index_l = index_f + per_partition_vocab_size
+    return index_f, index_l
+
+
+def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int,
+                                       world_size: int) -> Sequence[int]:
+    per_partition_vocab_size = divide(global_vocab_size, world_size)
+    return vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
+                                                     rank)
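# Illustrative sketch, not part of this patch: the padding and
# partitioning arithmetic above, for a hypothetical 32,003-entry
# vocabulary split across tp_size=4 ranks. _pad restates pad_vocab_size
# locally so the sketch is self-contained.
def _pad(vocab_size: int, pad_to: int = 64) -> int:
    return ((vocab_size + pad_to - 1) // pad_to) * pad_to

padded = _pad(32003)
assert padded == 32064            # rounded up to the next multiple of 64
tp_size = 4
per_rank = padded // tp_size      # divide() requires exact divisibility
ranges = [(r * per_rank, (r + 1) * per_rank) for r in range(tp_size)]
assert ranges[0] == (0, 8016) and ranges[3] == (24048, 32064)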
+
+
+class VocabParallelEmbedding(torch.nn.Module):
+    """Embedding parallelized in the vocabulary dimension.
+
+    Adapted from torch.nn.Embedding, note that we pad the vocabulary size to
+    make sure it is divisible by the number of model parallel GPUs.
+
+    Args:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        params_dtype: type of the parameters.
+        org_num_embeddings: original vocabulary size (without LoRA).
+        padding_size: padding size for the vocabulary.
+    """
+
+    def __init__(self,
+                 num_embeddings: int,
+                 embedding_dim: int,
+                 params_dtype: Optional[torch.dtype] = None,
+                 org_num_embeddings: Optional[int] = None,
+                 padding_size: int = DEFAULT_VOCAB_PADDING_SIZE):
+        super().__init__()
+
+        # Keep the input dimensions.
+        self.num_embeddings = num_embeddings
+        self.org_vocab_size = org_num_embeddings or num_embeddings
+        self.num_embeddings_padded = pad_vocab_size(num_embeddings,
+                                                    padding_size)
+        self.embedding_dim = embedding_dim
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        # Divide the weight matrix along the vocabulary dimension.
+        self.vocab_start_index, self.vocab_end_index = (
+            vocab_range_from_global_vocab_size(
+                self.num_embeddings_padded, get_tensor_model_parallel_rank(),
+                self.tp_size))
+        self.num_embeddings_per_partition = (self.vocab_end_index -
+                                             self.vocab_start_index)
+        self.weight = Parameter(
+            torch.empty(self.num_embeddings_per_partition,
+                        self.embedding_dim,
+                        dtype=params_dtype))
+        set_weight_attrs(self.weight, {
+            "parallel_dim": 0,
+            "weight_loader": self.weight_loader
+        })
+
+    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
+        parallel_dim = param.parallel_dim
+        assert loaded_weight.shape[parallel_dim] == self.org_vocab_size
+        loaded_weight = loaded_weight[self.vocab_start_index:self.
+                                      vocab_end_index]
+        param[:loaded_weight.shape[0]].data.copy_(loaded_weight)
+
+    def forward(self, input_):
+        if self.tp_size > 1:
+            # Build the mask.
+            input_mask = ((input_ < self.vocab_start_index) |
+                          (input_ >= self.vocab_end_index))
+            # Mask the input.
+            masked_input = input_.clone() - self.vocab_start_index
+            masked_input[input_mask] = 0
+        else:
+            masked_input = input_
+        # Get the embeddings.
+        output_parallel = F.embedding(masked_input, self.weight)
+        # Mask the output embedding.
+        if self.tp_size > 1:
+            output_parallel[input_mask, :] = 0.0
+        # Reduce across all the model parallel GPUs.
+        output = tensor_model_parallel_all_reduce(output_parallel)
+        return output
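# Illustrative sketch, not part of this patch: simulating forward() on a
# single process with tp_size=2. Each "rank" embeds only the tokens it
# owns and zeroes the rest; summing the partial outputs (what the
# all-reduce does) recovers the full embedding lookup.
import torch
import torch.nn.functional as F

full_weight = torch.randn(8, 4)               # vocab 8, hidden 4
shards = [full_weight[:4], full_weight[4:]]   # per-rank partitions
token_ids = torch.tensor([1, 5, 7])

partials = []
for rank, shard in enumerate(shards):
    start, end = rank * 4, (rank + 1) * 4
    mask = (token_ids < start) | (token_ids >= end)
    local_ids = token_ids.clone() - start
    local_ids[mask] = 0                       # any in-range index; zeroed below
    out = F.embedding(local_ids, shard)
    out[mask, :] = 0.0
    partials.append(out)

assert torch.allclose(partials[0] + partials[1],
                      F.embedding(token_ids, full_weight))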
+
+
+class ParallelLMHead(VocabParallelEmbedding):
+    """Parallelized LM head.
+
+    Output logits weight matrices used in the Sampler. The weight and bias
+    tensors are padded to make sure they are divisible by the number of
+    model parallel GPUs.
+
+    Args:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        bias: whether to use bias.
+        params_dtype: type of the parameters.
+        org_num_embeddings: original vocabulary size (without LoRA).
+        padding_size: padding size for the vocabulary.
+    """
+
+    def __init__(self,
+                 num_embeddings: int,
+                 embedding_dim: int,
+                 bias: bool = False,
+                 params_dtype: Optional[torch.dtype] = None,
+                 org_num_embeddings: Optional[int] = None,
+                 padding_size: int = DEFAULT_VOCAB_PADDING_SIZE):
+        super().__init__(num_embeddings, embedding_dim, params_dtype,
+                         org_num_embeddings, padding_size)
+        if bias:
+            self.bias = Parameter(
+                torch.empty(self.num_embeddings_per_partition,
+                            dtype=params_dtype))
+            set_weight_attrs(self.bias, {
+                "parallel_dim": 0,
+                "weight_loader": self.weight_loader
+            })
+        else:
+            self.register_parameter("bias", None)
+
+    def forward(self, input_):
+        del input_
+        raise RuntimeError("LMHead's weights should be used in the sampler.")
diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py
new file mode 100644
index 0000000..5848fc0
--- /dev/null
+++ b/vllm/model_executor/model_loader.py
@@ -0,0 +1,137 @@
+"""Utilities for selecting and loading models."""
+import contextlib
+from typing import Type
+
+import torch
+import torch.nn as nn
+
+from vllm.config import DeviceConfig, ModelConfig
+from vllm.model_executor.models import ModelRegistry
+from vllm.model_executor.weight_utils import (get_quant_config,
+                                              initialize_dummy_weights)
+
+
+@contextlib.contextmanager
+def _set_default_torch_dtype(dtype: torch.dtype):
+    """Sets the default torch dtype to the given dtype."""
+    old_dtype = torch.get_default_dtype()
+    torch.set_default_dtype(dtype)
+    yield
+    torch.set_default_dtype(old_dtype)
+
+
+def _get_model_architecture(model_config: ModelConfig) -> Type[nn.Module]:
+    architectures = getattr(model_config.hf_config, "architectures", [])
+    # Special handling for quantized Mixtral.
+    # FIXME(woosuk): This is a temporary hack.
+    if (model_config.quantization is not None
+            and "MixtralForCausalLM" in architectures):
+        architectures = ["QuantMixtralForCausalLM"]
+
+    for arch in architectures:
+        model_cls = ModelRegistry.load_model_cls(arch)
+        if model_cls is not None:
+            return model_cls
+    raise ValueError(
+        f"Model architectures {architectures} are not supported for now. "
+        f"Supported architectures: {ModelRegistry.get_supported_archs()}")
+
+
+def get_model(model_config: ModelConfig, device_config: DeviceConfig,
+              **kwargs) -> nn.Module:
+    lora_config = kwargs.get("lora_config", None)
+    model_class = _get_model_architecture(model_config)
+
+    # Get the (maybe quantized) linear method.
+    linear_method = None
+    if model_config.quantization is not None:
+        quant_config = get_quant_config(model_config)
+        # Hard-coded instead of torch.cuda.get_device_capability() to
+        # avoid a capability query error on this platform.
+        capability = (9, 0)
+        capability = capability[0] * 10 + capability[1]
+        if capability < quant_config.get_min_capability():
+            raise ValueError(
+                f"The quantization method {model_config.quantization} is not "
+                "supported for the current GPU. "
+                f"Minimum capability: {quant_config.get_min_capability()}. "
+                f"Current capability: {capability}.")
+        supported_dtypes = quant_config.get_supported_act_dtypes()
+        if model_config.dtype not in supported_dtypes:
+            raise ValueError(
+                f"{model_config.dtype} is not supported for quantization "
+                f"method {model_config.quantization}. Supported dtypes: "
+                f"{supported_dtypes}")
+        linear_method = quant_config.get_linear_method()
+
+    with _set_default_torch_dtype(model_config.dtype):
+        # Create a model instance.
+        # The weights will be initialized as empty tensors.
+        try:
+            # torch.device as a context manager requires torch >= 2.0.
+            with torch.device(device_config.device):
+                if hasattr(model_class, "supported_lora_modules"):
+                    model = model_class(model_config.hf_config, linear_method,
+                                        lora_config)
+                elif lora_config:
+                    raise ValueError(
+                        f"Model {model_class.__name__} does not support LoRA, "
+                        "but LoRA is enabled. Support for this model may "
+                        "be added in the future. If this is important to you, "
+                        "please open an issue on github.")
+                else:
+                    model = model_class(model_config.hf_config, linear_method)
+                if model_config.load_format == "dummy":
+                    # NOTE(woosuk): For accurate performance evaluation, we
+                    # assign random values to the weights.
+                    initialize_dummy_weights(model)
+                else:
+                    # Load the weights from the cached or downloaded files.
+                    model.load_weights(model_config.model,
+                                       model_config.download_dir,
+                                       model_config.load_format,
+                                       model_config.revision)
+        except Exception:
+            # Fallback path for torch < 2.0: build the model on the default
+            # device, then move it to the GPU.
+            if hasattr(model_class, "supported_lora_modules"):
+                model = model_class(model_config.hf_config, linear_method,
+                                    lora_config)
+            elif lora_config:
+                raise ValueError(
+                    f"Model {model_class.__name__} does not support LoRA, "
+                    "but LoRA is enabled. Support for this model may "
+                    "be added in the future. If this is important to you, "
+                    "please open an issue on github.")
+            else:
+                model = model_class(model_config.hf_config, linear_method)
+            model = model.cuda()
+            if model_config.load_format == "dummy":
+                # NOTE(woosuk): For accurate performance evaluation, we
+                # assign random values to the weights.
+                initialize_dummy_weights(model)
+            else:
+                # Load the weights from the cached or downloaded files.
+                model.load_weights(model_config.model,
+                                   model_config.download_dir,
+                                   model_config.load_format,
+                                   model_config.revision)
+    return model.eval()
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
new file mode 100644
index 0000000..22e0fde
--- /dev/null
+++ b/vllm/model_executor/models/__init__.py
@@ -0,0 +1,107 @@
+import importlib
+from typing import List, Optional, Type
+
+import torch.nn as nn
+
+from vllm.logger import init_logger
+from vllm.utils import is_hip, is_neuron
+
+logger = init_logger(__name__)
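# Illustrative sketch, not part of this patch: how one entry of the
# _MODELS table below is resolved. The module is imported only when the
# architecture is actually requested, so listing an architecture here
# costs nothing at import time.
import importlib

module_name, model_cls_name = ("llama", "LlamaForCausalLM")
module = importlib.import_module(
    f"vllm.model_executor.models.{module_name}")
model_cls = getattr(module, model_cls_name, None)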
+
+# Architecture -> (module, class).
+_MODELS = {
+    "AquilaModel": ("llama", "LlamaForCausalLM"),
+    "AquilaForCausalLM": ("llama", "LlamaForCausalLM"),  # AquilaChat2
+    "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"),  # baichuan-7b
+    "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"),  # baichuan-13b
+    "BloomForCausalLM": ("bloom", "BloomForCausalLM"),
+    "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),
+    "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"),
+    "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),
+    "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
+    "FalconForCausalLM": ("falcon", "FalconForCausalLM"),
+    "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
+    "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
+    "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"),
+    "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"),
+    "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
+    "InternLMForCausalLM": ("llama", "LlamaForCausalLM"),
+    "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
+    "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
+    "SQLlamaForCausalLM": ("llama_smooth", "LlamaForCausalLM"),
+    "CPMDragonflyForCausalLM": ("cpm", "CPMDragonflyForCausalLM"),
+    # For decapoda-research/llama-*
+    "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
+    "MistralForCausalLM": ("llama", "LlamaForCausalLM"),
+    "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"),
+    "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"),
+    # transformers' MPT class name uses lower case.
+    "MptForCausalLM": ("mpt", "MPTForCausalLM"),
+    "MPTForCausalLM": ("mpt", "MPTForCausalLM"),
+    "OLMoForCausalLM": ("olmo", "OLMoForCausalLM"),
+    "OPTForCausalLM": ("opt", "OPTForCausalLM"),
+    "OrionForCausalLM": ("orion", "OrionForCausalLM"),
+    "PhiForCausalLM": ("phi", "PhiForCausalLM"),
+    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
+    "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
+    "RWForCausalLM": ("falcon", "FalconForCausalLM"),
+    "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"),
+    "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"),
+    "Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"),
+}
+
+# Models not supported by ROCm.
+_ROCM_UNSUPPORTED_MODELS = []
+
+# Models partially supported by ROCm.
+# Architecture -> Reason.
+_ROCM_PARTIALLY_SUPPORTED_MODELS = {
+    "Qwen2ForCausalLM":
+    "Sliding window attention is not yet supported in ROCm's flash attention",
+    "MistralForCausalLM":
+    "Sliding window attention is not yet supported in ROCm's flash attention",
+    "MixtralForCausalLM":
+    "Sliding window attention is not yet supported in ROCm's flash attention",
+}
+
+# Models supported by Neuron (architecture -> module).
+_NEURON_SUPPORTED_MODELS = {"LlamaForCausalLM": "neuron.llama"}
+
+
+class ModelRegistry:
+
+    @staticmethod
+    def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:
+        if model_arch not in _MODELS:
+            return None
+        if is_hip():
+            if model_arch in _ROCM_UNSUPPORTED_MODELS:
+                raise ValueError(
+                    f"Model architecture {model_arch} is not supported by "
+                    "ROCm for now.")
+            if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS:
+                logger.warning(
+                    f"Model architecture {model_arch} is partially supported "
+                    "by ROCm: " + _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch])
+        elif is_neuron():
+            if model_arch not in _NEURON_SUPPORTED_MODELS:
+                raise ValueError(
+                    f"Model architecture {model_arch} is not supported by "
+                    "Neuron for now.")
+
+        module_name, model_cls_name = _MODELS[model_arch]
+        if is_neuron():
+            module_name = _NEURON_SUPPORTED_MODELS[model_arch]
+        module = importlib.import_module(
+            f"vllm.model_executor.models.{module_name}")
+        return getattr(module, model_cls_name, None)
+
+    @staticmethod
+    def get_supported_archs() -> List[str]:
+        return list(_MODELS.keys())
+
+
+__all__ = [
+    "ModelRegistry",
+]
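# Illustrative sketch, not part of this patch: typical ModelRegistry use.
# load_model_cls() returns None for unknown architectures, which lets
# callers fall through a list of candidates (see model_loader.py above).
from vllm.model_executor.models import ModelRegistry

assert ModelRegistry.load_model_cls("NoSuchArch") is None
llama_cls = ModelRegistry.load_model_cls("LlamaForCausalLM")
print(sorted(ModelRegistry.get_supported_archs())[:3])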
diff --git a/vllm/model_executor/models/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/models/__pycache__/__init__.cpython-310.pyc
new file mode 100644
(GIT binary patch for compiled __pycache__ bytecode omitted)
diff --git a/vllm/model_executor/models/__pycache__/baichuan.cpython-310.pyc b/vllm/model_executor/models/__pycache__/baichuan.cpython-310.pyc
new file mode 100644
(GIT binary patch for compiled __pycache__ bytecode omitted)
diff --git a/vllm/model_executor/models/__pycache__/cpm.cpython-310.pyc b/vllm/model_executor/models/__pycache__/cpm.cpython-310.pyc
new file mode 100644
(GIT binary patch for compiled __pycache__ bytecode omitted)
diff --git a/vllm/model_executor/models/__pycache__/decilm.cpython-310.pyc b/vllm/model_executor/models/__pycache__/decilm.cpython-310.pyc
new file mode 100644
(GIT binary patch for compiled __pycache__ bytecode omitted)
zZt5^lsC;;L_pq0tvcMmg#_qTbbz|*euidc571?$brc4!}p|6V{;H`+m=vdyCEn?)L z+!4V_#WvC+A{mejN!Z6$EmtNxWN@%TkR%|Uj&+xsAyaoUbvsipd-ird5uRPlh*#!% z`?*%hto%L$$#OmEQvNrtgi}ZYt&yCmq=Ul_v;Dz_nFy6 zU@Fbk1z}2jfkY$W%W~~O#UXr<%>LHvXSk4p(Dqw&$PZEFA8{qLrE>>Ysg2++fx)tj2A@_9zR7epLGWB^2`mofFJW9AIdjJQ`*q z66Q#aLI=2d*a3##fdd>OHip`8h@pYxy$kIV)Sz47dm>)OZBoys0gtm`2hl}wzI`ne z>(VC4N)a zlGU4JJWBEy$Su{=5gBhS-iPFwjUX*#NXY$>JBrBWf$UtN=gjT!ntz$$wnEF77+~`$ zed6nA-&^YTM3Va3g+m|^xrWv0)#y&=i`o_8j==MC_YdL^(0y^zQZE8_c z%-z8#*fXL8n9ZH0p$y~0O1dV?ki$~hicr=WwcOo zyD~Vkjy-ck=fK^^yTy-4-Xi%Uke2luAzS=0E1jN_Df;4!sR-1SFj1<3>2mG?rR3zo zV83HHF41&?|nQr5^!yX?)+sl{^Ci*tLv16Tr0) zRn*Md``Y9811!Q1w+*}Tp5W6DfGk7P{K;_r49Hd|Aj>Et!g6ZYk^pN4mW(vw00KDV zNLmU@p^I8BK-M4!W;>4}W}&r~RVQ2*`Jz;R zfXLgEhbCw$nn%Un#2?^s1V)}<>n#wkGKA@h$Qhy@y}(06Odg(5S!M2J9;`__RxI%W zLU2}*E>8YPB4cdgn=JemTbq}5bADDNfehz8`CSkT#hYxwAnx7LXV8t4a8#u1s~qt% z3SEbkJZ8?f#fWJGZBgSrmeaOv=reFO-iKpkcH37Rk;8G?R{%3op6xb`?EeRPf+~(k07d|&8e$3tay1;7Sg;@Kj40rAb-ttq zt3e~I2MZ{zqvj&gOL@BM$j<{zZ(iDm4H!blv+4QLO9OGHJxbcq`3t`l$_Qh;q_uni zW%RavYP$Rw6q->&n90B!!fvqhWWIzjD)RfRU8X%Yi49GKe^G-X_8TWUG@oIx=!m}~&Y3TaF8&HN#9xxU3NkrRkRIe;gW;9? z;Z=N`K(M|)PMlf!y1Y3VM8zxyF?Gsk2a#7sU;&4{Swk*+e#nWhBZlpCGb?rI(jlfH zvNAtY@GPn$6(#5JBuaqQm2qXu+^deO;|fr_ysxYHwqR#>+SU(>Cw{QAP_)%8 z+S)Ekw$ZzyEeKjAaPU=uK521ZLuJ)tl?``;|*P$x0DLN_Q8E7#Q^QTCacEr;D;{ zu3f^n4!adNDQm^?*V^s^bB9l{*0SoMk5gI>&NlFckyqzQcaR^Jg*Zq|#0hjDvkRXn zb8y6xTVE58^E+DY`vrHSuGd)G(w9D+I?O33eYp zZQ_f)s{B?aD88J@Tvqz?6uc-$q8d~yqVTgk%A2ccKQjJ(clyQFtjx{6wvhOB z@iV-7y|r+p?f-vLyfih%!z6!0@&%G#k$j2dU*O53vBUc{O$nI^e z>=d6PSTMn~vNS^PfWlqOKU1)b#vQ@7Tc68p-wy^Id@W(6*SNBI5Qzwx8xL}X1a^52 zA&p6|g74J&3E_sDjCkxl#xGRXa+I2Up@OYF8#R<(koWPUD(3d*+QHX5^0ON6@eivz z@y2ER^+T&pzloD)O&?lSc7(RCw1vG#KcoZsCseuBHOKj>lgyJeI3rJe$g>CHpMMA^ z=!zUYhsD%PR5~@Q_uPjxojU}Rj~)&VJ977c3hwB~r%%X()-`-f5Rdx&29QT4gP}Zv z^G+Z2;`^SQW`P_4p<{Ss9=1PnPo&k9tu7BC>-LxFo)LRatj(2$&0)&$s?Q zNM^w$#pghI*oWs|;)f)3<^(M&DF-vcC&iG$mpNa=cE+PG5ZjYagBZ&vR7aD?e>ny0 doKPz`_~#c@bq$xiH}poc@p|Ls#+l{T{{rs~mG%Gt literal 0 HcmV?d00001 diff --git a/vllm/model_executor/models/__pycache__/falcon.cpython-310.pyc b/vllm/model_executor/models/__pycache__/falcon.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a6f8b6d183e49218e1ccc5025ec15880d7189f6a GIT binary patch literal 10180 zcmb7KX>TOQdG339dQJ|xy!PTst79yQ)M?4EB1>9XmZe=vSshL~cG|sDO>)TYp4sXi z?k>~JVJgr%$=a54A_xM~2mXa| zj||s{OxKJo*HUGE!7U))2n&(z+9Jⅈ?3xB5Z}FsO(lmxDZyOnp+cLJFG`j?vw}@ z!$vgiPK&S;&O|5N6Va?Y8_l_M(Y!k^(o5k&wCFC1a5+2~opMi!a3wq)opH}ZXWg?R zUJaL`bM842u7wXp54#UX=iT!nUJoCM9(5lT;i>Sk=yCV)sOdIEyb*padcu7|gr~zN zqo>@bR3$Kj!saqPYubY1t^`(437!(+GMfpWW+z0R!DiXq$A4&XOycBg<5H8=LMO|gu-E-0$fthuosW(cn!4P%b~Fh+*wi2I zbOVG7@B0xxfH$>#_M=YAzcMKO?dVFtSUX;)0rfk+^j6?AR9eE=k`DL8+<4u7wujMq zJYc<6fY#1XCzH!@KinFF`O_hKf!`EzZa(7wV=Jek$Pdr zX93R_HrDz>lIe-pP6O_z9gcP?{SSS&mtyvjU|;^8T#~$&c2Ym|;$Gx!-~*G~;z8Qu zaSJ*+{MiT<@La%?{3`@gQ)QnXEmIEE19e*2Ra1S_fTqe+q%mz?DJj$9L7vI|MuAb`qcBWj>ancF?I_GIhi4PGqb{hwyrUgSs0g*8PI^j>KzSVcSnb4s;8{Oaz%>*ZL zTg`RWj+&+1i2QUTH`l^Whv!DvS${S+G1aXN*OJTb7Lezs&pqo}*dg(H;F|3i(OmC# zuH|~vj&r@`r%jumLc?yO6?T#!Nj>BS-0Q4)&*sJz>}%H&DT!+(AvSH|I>Ggn`<}>~ zrd8uXE3|i#Lq21^YluBeezV- zZ*4*Il9lZ+j8^2<4z35S9zIG&la)1r3CrCZ`IIM=77evMIjpavlVllUMm?n(s&zQ4 zl~r3aWb8{rw^ZZM(i~Nz+%F8HjxvVMpF))uo+CeDMUe78mmvZL_CayAqRJ{v3v{Lj zMqn-|YsM$a8teiyKd!hIvw)N~D+p-^MOF+P=1^?SVI@`udX_#>48<*Dm8*G8qS6p& zR&xt*2ZYKip#T^!63Eh`jWGDpu!?vnExC++k}7*pf&)dx*4u<4c-Y)~%C3%>ftZ;o zn`WlZVgqhz$->*pkFZH8Wl=${g)r)Dnj%&}h#qqWd7B!_KO@Skn>NbYXt9Vlo=uy- 
zhM0p`DJ~&i*s~G-ljFnpZ;_eL;c7>(vp$i2AW9W~Pn(X^J1HCnCGc;0q;e;PvU(Fd5ip)kf-=B-&R;*PZcw{$bXfd&QxZz z;-0>1h%go#VKY60(sEAC%wXkPv>s>EC8UZ`<2{4NVHJ^cjw$bBT$`tUrt-z~A*`X5 zQ8~cJZXvUnHb=P>v(twWGWTdDXAm!D#jKFoe+=`zfBpdRd;7?Wta^*qMf7rSKC6w! z(5$Ztpy5l{V;|hsfdOql$t%fLyX#TV^BLpMAaflLe*gWgroo>>fIm-ya`LJ)LjZ;Z zW+K^n{4x-p-&pcvw&at}F8MrIih5}eHtWVxaJ?1wlJ<76oI7L}y>tWCxm|w@P0dCA zbwCmz0>;=2239d?D@y=K-9-gz+5;%MqTKm^|6&;J2&TFE$w-wy* zw*9adxST*zAk$$_z`j2J_j#`(#?Vs);KxI};D zc#YaI@$F;>0q=$xW=ER9NI)~A??a|F1Vc#svmC^Xe*~{Y49h| zh6y4BJUYfs83)#G8EJ*AEP5=X$8u(^(IG%zy>o~kKZikq zoxFh;RI(ekQ;sv8!rvtE4hdRN{vL_9A(|CV=9+&<;v*8@Bk@}hu6}iETk4K_HCMNC zb=xg$$*FP+{W)_>v^TwuV5oBI5OWdxCoimZ_%)xie?m$!15qttuc%AdGY!0F#Cq<` z9bdD4t=sDA?Fs=(>_-6r#qo$K_d*JnJ!rol${bEh!pXjKS|G7)`J)Ks)(3+0=Y@AZ zxigoKuY!o+Cf2A4_Oa#(0tQ)p*gGL8cN+s3@3_gT*f8MLB1eF21)q z(*7f~hqf)v3r$P1196o05O>e89WxzN0K^AWLIbBKgJ-4|5eB9R^ITwLibc6SY+^}j zx=qw^1Su$ihR9X~f$CEdtxpOnWTJ_m8yLX{{8`AnjFpR1Pck>xSP#P95o`DY3JEl8 zR{1w6OfbZMgG8AG+0DEn;MLpk(vJ8124AHNVtTm_yymY`^eqzKB~gPI)OcQ0*=U+_ z-w&ZvcF?BcVh0R&vFQF^ke-lnbW9+RKp%-vWwU|ZEV0cJs7A;JRQi0UFb?3JF~zkm%2FeqvfU_rvnU52mSeeLox>G&A5+p&~BuM zrak!^h~vtE?M<8ANC}xW!LF^eFbsp)BNxZlO^2_fVBiQr4Q>$&uN;e4TT84gKpBCKCTS3Rrt3^5bWetq5|HKUWqva2MW7#4ql~# z1uEDG$xD9(P=X7Z|d7x;^rg|>=d1-4)X3Y;b;wU5?u4iO40to^3NFJ~pu)@8EzC{5=x zD~S-Jx=!k3&Yp@?*<*lo>Jeo{Eh;B#Ihi@9fGyC$S%VXB(hN0AV_PG^eDIV+k3^S* z08as<+Z4S@g3S1XNP`U)+-9)N?%afZLPWux!Uh}dERj?nm6cS&t&URY1m!wZsh?+4 zm{N0VBJmJdW5V^5km)cAL{H1AqrqMSn(5EETeXO`g2D>&TGfeKTAFPT@b&+&8+R5b z?7DFjo`kv&k2`=VSzAwZM%#|SiU!e)8eO+j-#18?FH7T)A^u+t(!w2=(~ zq+ksD==&3OO(Qz|v7J_tU)xjPQ1)QWP@|qg9XF|+26-QmGA+Q+SUuw2BqK@#n#PmN zLabvLKPbSZsoYdO|dF zF46uWlV&NJ1{6Bnd3-;1`j@ ziE8o&MDA>pS6iM=Yw#?yu`@g;1aqBBf(Z-9z9}7*Q{oIQ&)fo+lU&^ph|f7SY5>{S zZtKS(TA=gqQP(s9g6w>gzI+ieB0EwSG+;1D&X;uyy7c9&wKFqmF7l$Z8f}&cyvd7$ zqg5hK6!${1oZmhq6Vd<@!AAj(W~!#*;IyaG&iFx{fEh;FS|I060bXag=Tp~_`wbf9c?6E~DQ$6pKGyizg=*X%ef|!)W9T#gmCh~?L?2dxpBGNHpbA_v zf*O3o7OSy3PCE6XTAYICc!4#7DK?GVDInMk!Y5>S%@%iSrrQu0HVq70%=~#c2NG6CKhyXB0Pw7rWl0P~VsX#M0L2M!2m+DN-p{wK(9g0$c@ zv30Uplk{hYzMzUsqcE_@#$ChOS&(`&K6=a;xe%mMPKl|ZOOt_*=E#|sC_Nd33wVTj zim`)=k;57^4mVbrK3FTA{~3z$KZVFGdHFUVuwvM^kl~hMT+85$xdlJMu@A{DxlTW( zZy_8jo8-tcw1E?9{3~S9-0*QTlQ-l=jc{JzMzTwHQpFbeCtwzMUZSf(58sW;svLA` zyPd#|dt{Ml@rlce5B*0B0MLlf?6X;o(>(yda@PXuq>#a97?^Dg&;*RaJGBT9#n}e_ zr2`mU_{YWBhL(8G0NCCJyDbvS+_xE+Fn<9aD0>_4tB(=K1q7FHt5Dol_-DApEA^cP z<;>Z1b}Ly~aM8~)XTOxe0g#o+&kp7cR|2l}tTi2ec`AjgM5a*Q9Ign_o3jtTDpCRP z2l{3O_Iz0X7b>$pRgc#Dg?d@}iLzTA)kGiauTishv?g6T{u;IGX?3%fRkK=FU)M2m z8yAv}n2%ywhx24An}XNQ+0#_z>SKbyIIDaK%eG@K6ML0jI!97i!3fSElm9jetf?ZL zd3oi?g>iFA@|IU9|5Xw%k$4RvH(0xsx^>!DOn9ZCBmqzO0%h88_9oZbsq~e=H@M+< zgWT9|2iIIH_Broda|__^lq{TQyh#nuLo^#w!#|~jpOg4=5`RJB6bXV*_ayNT>0=i+ z(jM9w+#%#8a-T;DKC+eO2940Ih#PR$zYot%QG)*^H8p)C%M)XAwQE=LY~k7Fe?`5H zNH^K#ye!nFe~V84s*(^o*(Uihh~^t0`vjGcIAYS9;zV$8yo3Z09cqVFa_b!$I0hj8 zCAs??a`NG+8uncStqs$WNBLFV#>FVL+%G?tK4Z`!);KI{#((Ubxfk5Vhurtutn!~> z*sEZVK7HgB5-k!0F_WB}{3c~E5*mqP3}KGah|KYyLgadaTZX@<=o*R9911%l2+og@ z(Wd!(8-Zigp>{8Ia6Z>WD$S8)ICr%YWQOEAi!{ zSK`aZ{QJbC+(#xAlmvPizQp#nPq%2ox#*sMP}zZ3MBLf9=N?d0URFzQW`(|#Orh{n zx=%cyTL0E=c%LV3iza_)fBXX`LA=6!?g3591IO@=Yt#i<8bMbkS0C2srR^|T#=$A- z#clkD1YJCKy24xFJ~OG~V>O3S+)n|tPEfZ*uqQ9iJv4yRHg}gMb<~GXF+c%l0uSAa zD4PU8thf~j4ROzkwDHa54Ze&1%OK$=X;?zlNXv2^{~OZ3k(1{!;fWKTH*wk&%unq5 zF-044x!isk+Y|R0uRz(9gmBzw*xEcC666rWpLox!WuQwPvATvwq#{K3ZF7xY-wU+C literal 0 HcmV?d00001 diff --git a/vllm/model_executor/models/__pycache__/gemma.cpython-310.pyc b/vllm/model_executor/models/__pycache__/gemma.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..12cc644c7b76693324bdf6b84f2803a9333409bf GIT binary patch literal 8525 zcmbtZ&2t>bb)WB<{lsDcf&@fTONp|)k_AXs5?d}s5>1J+O%kF>MM)-dCd2K)Vu6|6 zWzQ@>kVOs#tE3ALu2d?A*j1!&I^|EuHHTDAsY+F!RFxBNx$;ZQ@4cB_EJ&ECRA#H+ zcE9QA?&*H-_daG&tCcleC;st2yZ`#Oru`Q+W-kkkk5JOfx~2)u*Mu&NSWgV!NKD^M zEZ@?JXU4W~G*|D2=zQ=VZE+u8Z%yl=eBvrr4buX?Zb-&K_QaqP5{07&{ z@qE(sn_REN3(2Ctm>ltsBuoBMa@0S{{AzqGS@xH?UW-?fm;9HwUXPC_C;Ss!pNmf> zFZ(ZZy%D#PQ~oKg&&RJMullbhr~T91ZpN=A-}AqxYmpV%Tdz~uFqSp{`=4rJK`h?Z z#A4)Zwf9W_4RDT#C2*Ds&YR#I7019iR&ag*&azkmXQkl01} zGfP+R({Z&;WnJi{nesjxX1zf_jFtWQXc$N9ZCx4tK4^spK0D}d^tx!4FAOf9nYL;_ z?+-`W#V89!n1yuv(5Ge_7_IuHup5apSr+waba33u7q6@jWYV@&<3g_=g%S^M z4#ZExG*T-UBZ>ZJ2k|KBUkYUy$5EX3QuDt&|HZSc%Yz#SS(w1kOA*rFf3*cZVY5Bg0#0ogEe?CJTrY;Lmd&( zMmUPIAm7v=9)u#2YI$>`7^>LppqE85%m%cQTCu3lMj5uQL#Hr%m5N6C0Vrft} zq>HjJB2$Ox=CgfYfS?^t6kFf(h6J2MjC1ZA!?&*X!c@M5R@+l{ zIvQdly&&lIdsz_38u+T(rPYpx!B7sie}!YWU>G5>N{_*r8u&;hGAzL5}dAG zkK<%5myYOW)EVI@^Jco%CB0i6-jXd$04UVbV<2V6(961kpZCPF=$~p;YiE9PBCC^k z>(CBNF4|(+-m@s_8z5utt~S>1>0cVV#;%DLw)vjEtD}aEWxfu+eg!YL-ZteaJR@Hv z@(M`XlBe;f^qan0Y@X5s`8wTa@_oANZVcp&P>M5Xq#j5eYp?5;zSEelw&)_;xZN(% z$@Ie{3IbIQNY+O&)vG~pZ4}1E9TP$(>1AY?JxmO3h4qWp^EA6AQuR$fFWQs*-k11>v2gOD4 zP)d;{%8?y8(5i|sHyW%}RZ)XhxuPCf)4f1?M z)p^IJkau5eY1;h;UU8jeQI-0mB-o5Xk*X?Hx33p1kC=hzCF*Dr-VBBVNFXpXh$dK- zsxskPkZl5PS)Ao@ykUn9Y;utfP@8r=g@u~YQ0S)?7@4`J?RwBjEr5P*Sz~VrW_a63 zkN!gYHF^+dh4j-NL)Zcb9B1rp+j1?h6}?xfcfU0Dh+FV}OuWp6uJbc1+8)i~jR^-z z;MpjpMH;(|8aa^M|dWu%*2y5&L`wrck%Nhljo`&PVcW93J ztXVu$8q-`g!o@qN1q<0C+B`r0t0F;<+kZJHPvZacw&i2n zvEjGqZ2ao%*(mEO%$8s4r9nR$+zis}Rt!@i{{#=Vm*mU%%cY)CXaP-sF2h)HFckv% z$HZ;!bEooPVkRC*{s>)^b1jEy7ahnV`ECl-f;p1!(KL1H<6XneeQb0Fd80x1CNBYT zlH|H6E6lQfyTtgA!z3+be@6LZYVzO&oxYn5d1A>*It34Y!C#<5`Z-A1b9DMO^(tst zUo@Lm6SX>i#DNY2R_Moo_|WijkOVkBHr$*~_q<0Aw=BK-WasEXm6`&lDovCmRkewA z%U~i>DXB1$nyUy6&A}J(!hAwQG zU6Y*HJI0shuDk2)LOSnSkW+W;-81n7cw_o6??R&Q+Qc81#^rHkY~6EqA$511ah1oe zVQfMRbN&8PTyP3Z$X45xA7UY}eZ#ny$v;J1k`o}$5cw&Qk3ib>{K#n0@?#>O5IG0p zn-Bo`CTdE*ru1#4U-#YZd?S3f*b%=%2OkXB{ZLN+KZC&xzXa2JC@F1G-EHAOJ$*5^ z2}{o$M`0OYd)`H7`9~n@SmayOSS7MXge;@7H{c;XYZH_WYuedV&gbleDfj#rXX)r9 z9Ss`{`3}_%GZR0=jh~^gHv%)^l2};}K%Y*`g~8^+cw{)nL&r597;gQ6@ty@^V!@b9 z?pjXsp`_6|CV-p=J7X4{F^A2DEgW{n+{h)EVOokxFfe794U5_^ zGgefg8WoBcHmZWm&8#VzQ@N|~anylNFE50*BDt=d4m;vA@Ne?!azb@70ra0rkbMVj ztdk&<0br5V0jxS8Q3t&1TP_NWg3#}{=u^rFpy6F?RlqaukRhmMHMEIWhigY8%{|9y z6fPYEnE~&Gcmi?ZPBXrlZ$`tkJ^w6j(y90hgqRTAJdnsuX3pG>ZUxuFcogMKM$3Kd zQYCT&>B&BO3+xLjGl_??f^k(6Is= zv?t(pb;viLTvVq*Po!UhjBTNf;EIibZ4;c^HTtl_a4?Orea`@P@jaj(oJ`n%2=yIn z7l#d3^FWWgCOe_b-7tV;#yULBLly__Utn(eXGH#z$QK}O=MO@d{446Tuj*4Y`e*P{ za%ny6T2;5OSKd=-F&h#?cOF|Famt3L?<_yRg9qH#=fH_J>RC_qGvZw$@=GENM&zCU zFNky*UH%@0&mD+lTTcjy9LAdZV`Jr66rx27PM(H}r|h zz??E1SH>SWt`$)|fHrlC+!4rJr2N6U+!t$hF-=}WPN+^sl>SpX+c+0L6(ITZa8^7$r$Ea;NOCsm2=9 zLhavC(nXN6O^z2~jq_c@@{IC#OV-Z9A@8P2IsWxT%F3)t6E`Yl|Kfo2M3U*EoTX_t zK^RDIqJS52wP7uUzM#FQ?E-jqos3ceBojKF{I3>vVQFB0++_+!C?KJL11*FL&Yle@ zf&+GK31h+y8#@e6He4>7!KRDpaDDqBOihSNeza^)uA-^TIO@v|-4Gz{at=2UF=)AR zgUC4|jHQG*O7AI~cbLWHFbMh<-BQwrvfF^)&@2LcfDeOE`B4Kt>rQiKWB)&>L*hOI zJ!D4yupdMK|8FQMo&F)H)107A9y}Y5*x#Y2XqOILWESa4{{v)#I9&vZzlZh;vugUID?$mmibjbxn40VMZ4?pXvQ zlQ+}lHFTP|4TlVz>oG5Qf7PTiCu!9L>SYfO7#xP3W2~REE1qa6VS$?GjX_+!A2QPM zOfEVu8-*85Nh)46qaA`o%FUU`#vwzWABiO;`ta%*sFagugp0gfzU&lT{7OH%i9B>} z6%M|La`MA^WAZLlxQqNQd1O5ME_qJ`_efEyCZBqqs6%!ep9D5lsY9_9av72Y6B#GG zeG=jWfQ+v9QY4%XoKXWyo1PIFlpF|l?TlU=ncqDf&~6~78&dSH|9TU7o(SGWHNVSH|Ug=!KjQLZaQ;xVB~Q*2nd64RLdI&(x7uzIF-u zB=0(c;4yMUKhiK_(HlBY6ISN#nUiPs3|$+;*+4w5L`p%BVK z*n|w^QngCradp(Cn7-%1cc@q%jn 
z`J-4-3hxB#^C!&w#Z{RfV3G98=_ZoSdELwFJ)vyw!3+22M*Rb0K4XTF& zj%n80%g-b^{3ZHefL(&#!JnIM6Si+O91diLTWk!9`~LhixjT8pU*2a8aZn6I`Dl_9 zaVAP}so5ezp^ESJ2SHbcB9B=P^B%|-lt+n-Kp-uUh_9&m4v~G#Ven@+l2RTIP|^($ zPjd_;QFH_|551ZNU&pYqjhvUK@Me;icWU4k>6LwVqx#UO5I4`@mmeDyPSrf7R1L;0 z_|3c8f$!@*5EpOTu7&=iGa)W^kEPf-&d-Z<1f3?=`E1$)$br zUtvy;hm$Wjs|Qnq9OC(}9o8}TyjSsUsdpXURR?`ZSr-5Fi~3G-6`XB-VOJiVU3v1o z{8KL&HUCDJe@~%LVx0HoFJbKbb)V_^+}RH-76d_15(UbntaU{IDJA}tX-T=|1O0kEFmOHhhyU7=-!zQ>p~3uRWAJ0NeBCq*ZiEImIZMoxg)Fr~E44%0 zB%YNxp@UyLaZ)#QHP1=B)DL}4yGbPt!a&nrQcY`NP1Al-Pa9!F)0Jc)ZHCQsFojb(R3|2m#&7Znyx43(`UkGG~Gy^O`i*&OD}{M^mrk8 zK3xmfOheeh*||uq%T|r>g*Ob|6;fm!@p3;)?ybFX>-Nj9tfhS}lC@4h z9mGZNP9oOs_KNMbH-}qWy=?2}aR&_1+uAPj^-t(1+Lp3!^zuUa?+%JyKZ_IP+#U`R zvDr2i%QEx<7JlV=KkN3kFsi?m4Ti-{QN%nhV!FL0iYV?BQI{q+ZpB-I|D-5HM$5Hr zb>@0M8K&8-SjI^rk{i8D#8NfOU#zp;=dZ`PP>a8K=lxTY7Ju2ldy<7Wc<;q2_9sD} zeZSv{?@VU=EWIN*$5-OcCE67BkoRSzB@zwB?~m^GWx}JpcOWpYsTadD(+^^5nTu{b zOp2%!RFw2%E~HxB?v9rl3oz;xLdHd(c2Xa2>fK=h>2yf4^Orv!{N)A+VT2}>omEU; z<4joGBGp+ny7os#7fQ~Z59^`JUE%Hc+!GbX)b)A5s~_57^&`VJ!Wz_|uIh9= zQ-NpS3#@OdM~?faM;Rf=HDXJf7n zZnda^OKZ^@9(V3r;MTc;&vQ2=puyNe!}aHXzkSR?`%W*;O&>cMvefo)bFO}zUN7m3mn)FyE z>*K*d_IG531~&2Aj=Mv}d{e)Y;m4xM{>5O;QAW%U7}-NPz))~KqVuHxw`G{@fj zgw*?!$GF-I=eRx0c5`Lnc-ywTfLqGZJh=v1K98 zPno;Q+z%_e;$Hc!VRb)FhGP8TRnmy4uOUEQqUUvY8Ion?G59kydzzDA$_u}Qbz}`9B4#m zTe-J25-RVV_pj3tNiu#(8lmcA#p{S9iK4k(zd%!H>+&qRb+mj9oo_J5{+u!Pg=;lF zXKPTVFPOzXXEu9eJFU<3><%WLxF-0QzX|pn{#7UV$AN&t1qQkr10C)H13jRhzeD(k zzH9Re4}g7v#y&dYn(|0yH*eh9RExa~Xq*b(gEnfLpPA$d%_A{UBT41A(am8W+ky5K zK(`UJ56H#t6F%96Gcrf^9LFs@!ZmIjS%(g=j_p)Nm~%w96=>YDH*d>TEUs!ep{$5X zOm0`@Ghphok}O02?9jdTpzsP@j-Je-l@@-rqRap z9fodlku0fQN6Wv3ZqAy`L->;iCfW9(scpBka^jos8S?Kiy=}>#;78K)r6pQ(mbgLO zljfv#_a68;iKt-)FeI4J!eqZ=<0%)f$?wn#-=*$*=#&Lrd78P%@6(hYP)D{Qe@NZz z#WRbzh8q!Deg+*Z!nGffMUWl1cH_~7bCPCs&`w+vE0D`z!*f??yRQT1cQj>YM@%?L2D}7 zQ@T5ab*Z(OzHo98DAKoREq)us$VBX8lzxeHcL!QLLcGJx`>=ahyuD(8U(SLyk2M@( z2jxPbZh0S2#WtbiC;bY%5WS>yKkpDl!C~8*)BNdXLtl!##{u^8RZI%ccZJ~5Sj(dm zj(ARCHF}dK{|+sun8tR4sk?VS_gx!dKw$dheimmEo3|eMZ9F!&=FA9wh8w>@dzxEV zc)~5v>?aH&0F$o#eROkfp+0d7UlA5laO{hU2v!XQ$5kHa02$#iufa*ww3Dbq@{Q8W zZc~J86TNlM$yfOe$mAJ#Ut~l#u0P|Gg?J;XNl&6e}B` z8+K%zZHaJENAYDIp?8%lb6ZjnBj2Jo60CKP$Ifa7UfM|BrCB6a?RftML{9kuffX4Y z%+72As+usr;EQv130YcAr}9+*KnkUMJXf{pzG+mMMBBNJQA;qR&z)RkS~S*R_KzT@ zK!!vM?wDzaax_GF1w+sS#Xd4g6@R^eU>l>6ziJ#-5M(3RA$nvZ-0}97@w_wg@Vqzj zCJx+{zr;J_d(=@-rL06`^6zO#w%ra&RHKc^f1)m=?j>|#U4LU<9!DN7Outi~Pp(l@ znLXv`1DJy%En9-Oe?!wTb>NXyTL86|8B9T~l<7REwHv2Gg0G1N$k^m-)RB+)n&99j zrW~W`FzZyfyhONY>o9>mVh;Pvc3YoHdg9Bm@lUurhmGg{dTiu%U0P^}I#AOV4GO#Y z0&fC07b<3D5rv6W-V#mJscg{%zMcWST+%2(P<_e?7d67RfUxJ4j=OH$d=u&O*WxT; ztj1Xik!Cmx&q`GX!aGzQrlAT|1)Lps#{=IlDhOzTBbr$yOq_&wvzU5jf+SN_Envnu zPCiALLJF$fvUZ~yNtyJIYdo@noAMVl!^_L^lKcl^X-g&yomujKVw6+rHY{VKoyUpk&wIMXGwr$dy~3$^T(qxpQZ{b6uO5mwGu7 zwHeexR4XNKRc6kp1EL_httuT#fZ;Qws#0N(f~Yh``BRGhUJkd=nMYB|q^BSE0D}TY zQkEv}11pjIA?z!U=g0B97qe!#l3}*`@rJvLo@#t zcQyLD2qcmD9-`KV6&5w(y8EX5&fXQwwYiVEKLqVPa2^`K-z$YX@_D83fW`p|ApQv9 zhH2nkR4@xRf8I93rW9#^AD@XzZm)xk_+X;&>!kr41ncBS*YBW|cCc8|X`JuEooZHX z&T`ffx?aGy>iY6sqU8C5?~BW?T2=Dki4HmN4Tb7 zyhi2P--a(x{c^oD(n%f6lDgnr2OKnyh8DuC^|=+Gk(- Ef8hPLpa1{> literal 0 HcmV?d00001 diff --git a/vllm/model_executor/models/__pycache__/gpt_bigcode.cpython-310.pyc b/vllm/model_executor/models/__pycache__/gpt_bigcode.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..edc062cef64df4f488ee68b8f668406f486aa210 GIT binary patch literal 7284 zcmbtZ&2!wwl}9%|27|#2heJ{#WlN#I5>H~2aZ<@O=Cz5 
z0nDg@p)A%ghtO5_AwHy1TYJets>((GLn`|R>}gM%!YNg|l`3D8%EhM4{9a?eNYU~k zVD+2E!|O)l^?Sef`mtZHI~uNw|M|b$t6$T!|D&7v%fQX^XxRqSG@*H#V1mai;T}(P zPfrZbV8qj7(=+jF#Aaf7mg1RlC9yqQ^{u#?IG&^WmAIC;o~!zHTu&NaL-niiLelh_ z$)dNIw7gcbt*L%JzK}fRJ*4`L_~GQDcQLu- zT~hZ8@mG>{Z=GqO5t_Y6sCD_8<~{m~CYoaL15GT3R`0Px-FqCImRJI3Y0P;7oMo{B z&dQkcRdCLU^WdBxbG`=7s#pVOZOnO6=x=N73;Qp^BB^ z)K7!BFs}^hs zP`r@mVM>#>jbiz7KOQFOn}H1CIE=4EX&A_&S^i?4oxXT6$iiart=Hc_st zyrJ@TkYJS(-E;5sJHhpd#1E6}p%D01Jb9cJg*9Y-=_|eXgYo!#Jxf?&rDuyuSQU2Yh$`K6ZQ+R82ZmSs zP%|{og+8|9Z6D0W@t4Zplkn&@*V+YaotkGm9DA zIBs#y3}zj>++{WgkJ;ectigzHfR=yur@T73w~s7~`UZ3F?LT~m4{S^~tUlSWJjlbW zu<^m2ptBPejfk*}d_#d*VaYHb%CtivYI2dn$xAMyW!FJQ+HLs99ri9iFb>R-(POYT zvQ=gKJ+f$HTc@D*F@IM-unsB*_J~8edH^fGZH}x_Wn_=4BWI-Fu?}kBS)-bouZH<* zuvdNSlVe=K6&-utuE$(1&{`G z?_h`c=iG2Vvny(+jvL%y`wyM97vu4F+1A}Uxl$S=AzaDv2~vkK^<5wK9>nDnY0&}* znPiO!8BSrsusS=1bxq-VQF;BH%e0NMLK5F4iTc=nJ7FCA{@ho5l|<-M5Y5*DFZ#O^LyOjM~2z@Oi6bjc;X^+pey?k$Q+v2CTK2z=b;5ew-lnA z!UCdKfaP|N&>XaDh^laame+H|+5mYTbI<;|^PF-j3#l2AmjSp_vSZ$4q4q$p~n ztLz%c9E$TEY%~Xgj+g*e!y*Y2jochDpz4fc<-l2?3++%pFmqTd5F0Xw3a1r<8e7*S z?R(*3kJ8*Pqf5IcFM_Fdr?g8-{R*SwcRLE4lwlSX1Y;PNl%zOs3AsYFhZLos`rToQ zn|@m8V79H&fM1{~r_fc-`y%nwCQgk~y)_3vBNuQiYYG#b*+<-(g2s_+!BClVyx*EC zcnS`syF85f=DeOAz}s&#!hi$#{%u4_<-NpbZ)@@&Al}v`4V6D6 z@>t23c_%^OoN+3Vj(T1HB0FbCiVXIh-CcY<(xDP-@C(a0BEKMSu#CphrBz819kde{t(Gpi0)lL0VdniW zB6RF-&35v>Luf-Lo777Is3Uz@gE<>LN<>)CB7lWCS75Mg z#1E-oz3-`$*U0?woQ&C3#te9+SKnox9uBg0{ge@Vl@y^q4W%?8RaCD8w?etqmr2oz zsq3fc`xqI;uv{Zf{bv+TC@5C<-%`3N^dug%&C;k(+Ot*^ zc@6whN?EzGcPJm0Z$YhA`5fTIw_x!nBTw6qSMa7XKwYKY76^jb%nzoE^=uhtkpTS5 z??A}A&<#W3k1gIm5x(b)cj`C4#TXTq49l6O3)Vf`GPnazvMDT`KdrZUf0Ey$^_}Gw zui(M2Y5H}LQ{iIazHmX(_qhmRB3;=#AamiOJ_#2ONdD;vV29PvS<`R?s0l|M18@is zE`o=vf=3ioWkrgP%`#Cx@$NPwKZ~hT78Cj?79zBW|W^SQJnYF2$ z%+9N~9dMAj^vpZtb}l&e548_jms8&Jft@$-eBqG2q#eRmAkoaLy+x7-za@0#xJ%>6 z(xGg8auEpA@|Mstiq+d#)O(c!mTMnrAF)IJD}I9mWNEM3Op?Npy7P^1N&?lpd>hEm+rp*EgXxo`FyI+Nk@ReIs_9Ac1T zBc=)AcVr)#GXh#ABdZGNg?F4b#fk>5T+X-Mji}7*eqF z7D*{Lq6F*rXbS!bcF1j-yv3UMbv|D;=7Hy=o(g~&rO}bd3U|74x~ony?M&sOB_z=Y zPcCu|x>u0&6Lh9cym<2_{yj<3CfnF1WEo25pu>tkHb{{E8Wt|+fJz@j~M?(liOQJkE zkbI5^;r*8c{Wl=;J2aI-okF)ILN`N&e}2SG{+VI5K9w}$ixKWOcshr0tA9VjiMlF- zG{QQt&j=foREq`C1o|yhS#=Rblr_-`n<%dtVG}61jQ&d5N4bQ`YNofSaIyuQyilyn z-2LUgyc`U(Aii?-L1;-xsnC+NIzvmuN~(|${-HuLO+jF|J+o&KUiv^vBr47!N!AGM zCdu0@Yo6r=q}igD!>Kdue}eRbjJ2@JYE#jO%UZ#>DkU3uDBmWDN>&!cL{MuqJ{+ew6(wT~wqMsx$D5?`X&t|@GzZ+W+rKa4N^2gGRDVG=-cj%OCPQIPV z0M93>zVfsXjJerW7BLIOQ~&uh#1wKn6-tN~p`RqoF{$d&6i z$LpV~TFjZ^Mk$U`4hktnE1_Uso|jR}L}_=YsCMYsiL*IARKF5YC-<6E3=pbxiyBQRGf8QIe2EC9#}mMgh|wdmOC$yUi$lieUa8qW-GTr3O&OiThi?p9`k;Nr~i+p4gjYR-@PC4 zolkjflI2>4bHrkhhdz1=^8>PWd3W*yhOLQuzc~6`rSmr((v+6=wZg~7bveR6xN=tg=0itCY zU6M1E6)D0~RpU5&+2hqrpqbFx`}PCoFVn*5KPjeppQNnOd*-Z>Q{iDA#o5O6Hls49 zszINvsQVC6k{NmHXT4+Wm+B45<&+0{btRjHC{W`+QKbc{8UuyM!a^2}1Z{l;07=7xuH*c#L|EP6MjsVk|8q_oQe=bzzsSGS^Uj-UJ7^souKWdZg7p H_vGII{kp*) literal 0 HcmV?d00001 diff --git a/vllm/model_executor/models/__pycache__/gpt_j.cpython-310.pyc b/vllm/model_executor/models/__pycache__/gpt_j.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6bbb82815d7ee5378bd0322d48d7b38aeac57e00 GIT binary patch literal 7529 zcmbtZOK%)kcCL3-S64TiO^OfsMcJ~)ZI4KMCX0#V@yu8r+17)WEsZg#v8P<>TTOPe zs=9frTC#{D5QGA31o0xP*<>MQCj|Ke0fOwa3$h4sR{>@=$|6~06L_RizH_U(*`z6Z zf)r7w?>+Zf=bp!R4~LD0tKfO&H~+SE?qx;!EmdY81C^iR&E8TKg(<$mRHnshqWM~) z`+8#dhDtm=HhmMnMrGS> zSzwFcELNPC!8yg2z*(v|9dMS}3OFkj=M`{P*=cZ2SDaUweog5-wfB?v(_X}*v>Tlp zrt$5ycduSQcX2HlvM65b4wF%s_ix0}+O2-Rz4qRCYpb7bT?o5iME$MpJX?Q2N7K=T 
zak-!6!uep7_lId13-kJT6i1sKRcL7n>SE%H=Z9&pzlB=k{d6?WuS9vs!aStrttbz| zZXWcgbMtDr6|tY?d6d$09YZXhAI9S(y&Cc`j-&W;KaD~zTIDb1*&edD!z>c>A71+S zSf}}qhPRHgu!2MD@!=>!*}fJg*ema7V(H^yH@q?F{qy8T#2CJqHdHNq<-I6m7<85< z!2Yr!52Un$QPm~5HRLf1vi=@T-jb7Jj_Jp0;uMQ|;W*BNQiwr33|Yj*>UOW1suFI{ z&m$h@LymbG)h0g}=a5{Nq(1vdlL6zuKLUvqUxi_4HI+4(7U@hUwOUnr#urKt2Eoib z4c}r`WDgu>OX@^5R*PKbQmyB*I`i%re*Fu@P<#)H(i9Ck!0C}}3acv*Q#4|E7C|Bp zp{vjA%0`)9;${B{k}C(wUG<)#Dup`G3MJ6-Yv30z{SNfg3TkFJ)4oDE zuyaSYU{uAW(Y4$?Q1*4|Tj)y)(|5G&M;Dd9P)m-SvyQUKj0I)i$h|^kCbJIo?2_!I z4jRm^#$6|l?BTH5fxd6bG6aIMl{aY??4VIt1^E_F?rJV?fiG7mj05Z=m}d%v*4VrN zoic9p!;GKC=W{qLWeYE!fBz@n;=?@Tx353YP!}#~V~}sdCi<=Kfw7!~p9iC1)+ZAP zqKPhMePs<6XV3r|9^iZ^W?Wcq8uL*cjc0gGhh4{`zYh5NC956n1yuVX}>z zahzr9TaS&hIsQT7GER zZPim4-6{PfkR5)AR1LOf)8vhvcG6I1qI=>~2S**(di& zCMW%;uy~Y@dD=Cx!^uPE;ov=wH@g8+D0h`YyQhAl?VI~h-+@Yjlj&b+pXmGczO!FL z53NuOyKwI5=mlOuzwSO%`>sy>qE@&?y)f=s`yO~k;mP?t%tv0r*nB{?{@^hlXuj3m z9;Z8*(4nFolb=PCFeQ&e!4;0^uJ9K@It|{T@;Z?XA|!DBI*6}B!-N_Lbyui6LcQtR zJEatSdm@fH=}iFDgHZVQX{03QP!e`;$Zv(5{R)+g)UIhS;$$3k5vORVd#8?`MAiGg zx_Q4rmXd}^6a>Nz$TN>)Dtke&I}YQrga5<=jqru z`9l6<>d~mQwHw885X|iP73xA8=PMv2kBr3TD4Jm{0 z_Itx>e=SFJ(fP`2!s80d|LuiOB1P4n72lzz<2OD9*8Pu3Vv}dpya}f?) z84l_=XiwNAw=0*gZi@MSiZGQ#tPd?MUBlFLy9CxI4~dF2l%$ni1(^v4+CYP2)(Zpr z2Z#R^IZ-2rs|9CLw{jb0jVaPCJB2o=73d)yt}7j`4!j{pQc&}B7cIp;NllTF~b}3X$9+D%y z=kaE=qO@>TTBN#<@OoD*P}@fsB)gvS)oTiW3*9?9zl0xtg~;WSAbc$%tPn$X;2zm@TnlY9K9G{et`ybU6B=*dau!fF5f9U^2KoZRfp!iN^|E*f_5 zW{V)O2+Md#7NNloETj4G*(FXrx_FN~bkOwhWe@L-;qpaf{YN>Ch(91O9{31GVvKmpl9Nv!j=8z|%9aAw#Q4TP&A44yVI1OHxKA`lg zBnzP@@u*{z$~rYlzb2<9;BbdB)CO+kaRiVe% zF?0FZzd)(uJUN^=Jj(hE{+qvvp1unKog6u&GX4gAvLz71cB^Pl=4w53Esvtd{GnWC zm7o6vLr&D;576+JH0fE8V_v;}qF1L8Pt<9toikqj{{Z(RcU1x8uOEZ_p7ikzh`X|s zD|8inlBJM^OkFyGj6+NLl90UiC1G;u%2C4M4h(4xu#CI4FO)CTo<<<#j+47+uOF!I zC3J9+*T8?wc{LvvmJ`>mO)dT4YJtdr5ja63EmH*58yc zGkXh#$q@b_Ez&wP=W-1)WxpnYytKU*>cYKAH%=Kgg?LKKvRDC#&z>*N<6vY1Y4_UJ^?@kJ8`cmGW^-3-AZ96o1wq3}Z#L9*z7yZouZ>ObqYdeuI6O!`hiNP{Ar6!9l8PJ0aLf( z0!%mo)Nq3`_RZZAUYazrFed@a*33K2V=?YK!W(H3|2dH<5`1#Bqqy^T zw67C9MQh?ItXcd=BcALj{syfxo?>N!|Gqmh@FY*c8Zvdzj2iGEM${w>$>vxKNO7*F z*5;9ZSVi8d#TL-og11>j`BYi%Ic$k7-!c7p>2KQbH>ZUJ5*LR2d^pa+`0|xM%Ih2h zlH@xOEE?Cf9Z0`ZMds`96;DBAxSC9{|=sgb(_lc143p*|^)qGCYvNG?)1M2V!k)x3DU%{H1 ztOdpNDzn8c@=?p&TOoHeBd@99^3O0*)Fzk7j2{}qHzG7e=V^o{WL27M8I$k96c5c$ z zw|2|)(VwC+%vp5Oc3aNrQFD z&F!^hF$uGshy{{k$_CLmlA!H4!eISw?ziaf&t$?EDVT_7{1j#sHjhRza^u2@QkGH1 zoaf+|7fP}$#0z=9N{KrpG9t1I;xA=6l2VgdvmgmaBg#D2DXpAj(35U13|iQ4$b2!Y zvd6-eJ*0B^^=iYipObAP@qy?qOxdN}%~9(Vp47c6;ru_?N%kioj_EmYGOG8urD?e3 z5njMW0A&Ypk)Ur|FUw@kuBQFD@gI9nA2;dQsjD03CQmT^W@q95A2|u|al+<21`&FO zLWD#3*PJ^$T9Fe|ILubDH5^3&7IwaogS z{GrwUCjoJervBl#OkINJ>3>`fmuS(>DC+vJpEP_rVx0HmY<*gV@xhMUH10(Cej^-l z(msFfq%|rNlq+yDd+8nghU}X~yJPxiTdBD3P%8mKVF6tN@R#>@y1L7YibNDry=L@T zYBuS!aNdNzU=;l$RLO{M%rB~r=D|WeNmo?Q@YEL7kdVf!we|Lo+t0PnEWPxbb)V_^+}YX1VgV8aWlGSO<+UV`iWDbRQ7N%#>%$@;+N2d_O3q}sGXNHt z*jRZaUJD$GA73V(r=zoTnftOZ)E$0F4;5s1tPjLZy7 zop?rS1s2|BYGrm{GtWw$%ne-5+i5NH0*~`fTF?Bz=e(OXvS!fad@WtbT0tvY3>LF? 
z(9V{ECFXnSa&{&-!})r8Hd_f+IPa(DvejTUJ0G0qawB~v`%ds3&NtHw*|Wj3oL@+v z%hrOm?D^n%F1OMbvKNCFbuBRyYv(0On_^W9UVdMT7vuIFEp8|F&UcTD;1zI|;$?7_ z$DHqhb0$6u&e<`i1I|i(4xDpi&iBDtjn9K~e$06_Hm+-(XAa)|u)mqeq~A?mAN14P zYwusX@%lzG_;f8B#7Vl=9c06(=-o_{wOhSnd+meK)>f~-^6KnTiKJ>(Z;Z1&h~JI!L@oaG zqfZ`bwD_~Zt&=Rw;L?0@Fienlu16V`D?37+{bbOMZcbYNIJ=p|F+Q3WG@khC2T2s8 z(@Qh})|U@t$Qu?8$4$ap1DVEQ-aDYdTRb?%n0~BKXX0ct8l^>8?PHh@qBxOib$fF> z)OdfxUXjSC7)XrM7%%eUQGuQ7(#Fp}3>+|i_cM@03v|Fk)bzL!i^PZx60TKk)BHl) z1T13fP9v~mJ8^d0*kS4>wYZjeu}7s%FRsV_9W$tZp_y9XLtL7wH9o`Xsca~_%jc;Y zDIZ6e@i}yjxrMpNJ4nRKzXmC^Bkit!Pt&zh-w~x28hD#{Bhh=uIIp2(6?Ul;ypIf& zoWecQ4w2h2O5>~+8@nQZ+i^+}n_r<6)(fBOc&0I*=4_y) z6epcy}c3l68MqMo0 z;MJIK^7@xz4SVQs1kGq7)8Wltl;4n0WE$koq>?<3!WI&q|N7Sl0x#u}jE2QFkkV_t z0>(-feI5=6d5?e*CKCzGd)gX95gm$1-VWnl7WngRF$VzBSyom)8baFKFzoevMHs53 z{wND)3)_Z(=fOgOang{gn4#+1y*N(#EZ0Gec$`%ZfPl_YK7;kjHZ8g~?InFGFYJvH zENM5n9qvad@(toz7!8Oudi^4pta{ijWRof<41A!h5yQkB{f?{5JV`g@Dp%-gf1~}3Jb@I?L5PB~o%LQZ?rbe8ot4t1FhRX;fQ*rAL&+}AhmHwbP0C`*7gFC-;DN-6J$;oc}ps~Tz3 zjP(7oPJ{$VS&**owz6-qrBTjDpIoLBl%!MT52;0CEX3U;O~Y_**{@R*TDv?4vW}F$ z1mbGKGQSZ*d}|xcZ^UaRauCXIgee{hOZ?un+h22!E*hVBXnpkLKLDAtG49iBOzhmz z0*CF0ixaLXZ@f`guUy-h;rN8nAU}^pk@h0L0Wvch1q2L6fE2li^{>b{u&F?q_!Axs z2J#}-prVbUhB~k0UaXGmtd0%%C#*x1k3;AeVP(xazI+~2^7czItCDstScIBNWR}D| zV5!o44OU>`c32F;x3(*#Hs-y>%x(EH^(Pbs-WZGUag+hlF)NXsYBEWBLh_+arH&WT z2R|o`(@2;E}C}?c@;1DF_Ddm!LVb*{TXpjf`$e3m*D5L1x?+npwgaO=olo` z_%-<>8sx`B-Xn4fB6v*s6RMp}HWT%aP>~|#Es&F<-ddJa+eJF@&>7PsS3uzC_~YeW ziEbzINg_)7NMcIw{1cFwh(g%NuMvg8LP3Y0Sh@of+lH%-@tr-ix6|BwYgh}Y+O&qZPZ?P{yl*@zyr;o z%Rj~#CrlGL1SA?P$OI1-WSAznZQ2fmxU_(F6Ps?ruxL9%zK9mk4rt*ZXO<3fF3gd= z0+Y6)LHNaWt+Y9h9%yTy*0yTb5gS17DYgbS>gQOE{7WL_h)a?T`6)=J_GAm?vH9U+ zk$;Z1K@D3B=L{w~@F;t#MPeZQ3!3L-C_Fe1kE^WD67i!a39n?Mv-T{Ydz5?y*85^+EvvJ-1g`SlPI339%lbS2hB9kfTNA=PwvP`w6HQAyG6mWC z+wA4$?;tbjD}PF}EzERGHCc2$|3NN&Xepu__@qt8!jQx_^z7lNaMz@Q9&AOx^yr zWxB%q)-_GtKc1=+KKtpHs#>M&;0!ZfRc+z|=j^P#rFU8#Uq}xPkBn1vg@!TQm|MHAd3FICk8O7f{!%>9rQZ zC+FhDWC0O4BU!*0OUN%*`AwH2Z&uJ^1+)kSw5leK&3glRIU40rdgbZ{dzdG@TLT|4 zN%^}d%rp=z7c>w=31N`}9_`pIs61Q;j)coPsS0&CN34BF(^T%$yyH4^D``g&?i$}r zsN|!y~n$eV*u!t~~R;$Tg*A^m=tMkf8v%Z5V&e-KA`4_A>A zq1TBoy^9ud2SnLbyl-+P;1y@62jySV*50()gwC-p7b@n`sSJD|^)ga_tN zp=ygD3ZfONosl`H?xToaBFek1YF)ZkB5EX6ox+hJK0J#M)0N47FGr~9gag*Vqy&Hd z(*HoFpl?M~k0NJuMXQeL$b_E!6cGUX(24cZ-q-RAu^7LhIrkBX*=s;=k`8ralQ}_Q zlx}SC)s13k=8kpfm91W)_eW^a8o9GRj9(N{5A^3$sb^o4DJhjH`}AW zl1NUZ08w=cRcARChRdi)a|R8LLC52Ww6eH`98s6=f&@z^>ceWDTU31#TFPhX83(W( zd~E)wXhp*5!cMmY>^I_Bh=vKexVlgwCWlbOF@)EEqk8NKTaYOG!u;*QbEjZ(eugoa zHslbqY;@X>8Z(2ISSLctOFrF>kz*zaxN-oZj2uzTzop_6ubbPb{coiFJ&?*j=BUcT z>s};tXD(Amw_6^GT2r(m5aWWBVPvd|JcGQ(H%rqPLH~fCl|4gz?hT z+D@*I9IXwWe@fHp&w?tn!YAX?=a5PXUVTcp8O*BxWH?TI`iG9!KQUzsH1wNK8M^Xs zr~luuNuX$qoSAQ>(gRR@m3CnpT573nGs2n7p$E&$5>JD3=C{m z2|~RkQel#KUcM! 
kl!v*vOgM#+_nDs4ebcA7w4f_Il4u+4ciI=)m(F(n0M9ou9RL6T literal 0 HcmV?d00001 diff --git a/vllm/model_executor/models/__pycache__/internlm2.cpython-310.pyc b/vllm/model_executor/models/__pycache__/internlm2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..01d0ff640c9e60e11ff6bee6cd364b3be56a9342 GIT binary patch literal 8707 zcmb_h&5zthcIQ`f+4w z4B6dNWY0(wcCy$JU?+$U0RrR@1dxsja>_l(EyzC*1gTsC>uU|bDtBO_c{obp`UbX5dxX%66zi!@tRZ;$vD$|#N%5R}0>#CwKg{hGetDYKb zo)+t#9vhw!m%Nfn_q53LO#JJS8C#wuWkzJjWv?vxQsl%HuOfLfa^tF3mAn zVGY*2t*~ZjZMP0}@4Jx9vjs>N3dsdX7TFRcONHcRNKUcSken_guRwB!Ekm+gNLHD4 zU1>e_fL6Ly68c-+Tf%s!(@uqPrIVz>zSd7Wy>1YR(kFv{6t1^ap>?~U^^bU%@}ScV z+2vk$qq7NF_5E&tkX{YbfCXs)fpfhR4c_XqtAhx&+DE}=$lglRuuGjnVtst|`g)JY zEko3=psRr6;jJEfJ4iyYd^O}~f4LV8;_gQQ527fH@>Zhp;}1VQDf+m#e_VtC?9DLs zd9NSRVAq2fufSWHSo*Zr4sMKEzZ>5O8S8X6X@8nyylDENxil;Kg)5x}@$4@YL-ESkc~`jacQJbR%GHZkuY9yFDq9`K!mfNR(df{a-3Ry#U>F*Oa`u-hIt8Zghf# zKaHAaBv8PE)4$zK2K|1Iry*Ox%sYDlDc}!Q4$iNxTkv2V8TceT{ns^uJ&*7=P<|$NP^oomQ<)dzd-jTd7187*pK~yvtLCeu|aBh zhni}r2Mfn$Tr}gaph?T2J?jQ>==;L)$>;_V@viUh4uYt-qhqOro}`?Uw(=?wS}+bb zq=;&%KP1hc4hW5xTXin0`>lR;@oE{gJvW%H2Wi zZvh??;S#&ESJWH=T0h+a{tH*~2{f2+^;8_(^!q(1FU-ylM|hhXDSrm<-KISqU1cy- zCoM}>;qND?0nAW*E!7W|p$(i+d|<}5k=aX#2zIpO)CbB>(Sjt)gblz8P!kd}vv*2- zE$52XtJJz%&TNtvvhR~DwSYhJnPugWMzLw!ZA12123JBUFVF}MIC?&>RfuEyJl%0K z!n10I(fwLFCwri0X~rfqGK-b&(7k%vD5TVfTHK*$o9R6DCyR;RO3b{2k^O~q5i&Uk z{JSVNvm|;hNi4M2ITlTMKx7jI*MI!ud;C29TZV^iN&|k&XPww{r#D6zH{f^qZ%{Lp z|2qAh;?LqQHwOt^i>P{YDSC+fu^#XrkhD3Gj#WZO|FS1xlg7}P!raYWiG^0=Jw2;X z6Ml)tdXvZ;h%okdclJie+T&y=Vl;W^5GQ9Sj379;1bwM3lj`?ix^0E3@zEcmlGF{+%II z`fiEjSvhmEN@mlOB3^~I-|xw2NtpTn z6i7>aCLBLNNoa{`)(W=ORu^*bu{>i-3LgT;^nEeAeE5L|I`P@LT5#F2;+{BnCsn4icvQdl2;dYQJ96V6F5!`~oomT&nUZjf2XfRB7j z?yrrXO?wy(9;Rk$4{`S^)7E~bm23Z_T{7T+jS`HCgh7Np-L3L^$R*0iQo@&EoJ0 z@;Xz%8&(Y;+~p zGIOi;=+y~zPa^RQ?EP~zOUR_0k`0U{lm)^bY1#y3CDMMSl}8f$k!c#LeYB(uJImCt||Ve>HQFoc0^(x=fyJeA8R$!8#Ny2?P|ZUz(`A?i@;LOT%jYFX)?29%Wd zfUgLANspL%#~4C45dI!(aaWgtv6ODm!CPC$F3*@8q&F}&_lc0X@;?L_E$fU2IHN`r z02$-n-$I_y7T8!eS2eJl_7RXlx-CsLAJyTwZ1v#m%mo~6zKo`8CjpSW&B>UVyeI!1 zB6Qlz5sudL7r@-_p~;`2Bs6irT*-J$0B1`0sy))4J_%+v;I3vhzvA>Htp%49l)3qRsdxLVgNVh9s{tNsEyX`;#JBHtn-P^%dnO(YIG4#M-2K;wm>Gh z^(6(2+LzKd`v(3g%qyiPaVSsHV!7%9rY)_7~bN)KlscB0*YWBu%u{ zSEW0?uT8z=IDit)r3e|Lb}#O~`7hKjX@{w3ntfy*9b7oROk>Mjy(CY$$(vwCPxIfW z@fuSAM{DZ2eZ-GxZwJYV4)ZFgQOXAVP2ucOIwue7=$-O&N)3&t&l(o>;$WL_2$adK z5)ZnYq1TXcY@YUr$p0k5%>Oxo9uGi-L8y0MKZ$D+$SB0lPlC^Z`)8EoG>B6&VOci8 z7=GqAb;H)2ZU5}*dVD@Vgi}M{XPu#Q6P=vsWP}==o_SaTgds%St)UNcf~Ewu5`w}Kv_Kkv ziEm>Jz5^okDD3ixsxe5*$rq(Z0troh;;s<+fCxd8P&=X|7gegzETsB(G)tC290bhG zB0~X$l1SxzU5E5w>BP$YU+~GGxlKTdg2bO6hK0EMPbkStAT#i3euPgZ4*cU_i=Jwu z9J|KYbQj_&Y&!k_gH5bTScPMMl~RFW4S|=*=2)E$`en7;K(g=*(#Ul-kGpl?+XDE- zoZqn75<7Lf_(d2B(eEJ`v9#uvcz4iOnUeL`H5?psNo{N;2q znsxU3Z^#p}cQRWOuCk8bB(9i#`7wI&89IN=NRLGmG`HMH^C;n-frw&kU@9Nr{>a78 z=r}i~87}^aW=?2Ji9G|(ed5fI&3a@kWR#*I*Mk6be3+!|8eL=@P#PPi_Z7ZjMHKZGtX%wfFVoC=e!Tjt8~Mg$fOf~CnwBbnEB>7@KN$b+*0u( zlLPaV?ycsnaq>sLQ#9qq=UG1F==(vsCCY6&ULo1QMTL^?6vV~>zGm=nuah7Jb!?Mp zUugN{yZ!`oAm_RXL`-SBkgGmafu{)h^KCmcfTw>4#KaMq0YmqY0Nib4B}iYwVY)1j z%nsx#e{uJ0W+CWvvhqE&2X8Rrp32|AaoWWFH*nvSnCxcmc4=76s+kLPt{mzrGUU7e zFn;1%@x(QZR6K!PIF0@Hz;$z2%W5#6+BULY!#VtysSduL*>@d$2e7iaQ%Z)~c7@@~ z094OT-R&yInqxS@AsJw^@*N%XZw#Bm`K)ob#>}iywAO}=tSKeUv80*JOUe9LGC%bW z4d&20$oFY7`96}LXD-Qi8_0*@G?>Xa8|@;eV~1K|KCZ!Rnz_M@O33OnqwWWSSc@js!iWWe`1aeqLBa7P$>op2wykF5Y7 z4rECD5yT$4$VGIp;HN-Z3%Q;A5eYIPLn40+;w>d9^8BNjBz_$9`;<&Y=)(h?!)UUI zOE6ilDnBi;;zN^gWD98%UZr?R+0MrBoFQfsCD~;11|G#4o4hyZC;V+1UEAM9*+JPm 
zX|Arcsp8|+JQ*#$52YrI$7H%Vza7~)V$@{tK$9Q1yyu_Pc9epU$=WA2mAzAxEu(Jl{1bZ3zdPm$Hp*Sebg@hn zdl#PY*aR~2&%H9EJbO)wR~-cn6he5fKcVaV05bkGJpRJL<#B{OjQ85C$Hz|$(oU4D zj;kKH#

X3Rh-iQ$q~@L1l7Xn>P=umew06}+8UX&n^X0)>KYVUp{^0FYfZiRbaTCV IY3aiM0t9sg!2kdN literal 0 HcmV?d00001 diff --git a/vllm/model_executor/models/__pycache__/llama.cpython-310.pyc b/vllm/model_executor/models/__pycache__/llama.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf43fc9e305b65a985361d5810a944d68ec6e2dd GIT binary patch literal 9430 zcmb7K+jAUOTJL-J^jsv3WJz}7WE|%=Hs8$BuK8 z*!0ZU@+_6g^vL#X{2GxRJD$TlGb+Wd=W^PL%5lZ3aN3Tlam}l7+KHy(X>Xd-rD!Iu zdv#8`(QG{D&2hRM9f{|?`FO!wh!?%Z_^5Z3`IYEc{D}7mr>oJU@p11sr)$w;@#Eg( zal>oGC%hBPn~I)@PkJXgJsmw6Kjl5e>6z&1_!;k+_*w5+&ex-6{G9iks)SZ(Z#_@% zx^`6YUU*vxW`nt#N-!5XTQBbE-V!)Rf_ZS}3(hh)3&A2div{NtI7fqH;2bMBFM;z& z@F+Ns7Mzy@{i4!5zH|EBWIYsN(hgtjCee+?%1UeXOe5|FVbo}M<6bN4T#mxVwNAF# zc&ERy(MdMWx7uKYosG>bUAjlx*EFSmCb=PvbDef3jg?NCN#{Z@>vWS=B+ZZdy(nC3 zs!~f5(DF*uid$#9$$Do4S$C!T;h9mc`fk$eXRBe>3R+o-z6Y3ot`@_|XYh4jH4LQBiNy3&u4&x`K&ist^cu80CZ z?d;IN(>yTxAFYfgj|Aa*s~=^4z88MfZ3Ur_M>p4tu8KYKJ6R}NSy!N+YViRV`WcqF zO`A9Ql#5LICK93IsW^jLNe#+@7V3eH*9eWk49(CwssvV`uDhQr>uO*J&dn-LVkvNM zDxUMXVklk-@>P)zZ}#fS2Wzst*$IL$;ZKq?odmOr!=Tg3!kpz69xtAi~jd)VaLeyMZ&4yt4GtspS;n@?x*mzJimM zE?%BZqChtL~~uVI!HZg0Ei02d*`B z@eJAsYABusX&T~r{7Ch>XBR6fRbMPoIg@27wXq%7S|V6MCUrn+n14+*)ScO)5 z6pEWJZCBEYL*JLKPiL(kQM%&$+x=Ell;{v}si&C`B&ecBgceG?0wSx$X0*d7!mf{} z^a53(c?nv~-^9zS>L{9Re64BPx0S`*eQEuI3b!B+d3<@(iPK&a}3p+ybd(+srwR=hfXN-kcmXdz# zZvsA3S)t^Wt3}QwAojCO=m%NhbP5qB++B!U*Zp1>(u?EgheJHh536_*iy&)h)FF)a zuXPd}CvgJIHX9^J&_kz*B!D616ikpYqySJ9U(578W!D9WC_VsX%NV%xFhj0r>B4); zFHnOx#|St8CZHBL*1)}Disd|2)Lx?6Non8`x8VJlc$p1w;x@;WJsQQKakmWavxJn1 zw=_p1P+Mn|D^n`aXQ`wzAh4?rFuGsMrnm=cOEb0x&cF_g+f+84%@kbfQ7uZSZ9SW% z{`j@fn;Dq5F|t3G9RZK$fZqyUXHW_(E}Ji)QefGXdqg(y;`!Hq6esc1G(2ox))F`T zpc8wQ$?XEHLR*VBsG25T$B!Td)LalxQJRkpgW3YJ-c*KeB7YQx@>EbKptBUqND;ZaIub*Q&9Xr)v- zSeACHy&1|{hio;{(QK^k)6Rr^LY}`i4%H{%!43 zeb?S~b|HFq42YyXaPI180p5Uq-CYR!9h3M2IzHurao5^~Al|VC74Ex=zN-KkeeK>u zJTQ55?0VA{?_ds4x4o#7iStN{w~4$*A3m~2jaS$&fCDm=I zUXkim&%Tl`gl88k;+07Z{4N_;(#n4)z@2fO@c%FIrnE#gyMax0)VW;a%pcf}Lhm#! 
z@jIxzCSD_Yeyv#-zk!q>^&ozW2&rFbuEPY(a(zP(CVotW6fCPBoU(2=V!@Io^muzy zS|73LDedqJkw1v1uTq=lfL^w0=R z=p+NisKq+T3@qp*3tG+Ia-i3UmO_}O&|qcMMxN@2WlGUY+HSoPRD$YF!>h2KA`zOB zxw&#KY{MNAE3F%$Sd&(p9T&E$at%UDJ$!5=2oQ*u{sYLQ1|e}L9Rd>+!oH{P>YUaC z1!;rRFs_j{Ic+iDg3(F`McvA|Hl_A-=ojGfAny&|2aXlEw{dWME3=Ch&>lc>Ehw}3 z>t?mVz8Lv&%;u9m>WfC;V*%)1)!m&<_MFvCdz1NFAC3qgW0 zzOTj|oy~OSu4_0-+8Q?ckja5d$6fpZk$^}G1aQa8d`JTkQl&90jS%&pz)zn5aY|Sv z)xcUg4}mefqMB+wFdX8znxpO<{q73xGw^E%(Ulb0K{NjMi02XceIg8;w4nb7deTz= z9p3Z>5TK`Nd_$ONF%s4uXeSTiq5}l2+4ZjkwWP(`_t3EPq_lWrY=p~x03CsQ69h@+S@Dn)L@xv5Um!6C##>~xYPZ-DfLX1J`GF22Q{S?BbL7D%0Lxti<@Uf> zpt?4vVH{^|HfeZYj6aJ?)p;HiVz0 zwW;#{H8rU%onkZM3vIl1xj>UFOZT-1(KCv?y?&qiCDF4KO~dhjxbw_-o8kz;SD6I_=^c=N*R8Mg2$1-5eAxI|Vu z6{XUhVAp76#(6P^-gC#T!fTZek_ys>o(oBVgUmreQ?!ze(3@e~J&y)4xTey|e;J>} zWwet9VR6$uh`8iccrH(uXOP1!nFhJr{4`Ski8p-~#I-CKSDI!$bO5&i13)hFY$pQ_ zzL_>1&HZ-X*qJ?KWo0Q>44oil?2@IS<(slexX<3DgGirC0>mae<0j;77r44>Wh!D) zknlZ~T+3KP+z>^4Z#%~wnigAp{P)CD4J(h`D7wIfYlahAoy02N_0B|V0^F8qah5z+yW zGN8I&PlLNtpIEg5LBG(})}Xf!n5PE-v_zAkjS^!NC9s%)(pM<^y*B7NO8*0IN}7OB z6kGaWh@%#wBbr(JT3Z}qY0fEZOltf6IQq{h9pdN)eEVMkd;)y~Q1UmLr-NhuO7#pv z&%go43`!fSXO-NN8$1Erv=Jw9I9^i4uL~@#1#W@3-xDhdr%-DyoIVgQnF$s+nz9(y zQR0NNuu&}z>l@x2#(M-KoZh$}Wff6uc*Er4O6yATo4dTS#PKpNvWqw*Jo9(3I2>bxhUPRmO?l9bP8-Xl zw4*$9<4xzsXN;cSOzsg)L}$<`3-TXPb0QNOMbJ^tP3%ccLTL*xYm)0oj5P^`I()J^ z@>nlhDX>vUE0{2QDd=HCznmB#J`HW6Z#h3wH1Kjgf*LN@($g2RasmqnhO)+{b2Yy2KWS(mO2f% zG(d+CsTtNchVFbdS1PQg3Go@JAaMxR>;p&IE_xXpNm=CJm>ZFI4H%pwU{l2(qkL!q z9@23RVux6O@6neyXiO1k1GeAJ;z5C58k%_ogmQ)#i{gQ!2nNa2@F6)P_0E%~4d6*D zTkJx?zJ>cXz#UqS$GZ3#YVb}-E5COsqR;t!e~wbGoIt@2b>sdokXF8P9}hpGLXbuu zkw<7Imp!kEkQ0LOvd)r}hofNzvjHj8wG95CaO}{KA1Q+U;p*GgH`hO%oRcj9NEnt1f~J89T)I zXoB&F#GvTg@J?Ys4D8K_u}cgRjh)#dstKH1>LgeFbrEjw9GPIYyJ;RL3ih$nPBLVh zOP6u2B&+)_@|un`0^Fy1({!T{@WmhmH592JwJdEB_99%xN+(Q$lrCt+pV6dzB1b`n z(X2lq#-9@ToX9N@Z$8a%!!ztP_2X8rM^SVbg`x%Lmu49%@F7OgH#) z^@0blPOa-jISz-bX+JamedqXhPWj{*;u`u#G*H~c?^<*5fMw#%(@huF0mleN>S!Ub zK*L)-wtt7^-pojyjaWy=tRXBHy$OP&3CHo*wR;w+*&{^uh;SJ1mz4TmgNXFtJT2!Y z5{_cQ6C41;744(5TH< zr0x4bw~ZSxE4x8bl!qJ%O9g>wz5oO0`9pPwF&~cV|6o<_om>=lS-cZJ&})8| zonHY?(K68e0$=NT&;LNPF@)s*S|IA*yC%h_7TTX}0q>O`=sNe^M*nIs65!mEW4nV* zj+YMWW7Mplb)s}>lm(;!o%o98(Ea$1x)mhnUgNMa3Si+87|a;+yeijC!|gu($4Y)g zJX%bO!ANVB4{;v%dzIJ}GZE`VNK_}>9EK%!xSiMX!}>a*drHogG5@IQXcVQZB2)!IXJE&eB;#zUs9>ubIC2Knp3KL;y;kGDN4-m>z;jqOZqXW`L_G( z=X8Jl`yP#Ux$G)PdPn19-Z8;1MaSb4-U*Q|M^D6S z-kL~Pq9@~%-pTkW?CZPjLx;AZru7X z;6W6Hk*u06i3%TmaAhk0unw}KWX-8{Hhoq z!wmOH%NAL<5p<)}m$KkT?SO?mTix2oyUJz6Z>Av+(l$pw<@`G?byIA5gM?@NDdw33 zprM3{rve+ag35|a3w5UBH9~`#p&44M3bUBH;eMrTsLW>0%`y<8z}%aP=X|9YidTSq zm$I^0ej#xeFMPP36}Osr%5rpnX>g=u|*Ve+@kzT)et(MQBFV)oESJ1{`3O>hKzpG)4^Pk+&GvjJANchvpdgWxN+3_jw2aI_} zf`ijr`DrWZb~4{*Iaze7_L|N#gr=lzSha?k47P@oc$D;qy%E96|JHzs8!8Tdy5mtqMp~i zqpsIn+VfTrhrXY=KAq%lMCp?6?*c~i5*;Te)032QS_(c(gqE5A03<8tJK6}N2%Y<{$sL!ETD`mCa?-&NM|sU-DTq1BE3E#PvJl_2hgMVkj`zR?<~Q#+R>8bpD16V262uM*Z_<`EpV*9vjd@%sl0ZbYPSk~ zhqyWKHR7cl^yO1t|C+30Q5?+Tn zw`jyA0k8#m^6t{{+ms9YzyERm6OPSPYKDisN&|kwXU*6vjqeW77{&m9kE*GhP@%TW zYxt3ef+T<+1@UH5^bq+&Vc?|bytxr~C>J{V7d?3ni4BdJS-Vn6*{DT|*|T$5;qTH2 z=ZVaNWX>*Xj4;}2%giE`4L$+_#Yx|0Mv$hhT0vloglL+S7+dyBpCvmN>c&nxVstJ} z^AK^wr_w=9JP@~Nsq{CemOx!~L7y`yUBMr5EEUIHHN;;I#(&TX17HRmf1^26nwNcJ z*|Kiw+upG$=WPhnSz&NulB_(?^(nMgk~vtDM$p&_vr3apPSWeb=A6*zGclv`_Utd(YW}$lWF&w{bj3{2*+%tpoIdls!y2 zlsd47ZFDFAcdrPg0Fk_1$XmPpV&6Ehpg_vtmxKcO0Ka;!K&Y}I>bgQlOiWQ1;x>cnkoMY()mc6D?$F>AiX}6kYh!O_XOivBT$^O@ zOVq_hklKutpAU)g5s{CHTn6!UNF)C~QklA&sXLi^&9ispR(N*4C;WRl0x^XZ8nI&XI_5LSivhPL{{m zH;h!^TPO(d3S*a4ppd=K!EmEFdQln6Y 
zMOK0`ED2>;hBB<=?vAjY-VGaYBl(5kM#$GQt05ddVf0DmMQb{EEF^}?>pzhhS6+15 zNPPk1an|uM55{pugDH zwo4^aKIOE65~?u+wLqg*(W1<+K-En0p0JsoVp+HVj`=^Ky4Bt^bJ?tb8>6y%iONV2`g*jI-LlrFZWA>4R}uQPOp25lLbvU18o?FWy}!N^kM zF~}IeLoz?qN&G{a+$1PY=p6DO=>(j^n_L0uYkM$K;7&mAkfQ*d1LuI`nW?u7XbTN! zuiS=P0r|ELNPox{I<**aEg~V2CI}>ye2!_YJZHEa z1m0s9H^k}RfiEn4!*OBO4}2~6zEQBSb``yXHSC=jA@w+B47?Kl8qK3}7^BJdJ?wpa zM!cU9@rei!riJ|@Nc=gf{2SinIS?SRX?#l!&}M2;3a>3q`Lu8wp_Z@I4UI7Prjr&?R@ztJRrcZXO6;WCcxRX{u~V4T#(G1= zc!P)3CrxA?Z?XikiQzx9_taGRTKQVt*Zx`Cg{TT)f+_tuY9NxO)n66{#2szyBZr4N zbKi&%a-?wOn~3!)35kZKXc|uIgWijW)@gVw>u-oKL-HmPgXa7y4LCbyyVIh2r4B6c zw7rACQImN^F$+;yC$0?oX)rBVwd1bi{PlqgVtu~%?dw2 z2BZY+C7C6EIq7Y_0g@R6m3Q>RFeh*=qFUm#J&%-5-@bx`YvOd_Pa4?PcU8mD-0zl* z-pZraA}dI2Zn8ui^{g;3mJ?yaO~930-W;yp$9Mv&AO=vtD#Bh6rG1q|-KDzwFE~Z*EjZ!J#&{#Sg2@f**CH4;-s6rO`p{P$@C~}@muu`Z=9 z6Crrb)MjRi1)K&(OSC*Ba;`Hs3;_}JTlZaf7ro`fi<29Re-Zu|G^Y^=k#ZLC_#rOwDdIhG z{z#(@%>@d=%JYZshkX{=`F}m+~i<`W~=gwq%H#a6VvG}3m!%BrZ-*Ao8mfVvf*NWK{K_P z2m^VDB#Q9h--^lSJo$P2@5Xs~7}1kFx{vS*!+DKgQ+W2^TIfeqSL;3VD8FuUTD=oP zzq2mGl#}J?>EtvQaI#kKD16lV@G9z1;d~yqg;7>-wyyabJlyRqjxihUM8>1o1UrE< zD-J*7q?xnw#7!p`A8Ka?!>zYhrJE5Zu1|q~Ay4=TYGU)S6XC*~H<07UiBKST5H9;i zVth$tpU6!RZz)M}$u;OJ@nbmHfK~82VWWxYF`y%u$i2X~0^6ls-nEGAGfTygM z9ysvs9P*?eXhucYo@9SkbKlz0;Ag|vE8+49_iv!m=O%sy9gT8S%Ndrtn)Y+!zk0_X zLhbQ6s$d;;B3nizJfNxp$IoPU}+74Q@_xD#ioi!g2OaO{*x~lltgFS={t` zPd~2iAo?V(8y;I#^e#d}!vCQdk_`2bR`t$2W{_R_7m;R9+;P|C)vsXjDz$v_F)igK zzYOWl(SYQdio0#^g~v1-p@;nc3r?3S(-${ivw@vRR^wy8MS7HUAnE@|MmK+4_?N;M zrO=p&e|fJ=o8z!;H*H2qeV7HZ0GbGZrtj`UCLE`e;H^y?BgYXjf&?}KJg+S3CI}PK zKNZPCG>ph(mUM#o8mfu_CI6gKWNZsDAH$xZIhr|dKw9BCzlrmuB*ZtCjsYYnbXO8| eMJ<^tTFDT$wN6~R%CBm3`rIk}&)4~-7yci}DWwAd literal 0 HcmV?d00001 diff --git a/vllm/model_executor/models/__pycache__/mixtral.cpython-310.pyc b/vllm/model_executor/models/__pycache__/mixtral.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c42422a65406243f68db9bca1c71f69fd0140140 GIT binary patch literal 11106 zcma)CS$7=Ab?&`;dKL@@3khx-(Xyn`0!pGSD~^sLiV|hnBqWQpPSTOx9!~WD1N1DY zZV)6g6EBdD6eaYD9LIjhazMZ7ALK26Atx_M`pGAG(oc{?3ip22{GWY3(guD(@W zU0rqSyWhRl{c_pY@H_q6|62XyazocNp#_@Ig%RsrBQUyVV0Nv* z(n)5t>(A_(!LyJY6UQMro=cvCWJ#QW z@360{?RV?F*Bu z@*BOuR(d5$o1&REQCxYexf+QJX&UvYMMw(mtt1j*w;$ocef!GQmA>pYtgL#u(~FuC z_pbHDE6pUzX21Wvciw8Ek8u>M!mRpsG>AK`W*UiHGW&La`*0;R>#asKnmpXuOVhN%|6RfPo7K`K=+k8Olg(wW5ZRouLjKT~9W98`&dYUcTLzu?UmS z4t2iDokt(1hP0ZX?d_-aTWvY#Y%%wU~nCt3Dc{Mtda$C7$Jg7v11R6CJ);Kq8h2VtExoI1qK zY(%mb#q~~7C&lMZrE3wLx6zyWpqV63@r~^cRvcF5c2rl2RHu4nmBY zx77{R%Oshd_6OlcR$S`{5%pM$ve`}#lirO)2g^o@N;GQSBoyhbL6o`afW<9$YW^u- zz+vJ@{u@cE?P`1a9ZlDU&Y+MQyV_6-&7qFe8X8FLp^4NPT0?E!JVB-I&=%U3CeNnc zUg3^5vWGULep*Bg2ZtqaHaEXk8CqL+=h4o(BdASZ*thLF^t|eH zJu0V~_tlwAYmin2mCbeZT|?JcGCM618kEajxkSKsz0*wOS-fagjv0x@-PybJ=t0dW(gd|3!0z9{gSjIGiw#IC~e9X2)58kJkO50gY;HrY$uty{ahlK zsLckFhMQSQ6u0FWk{o`XYL)u-F_HdWqdB~Tt7Fs1yJst zUw-zPa|sXxi>|c+Tu;v3h~w@#1%%PfsI`R`P?yQMuCg!7gIn@RJmLb5Y{ zU(!9pqnu&shE9if*K$1FxG$3+~;zab%&htlfn`iRuX7`dql6(*$wPdTPh zZr2!jLvzm(#vNTgfra2fowZ*WWYvdQ48k0`zc6x{`J%>kE!2JG!Ma0NaO)4$cThhV z=B*69N!$FSgSJVXbCv!C7^&?Pwx3;AFD~=z^W*DBRrz!Ib;Fj6kYpCw?OYoZSlTG_ zqMk^$JLy`ZAfH1O`7KIbpyW>|`8JY(AcRmU@Okt@Y9;U)ZW&qvoAdIUbO$HTQ;syW zk0UvS#4$)M7Ij}gi=$##`&HfA_l-x9?i`<*>O*TTE03`bpdTffBcpUn_FA-R$3G7l znp~b(fefZpd)FKpBXeX8^*x<&$hc#SfG&H$v^)B{#>gHyDM~T%z$Rb}rLHjWH*dQm z7gwIJHZ1vMS{S;!IwbC%j!89#)*Yf;_R1&w_+0~P6v4Z|a+)!$df;|?gSgp>&>*arp9?Di8i;0 zN**mt#`(-MP(DZ`jV(1PB~8Uxv1mos$f^wmX`DtwP5QT?s`_K!kCy;Wg$1pJR`aUo&^sc|gkpxJ5PjQZ#PVB?i=gpc$c>o$q936|FfsMEZ 
zn8;^_=W~u&iQy;?;!(-xhN~cu($eQUOZTt411?e+uc<^5QfD5Y-yWGQcj(EB_K#FjMfn zQ~4LjOBzx6LrQ)`$-77z6=jh^641-aA5+pq5}25O`6_amzM1J8nSLX1H`G7`ZayMG zk)}LU-VRiRy`CxXX51!x{|_7q4N=9d>ot(Yc}$_DPZ+`k{+4s=)3D@Qs4l;c1a_z* zU#F|@Q1S*PtbukLd@(mTC6e+=VbgONmzn#$cP`SzNtzh8GjfG;tZtkfSeG|Z@GBh2 zD@Z)ea7dxpzK|8U&&D`xV~kG?*cszR3#HbS1+tvlXOh}`z~WFR>6iQ%$$`ZY6=A@> zz&Vd>*c5nJZHH}&EgaYsKvCp^w|JZuA~@%;D@D|XllBgfl0oMp0vmWjDHp~w46zn{2a-&Ss(=_!vFw<`tF(|6Kspr zS_eex0DXPk1((tRlFYbJyO8?0F5)QRpqgc2-6r5=xMoQeAWbdL5`Z6))EHo$6@?w7 z4N&bmP6}HXRv0i3(2D>D>T`te^f3Y<(I9Ee-G{503+D}*EP2*~If7PPs_2)pw!yJ! zYa0UdR2V8zwKt+$;f-dz6{*^cZ&`tSIBKZJ{3ByulKr!>FR55 zqvl4KfSM-}1>R&4@0FD#A^=27L%c{fqW~$Uc_m?X9N5JHro8Wy5)26Y0UI=Q48c5L zN1*TjqDJxz65sap(ga_XHwPsB)bPeg>fLp0OZWE47cA}XoXRnETH7YDDv!XJ%#%^l z5g?K+a^c25%1>N6f~hznY6;XGGkqZv96Sl+!c!OuE>d^XT`WLn_-T5FQY>(g1GU|MurZC zyT3!`0Pa3udyR0JDd&bJ7^S&x4`zivuk`?aBMaq-6(FL302=2JJWHA7;;KLrIGq$; zL-1@5r2l=rZ2;Wu_q?=>(#o#>skXY@04JJ3M5B%6XV~0- zY#fZ8(q>ZR%zrV)CPS;=9lW%RM<%3Hj%FDCzO%p6m>&Ddl3#w2!7h0Tnen|crl(B- zaDcF;AlO6Xvm=5cuGnkoh@060&s;=X9^nnz1yGE&Ku}CYN#;+9D>OI<*RqBu$vTp( z#KWZ?lCUot_)rtLvb=F(BM7n@sZd|;TQhDVE}_m0y>gS9vIu4ocexKtw^0`O++FU0 zWMJKaRQaw%Oit0TJ&KHv@R(sZ`v{W3KQ;DCSolWnp?mkLwrBWXELuAYM-FOMP?+Bl z38Pk47}L{)dD|gUO6$iX{co8YlQzoRlI;r9h=Quus(AN^VedPF& zV}~n5hn-y;7>nr2W(93nC~+8dZBSSHswTJ4IEIC`9pq1tWF|rx@~3q9Gb9aPjTrF> z1-84CYf{3zldv<>JDJTR&Wd;xlzxN8a2|Di(D@o%0tZ|YKK>U~6Krz+(1_)zDKDXV zW9ARwCXLd4^ik4k`lAu_1}dMR-Yp|T2nz0acZ{J9$0jPD8^^~uswBqe%oXY0FUQdT zMClkqg}#co`|tD)m<)2}@ALp0I>1lopABrox~qEN6ug2b>WEah;E7&TVC3{7ALo+r za-1!T3UIcFI7JD3tjg(ZREcI#s}@zUM)llk1bJ#3$c(csscbGRrp$z- zC9K%+w0WRBAu5}Po5&oP2MRTW#3Qhy!ZAlg5y3hL@D&%Py%bYEJVdmISn;~|ErfPp zA}G}0g69%1c7|s1R9fue-t`h3LgK~c6l;uYI2;z4VGe9!KbJd+x1f$Na57Z#wqtdQ z(@t!kB+bffj`hnC-88q=BZNhaeF`auD3N3{7%nBPui)4Lm|?qAVKP04$4$e z_A%3ESp6kRgJKURbZq?&zeDDzIsDo9HA;kA^cv#;$@FWex`^{&%P6a{R;l1HHr5Bt zbS*2i$lXSqNfPUVUB`=eoA@+AVgoKgnDWpBRna-35&WiS(E}iw{yVZw@Eza7O2aC^ zTPvCbYlW4yXWap1!j=wJn3_mJzuPvf*8uhjffVc%dN(xr*yfobNgqdm1wj;qOp1`} z^899F=plGg9QxQffR)>Gg^P`vi?DYt$}gfE!4_D+^02(_jVi;+u#C`4Y1h=Ti@N#C zp@UZId8;#dt3TSjcBs{(S||=@5H!R`Gu+Dfxj)ID`xEpxzYF+K7Vw(kUFT1yebCJ_ zi~{~j!)%xe1wWvv7f8l}zac?1CHRm8Z6t?SB550W_*Zcxv=hWEacKm8h{f zLqsJ&bWMax^?w3K9~wxVf5FJN&5z7Yhk2w6ur{`z4UFyQa6S)g`VAVCzomqnNZtq8 z@p2b;In$SS-0^M}sUo_5i>gsK)YjBt1(kn=g5QvK5}gC#w4k4#8+PSh4a~{=V(0u3 z%Z*h}t%l?%5_+kj2QzK@_TxJCM6oi$LAo=Shi73ymOH&0VOvI@Wj(xql6G zp;2Uhqc}2cgJJir(@KLX?S%;bvV?bc6M2#fU5RgBXy521T&ewzF6B-jX;c*{`V|TO zfs$WS@{dS@#Uw@7#tkN6w>cO9GJ=JC7nFDGCgKdVS$!LTu7IPWE4tTp>_o6jnumx9`>v>J%%6B_y?%!*^cX5&fOC49*rH?3hlXIkBOLANw2}Nr43>@ zx)T{)E3n7j(>=?oVb_Udgf+fl{M7oNornIwJSrN8DCbA$#|lE@hXaDaB5iMDynssJ z^^=joKyD2NeTi?6*ku@(2eT8VFcB>sh{hH#Z%vXNnewv#Shh$mn51n|`S+Auq2ymF z`O+}t`zXCc!+jkYPxG*$>J<#w7Uv64#-mB)i3drg|Cv$r@}Q+-`FD*X2Q%-IyWvda zwY!p@*LYrj7pKg@hb6QTz`U10LXs^~TX|%Wf)X6DUP722$%MxwDALCS%eiw+!xEfj zi82o3&kL}irUU03kq90+s-kkZmM76S-dXs{s*^9%c=Pout8(Myz_CLYJp2_MP+ybC z5)A-thwyhCyb}^U`4zVvV2BD2O0scB4@v%}Ifr6K6`cQyrWIeKbxxnw9k5#E=gv_d znYbO<#^koGv=b+ouWom%N1x=uCWrk2Z%+r$9`*G5?wni#pRhSi{6y_aef_ALA7DC? 
zRz$FLR9)^Fe*+WLk1EeUAW+o^J{e$xGS?k%psU4pnMylIGUp1z0molCxXaNFrp2uK zQ$8jGATiaX>m#_}Cl`RddCe61~VKE4~eoQ)2B2(D9c)}om#KswQTKrEzyT;ZGzYZLjw#sB<5fP z)WHl>Dbu@B4y7tD$wO+Z$Q%#)EAkKIJ*j@m@#M!;o-D~`e%}YoL5ZvERs~bt=>EFV zXmo$y?>l;Wm5Qt2I`Oyvy7JlYE6RUSW%4pmc^fI+QWYgqd?ivNEm1p~uXS`^?-;(J z5=~D`-{QON+kAIAB~4M4j_a4XhmqKwl3(JonRuOwU*WQqR6A4t6qoI!)|vLFx$GqM z&Wu09a=0KS%D6fDr9~}YZNI^LZ%F$>6l!b!w zDk#UICqQ|kpqvBccyt1k69wf5k$zceoZLF`ezz5ixZ8|h?sb!!ivoY>ipS3Px~=vK5#I0iH?vD|7Dizf zf>8Y^T#2KfWLey$F+j0DxpaBCCpry7)-JZYaVXGzwHLh;rm?Jj68DpKGtA;BZ ze{kia5K~LyWJvm?cl{sL(D(Ock#EP#%vHh1WezYvh-OWxA-{{98OJ#Yr9Yt}MzgJGTyO?q(j@sDQ zoK#78=YtM!y|hG}ZHjKwoR~e8EhHT&O^{{EjyU-N+XSv z7LOvHK)X0j8NKYTDoa7oOu{q`g1gGS%V#!IaV7yLCufrO)ieE@5SZ?{rL(V`Nx=;- z_rvBo#4J5?ElE0Oat4lX#LZ1`NnTCQbaDX?Ha5dZ%_SP+t#+RH@oUU z+f%o(i@RDuH$c}0#;mf9UES3OCigZ$H7{drmK%n61|x~*DSM8xXOT55af-?>A@i;4 zXM^5mCe;ANb6lKjZVCWi795zYc!8eb>_wEM-Rgo_vKP>TCq;`6K@h~xg487MfHbj3%~ffA%IG6rL4&Lm zo8OE9KEcEkX{!XyQ!F6+J*1SF#!)oexUXs2_nNMKry1G<%hc{`R`uS=zQ3p8+|(g8FmkI|s_-mbD5*DBPUX#(jANu%rk7b%Ug5#PA(2L$Rc6>;?}9fcDlGCVn~Qs z%vQrLvUm}gc@_0uYq3DiMX4@rnV&$JEN(ryxEiL5-QK7Tc+g(y#?fNdTWn`bO&C$5 zi-gWWnn6m5i&h|NU(9Z+1NwgDnj&U1DruOzj%3`1Hn?MS)kytoZ9@a(X?IoLu4O>e zR6ZRbUDR&2$B?d`6GCo(MSB4`p!(~uK+-g4_zne><8=8y%Hvy@q0q_1T3Gv zUk-%`Zw4uBl}ws=$x1A(tVfd)Z_+#n?;#Kj18hUt#s0Et;8l~x3U~o)#i%u2FqZMR zX%H54g6XSB={v|=(*hLbKM!l=V#Ta|W1qo#AFEd+P&7^TRAaxcJC=LTa&j>Dz8eGh z#JVzOij#Q30qmXvfj;Cs5{v3UT~j{S3AeY6fw~LU85nySSZs{l=C+mT(8Ok>W#*a{ z>38&PXxue>U_%c(u;qy=bx?|wO-1}-P~9$Vyf!H9RtD8QRaG`# z%g}~?&^53cj6Je}?d5OwaeYHV1m@MDw|z6pZuVoy!7_-SHZ;Eb_Ie!m(cGAV4X_&a zV`*G#$JfQr(MhTod^_#6vQBtIyha4G19VAHjzN2g4O$Y6sM!B&+R;}R6gQ``WrPtB0 z;v8k9Js#4cyz=6Q)IjqPuT#dL!`WDEzJZ1>k8%U-C75FkA=yhTS| z^$6PrrDZ`_BQ7CZLBjR#|D3C@PaB4ht-+?;Bws@KXp_cOc*K6Cod(^wcOyu%n+beS z@rP&=pCfA=ozQvgK#-#Ir*Z{7;s@~uM6HjhBi7IfvuHTqAi~m@KSD=-uOPEnIveQG7rix6p{mu(D^=?C1ELJSC+=Jo#6Z#rv#i@F*P*L9cW_#S!VURLFFobM6p!JHV`PcE8x`J z(;UJdc5ohOR!(q4tQ3L)(xgU(dQ^MNje&K#I)#MH#-e^!&fZXa z(%pq_RCZ0G4@v_U+R)gupeL1GYvA$N6^u<7qA!b!c&qpjS;H0~Zct`qzleb7Us6Wc zCnCzo;%HR!t%`~85@jvQh;@A(yU(gesvA;Wm+CcW$$S}nyI2ocN!W1my|5}^5WY{K z@utQ z`-3azY14#FJsz(#K0}v(Kw=XNO2f)o$oLlb{YkG&gV$wQ+C9tGzSZpNx7wQql#5aL zWYlU{^?QQGqoqD@kxPz-VHxjQuev9g5VH;u2I2;xrs! 
zbf9=B(CD;0M0_gn*2=6>Jg~!*96Ivv#!ZC$#6{Bj%M*4xTZ6QG!;50Kqzs6bUPCsi z2#9mZbpvFAZ+G-DhATp_Teq0L4V9 zK0=vd7`2vjprFkMCy#-IF@(`^3{@OW(Z-*3SM~L{pEha_L8r74xdXcj-kjVQKs66q z=9(0N;t5R}0M_zQVw(3}k8cLo$V?J69Pb@&aTNMl3^VnHh^ zJ<$QkBPJSPZgMJz5w^pKD31bhG8(0r&Zq@)01H4Fe_p;ahhXOapj%2X>Y5I0t|2Ot zS3t@6)^L!+YcPu3+uu4~h~FfKj9^e4f(p_RaoUa$?`RI=3wR@Y`&2)Vl#=3tb3kUg zHvsevVQ^dPLSqsnBHW_^+PDp%SeWdQddDb&LJrJ0<7})!;RQptD|qb^E}8x8>}+nxk$#ao8Fj zFmEdlgV*ED+YU|Yx4THaqL@aOCz{KjK@WLc}LA-Jb1#(Dxgs8sS3EFv2yS+mLCba(0-(dh^qdV)Nn+?Vhe+wwgWT`ujjg#uKUS7@v%eO2I4X>1Kk^|-z@heFQV-A^_j4g=YZ3BA{qEIvJ z=^7$_wC7*UY*TPTAJ_;sI54GvB~VtV4PgK8n7N6zQNMClf{)!qIv7*A!cL z2QQG+;jD{4rs@D$!_6Ujn+S|De?p}d%GjnNl#yy%n!JcPOB@0^Kc+P!``(42T8FVq zfgl8czOU(sOV1uyu><(>h;j6Ie9>rPzb(kzKix4T%hPVmF{(;FuTHf%R{P(lD1f#6-7GVQ724Y;xv+TTFC#dz!uyfqVn-parO zGM9IB6$ii@?a>n-7f*b=F%#hlywfizKHgo70WtC-JG0@K6eI0u7-fbX3TL^YH*|K1 zeS|+arqRX+k7(;TZEccP#6H$RyUokJJ?&x!CCl>H@TcaZt>X@=2eQdF27t%nEN@1j*AFZhBUcLKAVGrDLv-BB9y#Ato<&g~bo{H{ zhe3)k04X_7t;4!@77IOBA(qSH&+x|O#?0?# zdLO15=V&>PQg#~|BK`0n|AwlsQ#Sr`wu0CI`U(vn(pN8|aEQ+I9@UxdJ+15%hr5>H zKG4dvPR0Y|bk4(vzLrscAZYByYtjyasMo~lvxUM^6{|1!w~KxK61&?N^f{TCne*jRRR7hez< z9{!!h^oJUh^r?UDF+=ADH%SNmp=lgN$jxt3(}zDkdIIM>NxFms)Xrv?{wM|94tjk) zNb=7f_Tq=0991FyKoas?_<+=Z?y!d+(T0LJ-d{YdZ-E3nK}H`2DZj#fheuhtu>6qT zN>-+{;OF755`4bQF%k~Iu$LsrfMK4V5F{2U5+v!o36{g7Z$X%pltO;S{49{c(GUaH gZ~{e1J*s*J*(;t_(~$UH*XyV1=j+S$*XK|DFTvBy!2kdN literal 0 HcmV?d00001 diff --git a/vllm/model_executor/models/__pycache__/mpt.cpython-310.pyc b/vllm/model_executor/models/__pycache__/mpt.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fadbfcac5cb4143f52989b9fd1c2338eddddc022 GIT binary patch literal 8122 zcmbtZOLH98b?*1{^gJ*afOzm_NtA4jY=IKxB#tA?j6m73OhUA!$WBK}dpNfN4A9*( zxZMLjGLRjCDk+uX!W(9LNmWX_RAu9$DKp6@|TTYCP^_HC}?#3jGPJTr3RvR~%B6?t*RuW;Uus&UP)ao&mQ@r*wcH~dCC>(9nb zzsdYkG#AhN^PG31Q}Keo!1;2t7%%xt@v^_nWiL7%pYhLdz7m~{&-v%#^Zt1*SEDb) zPx()Az7{iTSX! z^`(8&e;%AuVga0mg3|(LQ7nP8RB&DZXIY#E=XAmOGB{_%S#Zu4oUaJ|ZLM|gQ`+gZ zmZ_|(-856~Tm7utOWKjL-x>6yu%)Zocar`fyB20`(azdncpG6Bv^!a_o}g5}-rfks zUuIdDP|KF3=2v^sAWp8gWjl((=xR3!+fp_1fAq826R)?^P|betyYC*WH2eMD&1n`! 
zD81c|F%^mI!n?gr`^Koo8}W@$h;FjcGSqpR6y}ijWDxg67zO=e{6MyoZT0NQ?{D^G zB!aYihgRF*!O-V;!G<~|!u9qb%7U9=cVjaPqF!5sQY~+;7enRzH0WlbY-c^1X|0&_ zTZ0U{2S)kY^>Ovc8A3;{p!h+=3YoF5e zK7~$dEgedi-R_6~jRKzk`DUxEthk+RDytKABV|RsjZ4bL)^s*~2M3dEgnnrq8=1Xw zNty8=Qf9w*Q#qYp(rITcTb@EUaskAjNy1xMfVZIxdh5X@rN_RLM!kNR-q+*>RC(~7 zi-S~Nj367)MeO9o{%r_Z^2*A~FI`M=CNK8eooz^IdT}?3;)}U>gtx-Z029iK>BYF8 zt@LlJ*?^^+&m$k?JVr?CAoJFOUe_DCtG}RIx_l0-4${=aNCwXTejg;%eC(QUlyy-P zMraC?w0>Dzw?5X^bzuqnZq0Xu1C4Y=iK!cwMLG0@N2PU7R74dhQ2AK1G{1^nud6EL zXsm5(ykjI&wTRU$h*`fn=8;*Ik#~`Zm;MIlm1+BsoI_35hWZvxKQQob;@=u-RNk^q z!D~#FX0ztFr%78He=eKf*PsD3 z8*@Pu?zWMB$TMLMuk#e%dP8&DmOsyPMZKRzqok5P?mG_Ok(y%nNhYo4k!P!F%<>~W4 zpTdKlyV*Hvk7N&Bw840ya?i%t=Y<9-sjexv6LtH+&i1adceaBl$#u!w-VKr*bo_E( zhU?v1K@|Hoeg%}R=0KGzhN=>Rp+^R>))$nW4*I}F zHwd~(Hwyw)5;;KnrD9N4Yu}{?f#}A5rJZF-0HucrOW1x*ewhgCeRG16qcJ3@mntPa zhDv4qfYxUui7H=h-wx$kPsYmH!D^MoYpzgrw>JQM$ljKmLD4TyVAllJ%8OLT*JflIjXeYlQ-K^NEFf~5TjIwl+5c-;iNtMT;%IG?K|^RXI(VAudm&&krF5EID`^= z0V(kyqI@+7b_VSzey~gu8OFEF#@B;PRFs1ayLR>Z+C=rIEG=OK4NH5HZh}l(TG%1j zS+)fxS%M5U33#t>*=+6fdt|YktVI5s(DuzeCv$~4bcXgmS?6-*p*Hn#w0?yV-&&Jj z!FZ~Q<4LjtQGIJ8j0UZ$dvf*S;| zlakh>Ot|}!(X=djHxsVXD&>+qMnP2>p#mYOk9PiSH-SqWhoTFd$dUDLXspXb_}KZx z3G4n5iBH_K9muK6h-4rKTzgn`fOY(%GEizJKZprF>9=yMIqEXgF;H5iuF1MU&*Ad0 zp6h!s+y`(-fDME|aDm2rT#V#(RBxH`Z}BJJC-QeWgO&G3#CVekuYn0Kghc!hDWx4H zrDHX#7n2?7CHGyCe1Z`2!-8Aum&r$NeF4A0wp=kZF}Uza;5x z2WjddQ%aBgA_)9S7n6J)F7)&Enc0CvLYZM{)JV7UeWg?}&1epl>C-BFGjypt^?g1#l{Xl7sZv__0 zHj-0_y?X%D@v*)R2Sr$S-OR(g%D#R@+XvJD&#RexP^0=>sv|ce--3KJ4z(-V`wM$! zHaj$hy>G}h+;7aSxy;>~rxNb-rv##0hj55?zD{byHo#evr!iZ9G3wm}LWi9$LwrQ8 zF=8;NI7y3~LrNDwHXy$5m-Y})_(c0e-#310>_8k@ep=Q+YZ+VQc1t-Q*Y_4GAlMlB?HyRC%@6!^_>6yOQg zB`gpO7TkBj1Bx}s8tN-=mm)&BX<_4L-h{;={O~=D3AcsgT6P|CaFc+yBUR1~!n*V* z$gCG8F!y#+RoZN)WCE3kjZEO`CY_Kcsmc_Af{zO;*OcJ~R)E+9#haM-7#=An?=pv^ z4a_Z98-k>NLrTendN#Z620Qb=x2z@Ic~m!Ym~!E&S54z5*6*O41Xt6$smi(64bPmB zLzPGVc*>6Dfu)7u|JS|2+l*#DR3*_nxnV{m-~-0wIsEYAZX&Yo@{wT)cZ9}i-2(2@*0aH=yQ%b ztJLCT==%lU9)Z54Kg?Mbh#Q>*eWJ#ZrFvKcC|Y41@h3;jhz4BSnX+DdDpiAZT}cjt>{6RjT11}B;~vIks&jz|8$iFpY0pd=ko<P?AAQxYLKIt>nrvR4#pw57h~PiOsx&<{*MGzU@!lg)G#uk*IDu zo^tLN8HXH|9>Iknho~g~9%O`_)JlE;;#WBQoAtJFr93uk<>coxH=ej%Oq_xkM?`cb zuZQprBEG5#G{*M@1f`AdAu`-$GddN-AV+1qHo6#O zI1ma^2;&}L6CjG~M1DB?7C`hILJtn$(>Xv4_)wSU=uWqb|D74YDM71?yP%86JZ^dj zVw5SWw@=o6hAq3j_AP|i8#0W0yO<*7YXCVzq+ z)>?D_e@v&lk|h5}{uo4=2=d8)qT-3ytXp{dTcq?&kYiw9e;n++2S$as2I3Wt!+!5`v{xBE~k$yUe&N0KIST*Y_!1IkL0nI1*F;xhGZ39ibKSt9B_0knmOCfA;h0j=u77G}M3kbE@W_Bl5@sM~6sfIE3Io zaoWXf3czIy_Fws&{&TmFB#*u^P9UGXLjUC_wH#L-WZfuT85cQ-nMZtQPTZ}>z)q6* z_)k4)ph6va7}hUVJio?u({N`Hk|#-$=U-sn5z~|g`RwwTc)EhmQh8*6qX6WUC=)@I zA~p?Y2Oe&IIsPHuv7atO%8^VRQAbux)=BqONDNxN26fcZuYtJ6JmSf?4^z^Y^r{8J gWmJtuZo&-6FC>;?aF*$P)~q$(Yd+n)u<+vl0c-Yvb^rhX literal 0 HcmV?d00001 diff --git a/vllm/model_executor/models/__pycache__/olmo.cpython-310.pyc b/vllm/model_executor/models/__pycache__/olmo.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9c33e8214076849fded9fd3567498f1a5073fae1 GIT binary patch literal 8949 zcmcIqOLN>-cE%fMG`iVrQj{pkHe^e(dSsJ!oJl4gN7h&m+cOlUu{}8QzAK5JSA2=HP$>$ zCY~DUo-Tfk*i1s-e{2?o1Q7kdSu6CuPn+&U5N zdv(FLqM3Nsn-%3!bS$3p=0w?!j>jjw6QW#>=HrvzNl|v9Q}JoUt-gt**U+hdT)YLXEWf; zj5rs-nPtboIX2=nz?oym!8tzS`~;j6Y#yBX5$7T}C)p`*PK`KkG4*q)ae8p+)1(#h zFlmNwcavz-y}5Lw>&9IcMsBkk_kyf_Ckow-cDCw%(qCC=Co9*3CKzFRWi?9|zN6i0 zsJXV(PP5#)*~{A9B#3hTi+(Q(mm6}fBnfEc)1=qWZiHFDf-C@|dMj87*)OszOsGji z%V)22qkf#+3V0AjVYJjv!hq+s;ukGfyX>PN4fE+=|LV&Fm8L)IZtQ2_3)V`Q`Mlc; zQ8qshVvL8QIQM0@8QdAyz82pJ8N;mb<}5Y9c+xKS#R~Yn(OCSAE{~X>wg=Q%O>~Af 
[binary data omitted]

literal 0
HcmV?d00001

diff --git a/vllm/model_executor/models/__pycache__/opt.cpython-310.pyc b/vllm/model_executor/models/__pycache__/opt.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48ec36719fbf1ac41ef127487f3c9ba27702a843
GIT binary patch
literal 8548
[binary data omitted]

literal 0
HcmV?d00001

diff --git a/vllm/model_executor/models/__pycache__/phi.cpython-310.pyc b/vllm/model_executor/models/__pycache__/phi.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e7f0691fdbf0d0b67777469965d37c237f30c21
GIT binary patch
literal 7414
[binary data omitted]

literal 0
HcmV?d00001

diff --git a/vllm/model_executor/models/__pycache__/qwen.cpython-310.pyc b/vllm/model_executor/models/__pycache__/qwen.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0b07bd131b554f490256e6e219078b86c604f0f
GIT binary patch
literal 7879
[binary data omitted]

literal 0
HcmV?d00001

diff --git a/vllm/model_executor/models/__pycache__/qwen2.cpython-310.pyc b/vllm/model_executor/models/__pycache__/qwen2.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..43efbaed339b9acf1d0883903a7e62b8a596eaec
GIT binary patch
literal 8565
[binary data omitted]

literal 0
HcmV?d00001

diff --git a/vllm/model_executor/models/__pycache__/stablelm.cpython-310.pyc b/vllm/model_executor/models/__pycache__/stablelm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eb4c1a0ea6cb13ba704cab838d0f557e95d0c876
GIT binary patch
literal 8390
[binary data omitted]

literal 0
HcmV?d00001

diff --git a/vllm/model_executor/models/__pycache__/starcoder2.cpython-310.pyc b/vllm/model_executor/models/__pycache__/starcoder2.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..abc8ea52561a047712a93b13287735b6315b5014
GIT binary patch
literal 8064
[binary data omitted]
z4sxVtl1eZx;fl*M8p@e`!y_~%%OH-fNo+6(4~FPKWeZ4AKHi%$aH|9silDo z9@;xfmyr_E8>!o>)IW6}J8`J?QQL=xA7W33eD$`%FQI!==Qoh?H6k}NhSWQ$az7>R zNcN=IT?FqBNC`no6__Y>W%lH%2bv{cQR1{ZPV&XC5}BY9{2DcUMC2lo>qJIx9!dSj zX!spcQUf_D^=BKLT3bjb9w)EyMHJwsjqUK2K>GJLyoUjQm6#eBN!y$-kdc@|;7HiC z6yhj_lxjw5j@2NtaKosdx>d}D)0w~<2<(w6W#~v{deBWy2&t4$(t1%O`$)b)TYj<7 zk9W+s4qp7`E_FG0`L$J_J8m0C991 zG7OnLu4+`Wk8M~$$nI++DHz|`kq}Qnb1r3i++GnTP?9oD(%FZdl5!23MKTS0h3$st zY7j1f1B94YkZ1jnXZ;TKyfkbFI%#W7W0F+?_ZN5JoVIuNB^x%bKm0>l%p_eP%=`y>4@t&JUC=5M zEdY&pseUAP(K==U{tp+Uz*$sqIVwGOIVww=Kt4cKOpQMECS9gqrEVk*Qr)&t7^&M= zbg$69l75P7ASX}XOf4!Q736INw&2kDQu$I{hjs9^M|N65Tlqk}q8tGKFrtzcx2n`1 zY0B57y{JihVXU?2GB9`uXh{$oNK)`Ouz)Y@eKl3SQoh3d@v-)MZAZiBsr={YgN!y9 zcK~hWT~6S6q>XG}c7!A#!ucRV0HM{1yBGcqbqS@GqG`^tb-efXa~n0(o|O-zr<+_r zVKAJ3MlYNmGi79kSh)+gKJ9GdLJ|6`gqpmAg#H8J?9zQ-nm|q%D$SlKOJ6=4l1M0F z4vq;9jSO{|+-q+HY+7D6vWtqui$qvi&OfKANPvVv@G=G>e@iV=FWx}uUyzbH5b0b) z3oPAGt>cXJlD*FX^w1gGY%7Q zm@m^?zN?uf;pVg}Bd_!2R`*xv_s>X^kY;*-G}gGUNh9d1Pdo!loE(BZhBOP`&DZo1 zf_WLz_!R_jU}}Lw)t~|#GJ-0?IhH@=*WjH_71iQ2f<23VJ*XjcXaqI%oI!auE3ex! zWN5N!iCJ|h^^zF#YOZzoWv`cb(T$t0LN1aw9Vz)O$OyS$uU>Ey;1Eds%5w{Ofxiwp zG`4Jtg}6?8cF>km^n!YniIW1Er9VvZmTJ~Fr5FeGFeM6Rcn1Gh;*!Sd9ljx>2BSN> z1PKWXGB)=-hHyGTVaiK0{{m&!=(Pz*0^c1$h-jo?Kp{SVEE*scLs(fTPgJ8UR*{GN z_%!CqeC;~TTF9IE6uqyIP@pv6WBBnN^&jHLpHWMeK?pyp@&Yb%ZQQMPtQmFOwvHX% z5KZ|c?R>Q_GK1G<7?OD(Pk;?0JW%;2+6NB9q}ioUEk9@UO&2jngVB*Tz$xG>j^P)a z8EN|X+6VeBL1bEP6@``txBnBjD{@)MjuI#&!+Uw_seVIBlvL|uwjc7aCJxa*~xqS&H) zFalCslqfXp;)`Psp>`hZh6#dhF9bG*0RJZ}b_1c%e*{RUIHoNae;9WF{Jurk$?V7u z48Z+|Ss24N`h{Iw*>)Oz6Wp^1cobz=!vSCAZ||3dQpbthJ;JkMl(%~qqHFaFU zC?LWRfs?_q59k;MyzPwlH{-lKsCEIoYfY8aco27Xv1#}ZB5dJ$yVuA=meBQ;u)XW9 z^I&IhpoS~CT2@X(;G0EZ_+cw$H40n!GI|fO3V6b2@g7m&K{xVR0Z&oOe40@55%~*{JHv@bV?!^g?9M@@sUd}4=PXRy23*pEf zQchSYU$pFJC!2`gz`+rw?2@hRq<)FA6r*fdGDm{?E|z1PHK>}6JBtnApj(V4BYYO9 zq1M2IV*;K5!>j$)`19Vm$qG3>d48W>;~l(Ut=ahho8toQ9{EbV4I=ae``oAMOK-;% z#{O@l$x=!~TwmfNEu6A;i zRCS38tRnj+!A>t|0~<(GSsDMrwG+Z^0*`sHMCp3m?eO#}=N--|fRF`x$oUjjcFT~l zNg2$LS19RxNU0k$IYXl<><=JWxSL#nJ}U`XL!&W+P<>e~Q torch.Tensor: + closest_power_of_2 = 2**math.floor(math.log2(total_num_heads)) + base = torch.tensor( + 2**(-(2**-(math.log2(closest_power_of_2) - 3))), + dtype=torch.float32, + ) + powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32) + slopes = torch.pow(base, powers) + + if closest_power_of_2 != total_num_heads: + extra_base = torch.tensor( + 2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))), + dtype=torch.float32, + ) + num_remaining_heads = min(closest_power_of_2, + total_num_heads - closest_power_of_2) + extra_powers = torch.arange(start=1, + end=1 + 2 * num_remaining_heads, + step=2, + dtype=torch.int32) + slopes = torch.cat( + [slopes, torch.pow(extra_base, extra_powers)], dim=0) + return slopes + + +class BaiChuanMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class BaiChuanAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + hidden_size: int, + num_heads: int, + position_embedding: str, + rope_theta: float = 10000, + max_position_embeddings: int = 8192, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.hidden_size = hidden_size + tensor_model_parallel_world_size = get_tensor_model_parallel_world_size( + ) + self.total_num_heads = num_heads + assert self.total_num_heads % tensor_model_parallel_world_size == 0 + self.num_heads = (self.total_num_heads // + tensor_model_parallel_world_size) + self.head_dim = hidden_size // self.total_num_heads + self.postion_embedding = position_embedding + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + # pylint: disable=invalid-name + self.W_pack = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_heads, + bias=False, + linear_method=linear_method, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + # Create the alibi slopes and slice them. + if self.postion_embedding == "ALIBI": + tp_rank = get_tensor_model_parallel_rank() + head_start = tp_rank * self.num_heads + head_end = (tp_rank + 1) * self.num_heads + alibi_slopes = _get_alibi_slopes(self.total_num_heads) + alibi_slopes = alibi_slopes[head_start:head_end].tolist() + + scaling = self.head_dim**-0.5 + self.attn = PagedAttention(self.num_heads, + self.head_dim, + scaling, + alibi_slopes=alibi_slopes) + else: + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=self.max_position_embeddings, + base=self.rope_theta, + ) + self.scaling = self.head_dim**-0.5 + self.attn = PagedAttention(self.num_heads, self.head_dim, + self.scaling) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.W_pack(hidden_states) + q, k, v = qkv.chunk(chunks=3, dim=-1) + if self.postion_embedding != "ALIBI": + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class BaiChuanDecoderLayer(nn.Module): + + def __init__(self, + config: PretrainedConfig, + position_embedding: str, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.self_attn = BaiChuanAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + position_embedding=position_embedding, + rope_theta=rope_theta, + max_position_embeddings=max_position_embeddings, + linear_method=linear_method, + ) + self.mlp = BaiChuanMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def 
forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class BaiChuanModel(nn.Module): + + def __init__(self, + config: PretrainedConfig, + position_embedding: str, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + BaiChuanDecoderLayer(config, position_embedding, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + residual, + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class BaiChuanBaseForCausalLM(nn.Module): + + def __init__(self, + config, + position_embedding: str, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = BaiChuanModel(config, position_embedding, linear_method) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + if name == "lm_head.weight": + # Unlike Baichuan, Baichuan2 normalizes the head weights. 
Refer to:
+                # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/84603cde5ebffb6084e476cfaeceaf0b8b91fe54/modeling_baichuan.py#L508
+                # Distinguish between Baichuan and Baichuan2 by checking the
+                # vocab size. This is suggested by
+                # https://github.com/vllm-project/vllm/pull/1022#discussion_r1325652704
+                is_baichuan2 = self.config.vocab_size == 125696
+                if is_baichuan2:
+                    loaded_weight = torch.nn.functional.normalize(
+                        loaded_weight)
+
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+class BaichuanForCausalLM(BaiChuanBaseForCausalLM):
+    """Baichuan 13B and Baichuan2 7B/13B."""
+
+    def __init__(self,
+                 config,
+                 linear_method: Optional[LinearMethodBase] = None):
+        if config.hidden_size in [4096, 6656]:  # Baichuan2 7B / 33B
+            super().__init__(config, "ROPE", linear_method)
+        else:  # Baichuan 13B, Baichuan2 13B
+            super().__init__(config, "ALIBI", linear_method)
+
+
+class BaiChuanForCausalLM(BaiChuanBaseForCausalLM):
+    """Baichuan 7B."""
+
+    def __init__(self,
+                 config,
+                 linear_method: Optional[LinearMethodBase] = None):
+        super().__init__(config, "ROPE", linear_method)
diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
new file mode 100644
index 0000000..4adfb6b
--- /dev/null
+++ b/vllm/model_executor/models/bloom.py
@@ -0,0 +1,330 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py
+# Copyright 2023 The CacheFlow team.
+# Copyright 2022 HuggingFace Inc. team and BigScience workshop.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
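+# Worked example for the _get_alibi_slopes helper below: with
+# total_num_heads = 12, closest_power_of_2 = 8, so the first eight slopes
+# are 2**-1, 2**-2, ..., 2**-8; the remaining four heads take the
+# odd-indexed slopes of the 16-head schedule, i.e. (2**-0.5)**1,
+# (2**-0.5)**3, (2**-0.5)**5 and (2**-0.5)**7, matching the interleaving
+# described in the ALiBi paper (Press et al., 2022).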
+"""Inference-only BLOOM model compatible with HuggingFace weights.""" +import math +from typing import List, Optional, Tuple + +import torch +from torch import nn +from transformers import BloomConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + LinearMethodBase, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: + closest_power_of_2 = 2**math.floor(math.log2(total_num_heads)) + base = torch.tensor( + 2**(-(2**-(math.log2(closest_power_of_2) - 3))), + dtype=torch.float32, + ) + powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32) + slopes = torch.pow(base, powers) + + if closest_power_of_2 != total_num_heads: + extra_base = torch.tensor( + 2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))), + dtype=torch.float32, + ) + num_remaining_heads = min(closest_power_of_2, + total_num_heads - closest_power_of_2) + extra_powers = torch.arange(start=1, + end=1 + 2 * num_remaining_heads, + step=2, + dtype=torch.int32) + slopes = torch.cat( + [slopes, torch.pow(extra_base, extra_powers)], dim=0) + return slopes + + +class BloomAttention(nn.Module): + + def __init__( + self, + config: BloomConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.hidden_size = config.hidden_size + self.total_num_heads = config.n_head + self.head_dim = self.hidden_size // self.total_num_heads + assert self.head_dim * self.total_num_heads == self.hidden_size + + tp_world_size = get_tensor_model_parallel_world_size() + assert self.total_num_heads % tp_world_size == 0 + self.num_heads = self.total_num_heads // tp_world_size + + self.query_key_value = QKVParallelLinear( + self.hidden_size, + self.head_dim, + self.total_num_heads, + bias=True, + linear_method=linear_method, + ) + self.dense = RowParallelLinear( + self.hidden_size, + self.hidden_size, + bias=True, + linear_method=linear_method, + ) + + # Create the alibi slopes and slice them. + tp_rank = get_tensor_model_parallel_rank() + head_start = tp_rank * self.num_heads + head_end = (tp_rank + 1) * self.num_heads + alibi_slopes = _get_alibi_slopes(self.total_num_heads) + alibi_slopes = alibi_slopes[head_start:head_end].tolist() + + scaling = self.head_dim**-0.5 + self.attn = PagedAttention(self.num_heads, + self.head_dim, + scaling, + alibi_slopes=alibi_slopes) + + def forward( + self, + position_ids: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + del position_ids # Unused. 
+ qkv, _ = self.query_key_value(hidden_states) + q, k, v = qkv.chunk(chunks=3, dim=-1) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.dense(attn_output) + return output + + +class BloomMLP(nn.Module): + + def __init__( + self, + config: BloomConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + hidden_size = config.hidden_size + self.dense_h_to_4h = ColumnParallelLinear( + hidden_size, + 4 * hidden_size, + linear_method=linear_method, + ) + quant_config = getattr(linear_method, "quant_config", None) + self.gelu_impl = get_act_fn("gelu", quant_config, 4 * hidden_size) + self.dense_4h_to_h = RowParallelLinear( + 4 * hidden_size, + hidden_size, + linear_method=linear_method, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x, _ = self.dense_h_to_4h(x) + x = self.gelu_impl(x) + x, _ = self.dense_4h_to_h(x) + return x + + +class BloomBlock(nn.Module): + + def __init__( + self, + config: BloomConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + hidden_size = config.hidden_size + + self.input_layernorm = nn.LayerNorm(hidden_size, + eps=config.layer_norm_epsilon) + self.self_attention = BloomAttention(config, linear_method) + self.post_attention_layernorm = nn.LayerNorm( + hidden_size, eps=config.layer_norm_epsilon) + self.mlp = BloomMLP(config, linear_method) + self.apply_residual_connection_post_layernorm = ( + config.apply_residual_connection_post_layernorm) + + def forward( + self, + position_ids: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + + # Layer norm post the self attention. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + # Self attention. + attention_output = self.self_attention( + position_ids=position_ids, + hidden_states=layernorm_output, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + attention_output = attention_output + residual + layernorm_output = self.post_attention_layernorm(attention_output) + + # Get residual + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = attention_output + + # MLP. 
+ output = self.mlp(layernorm_output) + residual + return output + + +class BloomModel(nn.Module): + + def __init__( + self, + config: BloomConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.embed_dim = config.hidden_size + + # Embedding + LN Embedding + self.word_embeddings = VocabParallelEmbedding( + config.vocab_size, + self.embed_dim, + ) + self.word_embeddings_layernorm = nn.LayerNorm( + self.embed_dim, eps=config.layer_norm_epsilon) + + # Transformer blocks + self.h = nn.ModuleList([ + BloomBlock(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + + # Final Layer Norm + self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.word_embeddings(input_ids) + hidden_states = self.word_embeddings_layernorm(hidden_states) + for i in range(len(self.h)): + layer = self.h[i] + hidden_states = layer( + position_ids, + hidden_states, + kv_caches[i], + input_metadata, + ) + hidden_states = self.ln_f(hidden_states) + return hidden_states + + +class BloomForCausalLM(nn.Module): + + def __init__( + self, + config: BloomConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.linear_method = linear_method + self.transformer = BloomModel(config, linear_method) + self.lm_head_weight = self.transformer.word_embeddings.weight + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.transformer(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head_weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + params_dict = dict(self.named_parameters(remove_duplicate=False)) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if name == "lm_head.weight": + continue + if not name.startswith("transformer."): + name = "transformer." + name + param = params_dict[name] + + if "query_key_value" in name: + # NOTE: BLOOM's fused QKV's output_dim has the shape of + # (num_heads * 3 * head_size), while the + # required shape is (3 * num_heads * head_size). + # Thus, we need weight conversion. 
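+                # For example, with num_heads = 2 and head_size = 4 the
+                # checkpoint orders the fused rows as
+                # [q0 k0 v0 | q1 k1 v1] (one q/k/v block per head), while
+                # QKVParallelLinear expects [q0 q1 | k0 k1 | v0 v1]; the
+                # view/transpose/reshape below performs exactly this
+                # regrouping along output_dim.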
+ output_dim = getattr(param, "output_dim", None) + num_heads = self.config.num_attention_heads + if output_dim is not None: + loaded_weight_shape = loaded_weight.shape + loaded_weight = loaded_weight.view( + loaded_weight_shape[:output_dim] + (num_heads, 3, -1) + + loaded_weight_shape[output_dim + 1:]) + loaded_weight = loaded_weight.transpose( + output_dim, output_dim + 1) + loaded_weight = loaded_weight.reshape(loaded_weight_shape) + + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py new file mode 100644 index 0000000..20d0819 --- /dev/null +++ b/vllm/model_executor/models/chatglm.py @@ -0,0 +1,396 @@ +# coding=utf-8 +# Adapted from +# https://github.com/THUDM/ChatGLM2-6B +"""Inference-only ChatGLM model compatible with THUDM weights.""" +from typing import List, Optional, Tuple + +import torch +from torch import nn +from torch.nn import LayerNorm + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput +from vllm.transformers_utils.configs import ChatGLMConfig + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class GLMAttention(nn.Module): + + def __init__( + self, + config, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.hidden_size = config.hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = config.num_attention_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.multi_query_attention = config.multi_query_attention + self.total_num_kv_heads = (config.multi_query_group_num + if config.multi_query_attention else + config.num_attention_heads) + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
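+            # For example, with multi_query_group_num = 2 and tp_size = 8,
+            # each of the 2 KV heads is replicated across 4 ranks and
+            # num_kv_heads works out to 1 per rank.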
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = config.hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + + self.query_key_value = QKVParallelLinear( + self.hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=config.add_bias_linear or config.add_qkv_bias, + linear_method=linear_method, + ) + self.dense = RowParallelLinear( + self.total_num_heads * self.head_dim, + config.hidden_size, + bias=config.add_bias_linear, + linear_method=linear_method, + ) + + # https://huggingface.co/THUDM/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141 + rope_ratio = getattr(config, "rope_ratio", 1.0) + max_positions = getattr(config, "seq_length", 8192) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim // 2, + max_position=max_positions, + base=10000 * rope_ratio, + is_neox_style=False, + ) + self.attn = PagedAttention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + ) + + def forward( + self, + hidden_states: torch.Tensor, + position_ids: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.query_key_value(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(position_ids, q, k) + key_cache, value_cache = kv_cache + context_layer = self.attn( + q, + k, + v, + key_cache, + value_cache, + input_metadata, + ) + attn_output, _ = self.dense(context_layer) + return attn_output + + +class GLMMLP(nn.Module): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + """ + + def __init__( + self, + config, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + + self.add_bias = config.add_bias_linear + + # Project to 4h. + self.dense_h_to_4h = MergedColumnParallelLinear( + config.hidden_size, + [config.ffn_hidden_size] * 2, + bias=config.add_bias_linear, + linear_method=linear_method, + ) + + self.activation_func = SiluAndMul() + + # Project back to h. + self.dense_4h_to_h = RowParallelLinear( + config.ffn_hidden_size, + config.hidden_size, + bias=config.add_bias_linear, + linear_method=linear_method, + ) + + def forward(self, hidden_states): + # [s, b, 4hp] + intermediate_parallel, _ = self.dense_h_to_4h(hidden_states) + intermediate_parallel = self.activation_func(intermediate_parallel) + # [s, b, h] + output, _ = self.dense_4h_to_h(intermediate_parallel) + return output + + +class GLMBlock(nn.Module): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__( + self, + config, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.apply_residual_connection_post_layernorm = ( + config.apply_residual_connection_post_layernorm) + + self.fp32_residual_connection = config.fp32_residual_connection + + layer_norm_func = RMSNorm if config.rmsnorm else LayerNorm + # Layernorm on the input data. + self.input_layernorm = layer_norm_func(config.hidden_size, + eps=config.layernorm_epsilon) + + # Self attention. 
+ self.self_attention = GLMAttention(config, linear_method) + self.hidden_dropout = config.hidden_dropout + + # Layernorm on the attention output + self.post_attention_layernorm = layer_norm_func( + config.hidden_size, eps=config.layernorm_epsilon) + + # MLP + self.mlp = GLMMLP(config, linear_method) + + def forward( + self, + hidden_states: torch.Tensor, + position_ids: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + # hidden_states: [num_tokens, h] + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + # Self attention. + attention_output = self.self_attention( + hidden_states=layernorm_output, + position_ids=position_ids, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + layernorm_input = residual + attention_output + + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + + # Second residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + output = self.mlp(layernorm_output) + residual + + return output + + +class GLMTransformer(nn.Module): + """Transformer class.""" + + def __init__( + self, + config, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.post_layer_norm = config.post_layer_norm + + # Number of layers. + self.num_layers = config.num_layers + + # Transformer layers. + self.layers = nn.ModuleList( + [GLMBlock(config, linear_method) for i in range(self.num_layers)]) + + if self.post_layer_norm: + layer_norm_func = RMSNorm if config.rmsnorm else LayerNorm + # Final layer norm before output. + self.final_layernorm = layer_norm_func( + config.hidden_size, eps=config.layernorm_epsilon) + + def forward( + self, + hidden_states: torch.Tensor, + position_ids: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + for i in range(self.num_layers): + layer = self.layers[i] + hidden_states = layer( + hidden_states=hidden_states, + position_ids=position_ids, + kv_cache=kv_caches[i], + input_metadata=input_metadata, + ) + # Final layer norm. + if self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states + + +class ChatGLMModel(nn.Module): + + def __init__( + self, + config, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + + self.embedding = VocabParallelEmbedding(config.padded_vocab_size, + config.hidden_size) + + self.num_layers = config.num_layers + self.multi_query_group_num = config.multi_query_group_num + self.kv_channels = config.kv_channels + self.encoder = GLMTransformer(config, linear_method) + + self.output_layer = ParallelLMHead(config.padded_vocab_size, + config.hidden_size) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + inputs_embeds = self.embedding(input_ids) + + # Run encoder. 
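+        # The encoder runs every GLMBlock in order and, when post_layer_norm
+        # is enabled, applies the final layer norm to the last hidden states.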
+        hidden_states = self.encoder(
+            hidden_states=inputs_embeds,
+            position_ids=position_ids,
+            kv_caches=kv_caches,
+            input_metadata=input_metadata,
+        )
+        return hidden_states
+
+
+class ChatGLMForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: ChatGLMConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.config: ChatGLMConfig = config
+        self.linear_method = linear_method
+        self.transformer = ChatGLMModel(config, linear_method)
+        self.lm_head_weight = self.transformer.output_layer.weight
+        self.sampler = Sampler(config.padded_vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.transformer(input_ids, positions, kv_caches,
+                                         input_metadata)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
+                                   sampling_metadata)
+        return next_tokens
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     load_format: str = "auto",
+                     revision: Optional[str] = None):
+        try:
+            # torch < 2.0 does not support "remove_duplicate=False" in
+            # named_parameters; fall back to the plain call below if this
+            # raises.
+            params_dict = dict(self.named_parameters(remove_duplicate=False))
+            for name, loaded_weight in hf_model_weights_iterator(
+                    model_name_or_path, cache_dir, load_format, revision):
+                if "rotary_pos_emb.inv_freq" in name:
+                    continue
+                if "word_embeddings" in name:
+                    name = name.replace(".word_embeddings", "")
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
+        except TypeError:
+            params_dict = dict(self.named_parameters())
+            for name, loaded_weight in hf_model_weights_iterator(
+                    model_name_or_path, cache_dir, load_format, revision):
+                if "rotary_pos_emb.inv_freq" in name:
+                    continue
+                if "word_embeddings" in name:
+                    name = name.replace(".word_embeddings", "")
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                try:
+                    param = params_dict[name]
+                except KeyError:
+                    assert name == "transformer.output_layer.weight"
+                    param = self.transformer.output_layer.weight
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
diff --git a/vllm/model_executor/models/cpm.py b/vllm/model_executor/models/cpm.py
new file mode 100644
index 0000000..d693025
--- /dev/null
+++ b/vllm/model_executor/models/cpm.py
@@ -0,0 +1,368 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only LLaMA model compatible with HuggingFace weights.""" +from typing import Any, Dict, List, Optional, Tuple + +import torch +from torch import nn + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.layernorm import (RMSNorm) + + +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, + ParallelLMHead) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput +from vllm.transformers_utils.configs import CPMDragonflyConfig + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class CPMMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. " + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class CPMAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
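+            # For example, with num_kv_heads = 8 and tp_size = 2, each rank
+            # keeps 4 distinct KV heads; with num_kv_heads = 2 and
+            # tp_size = 4, each KV head is instead served by 2 ranks.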
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + linear_method=linear_method, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class CPMDecoderLayer(nn.Module): + + def __init__( + self, + config: CPMDragonflyConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.self_attn = CPMAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + linear_method=linear_method, + ) + self.mlp = CPMMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.scale_states = config.scale_states # hidden_states + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual, scale = self.scale_states) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual, scale = self.scale_states) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class CPMModel(nn.Module): + + def __init__( + self, + config: CPMDragonflyConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + 
self.padding_idx = getattr(config,"pad_token_id",None) + self.vocab_size = config.vocab_size + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + CPMDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.scale = self.config.scale + self.scale_emb = self.config.scale_emb # embeding + self.scale_states = self.config.scale_states # hidden_states + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + if self.scale: + hidden_states = self.embed_tokens(input_ids) * self.scale_emb + else: + hidden_states = self.embed_tokens(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + residual, + ) + hidden_states, _ = self.norm(hidden_states, residual, scale=self.scale_states) + return hidden_states + + +class CPMDragonflyForCausalLM(nn.Module): + + def __init__( + self, + config: CPMDragonflyConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = CPMModel(config, linear_method) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + self.apply_inf = False + self.sampler_weight = None + self.scale_width = self.config.scale_width # output logits + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + self.apply_inf = input_metadata.is_prompt + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> SamplerOutput: + # next_tokens = self.sampler(self.sampler_weight, + # hidden_states, + # sampling_metadata, + # apply_inf = self.apply_inf, + # index=1, + # skip_prompt=True, # skip prompt tokens when apply _apply_penalties function + # logits_scale=self.scale_width, # apply scale in sampler to avoid + # ) # an elementwise op on all outputs + next_tokens = self.sampler(self.sampler_weight, + hidden_states, + sampling_metadata, + logits_scale=self.scale_width, + ) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + + self.sampler_weight = self.model.embed_tokens.weight if self.config.tie_lm_head == False else self.lm_head.weight \ No newline at end of file diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py new file mode 100644 index 0000000..abf4a46 --- /dev/null +++ b/vllm/model_executor/models/decilm.py @@ -0,0 +1,127 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 DeciAI Research Team. All rights reserved. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on MistralAI GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only DeciLM model compatible with HuggingFace weights.""" + +from typing import Optional + +import torch +from transformers import PretrainedConfig + +from vllm.config import LoRAConfig +from vllm.model_executor.layers.linear import LinearMethodBase +from vllm.model_executor.models.llama import LlamaForCausalLM +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) + + +class DeciLMForCausalLM(LlamaForCausalLM): + """ + Implementation for https://huggingface.co/Deci/DeciLM-7b-instruct. + Based on the llama executor. + + The main difference is that DeciLM uses Variable Grouped Query Attention. + The constant number of GQA heads in the decoder is overridden with a value + per layer. + + Usually, in the HuggingFace implementation, instead of + "config.num_key_value_heads", we use + "config.num_key_value_heads_per_layer[i]" which varies. + + Currently, PagedAttention does not work well with variable GQA, so we + normalize the weights upon loading, and use uniform GQA with the max value + instead. 
+ """ + + def __init__( + self, + config: Optional[PretrainedConfig] = None, + linear_method: Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + config.num_key_value_heads = max(config.num_key_value_heads_per_layer) + delattr(config, "num_key_value_heads_per_layer") + super().__init__(config=config, + linear_method=linear_method, + lora_config=lora_config) + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + + if "k_proj" in name or "v_proj" in name: + loaded_weight = self._degroup_weight(loaded_weight) + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + + def _degroup_weight(self, loaded_weight: torch.Tensor) -> torch.Tensor: + hidden_size = self.config.hidden_size + head_size = self.config.hidden_size // self.config.num_attention_heads + target_num_kv_heads = self.config.num_key_value_heads + num_kv_heads = loaded_weight.shape[0] // head_size + n_repeats = target_num_kv_heads / num_kv_heads + assert n_repeats == int(n_repeats) + + n_repeats = int(n_repeats) + loaded_weight = loaded_weight.view(num_kv_heads, head_size, + hidden_size) + loaded_weight = torch.repeat_interleave(loaded_weight, + repeats=n_repeats, + dim=0) + loaded_weight = loaded_weight.reshape(target_num_kv_heads * head_size, + hidden_size) + + return loaded_weight diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py new file mode 100644 index 0000000..6dba952 --- /dev/null +++ b/vllm/model_executor/models/deepseek.py @@ -0,0 +1,444 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only Deepseek model.""" +from typing import Any, Dict, List, Optional, Tuple + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + ReplicatedLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.communication_op import ( + tensor_model_parallel_all_reduce) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class DeepseekMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + linear_method: Optional[LinearMethodBase] = None, + reduce_results: bool = True, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method, + reduce_results=reduce_results) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class DeepseekMoE(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + self.n_routed_experts = config.n_routed_experts + self.top_k = config.num_experts_per_tok + if self.tp_size > self.n_routed_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of experts {self.n_routed_experts}.") + + self.experts = nn.ModuleList([ + DeepseekMLP(hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + reduce_results=False) + for idx in range(self.n_routed_experts) + ]) + self.pack_params() + + self.gate = ReplicatedLinear(config.hidden_size, + self.n_routed_experts, + bias=False, + linear_method=None) + + if config.n_shared_experts is not None: + intermediate_size = config.moe_intermediate_size * config.n_shared_experts + self.shared_experts = DeepseekMLP( + hidden_size=config.hidden_size, + intermediate_size=intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + reduce_results=False, + ) + + def pack_params(self): + w1 = [] + w2 = [] + for expert in self.experts: + w1.append(expert.gate_up_proj.weight) + w2.append(expert.down_proj.weight) + self.w1 = torch._utils._flatten_dense_tensors(w1) + w1s = torch._utils._unflatten_dense_tensors(self.w1, w1) + for data, param in zip(w1s, w1): + param.data = data + self.w1 = self.w1.view(len(w1), *w1s[0].shape) + + self.w2 = torch._utils._flatten_dense_tensors(w2) + w2s = torch._utils._unflatten_dense_tensors(self.w2, w2) + for data, param in zip(w2s, w2): + param.data = data + + self.w2 = self.w2.view(len(w2), *w2s[0].shape) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, sequence_length, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + if self.config.n_shared_experts is not None: + shared_output = self.shared_experts(hidden_states) + # router_logits: (batch * sequence_length, n_experts) + router_logits, _ = self.gate(hidden_states) + final_hidden_states = fused_moe(hidden_states, + self.w1, + self.w2, + router_logits, + self.top_k, + renormalize=self.config.norm_topk_prob, + inplace=True) + + if self.config.n_shared_experts is not None: + final_hidden_states = final_hidden_states + shared_output + final_hidden_states = tensor_model_parallel_all_reduce( + final_hidden_states) + + return final_hidden_states.view(batch_size, sequence_length, + hidden_dim) + + +class DeepseekAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, 
so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + linear_method=linear_method, + ) + + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class DeepseekDecoderLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + layer_idx: int, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.self_attn = DeepseekAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + linear_method=linear_method, + ) + if (config.n_routed_experts is not None and \ + layer_idx >= config.first_k_dense_replace and layer_idx % config.moe_layer_freq == 0): + self.mlp = DeepseekMoE(config=config, linear_method=linear_method) + else: + self.mlp = DeepseekMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> torch.Tensor: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Fully Connected + 
hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class DeepseekModel(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + DeepseekDecoderLayer(config, + layer_idx, + linear_method=linear_method) + for layer_idx in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer(positions, hidden_states, + kv_caches[i], input_metadata, + residual) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class DeepseekForCausalLM(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = DeepseekModel(config, linear_method) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: Optional[torch.Tensor], + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, + cache_dir, + load_format, + revision, + fall_back_to_pt=False): + if "rotary_emb.inv_freq" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip experts that are not assigned to this worker. + if (("mlp.experts." in name or "mlp.shared_experts." in name) + and name not in params_dict): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip experts that are not assigned to this worker. + if (("mlp.experts." in name or "mlp.shared_experts." 
in name) + and name not in params_dict): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py new file mode 100644 index 0000000..2b5e022 --- /dev/null +++ b/vllm/model_executor/models/falcon.py @@ -0,0 +1,447 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py +# Copyright 2023 The vLLM team. +# Copyright 2023 the Falcon authors and HuggingFace Inc. team. All rights +# reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Falcon model.""" + +import math +from typing import List, Optional, Tuple, Union + +import torch +from torch import nn +from torch.nn import LayerNorm +from transformers import FalconConfig as HF_FalconConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + LinearMethodBase, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.communication_op import ( + tensor_model_parallel_all_reduce) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput +from vllm.transformers_utils.configs import RWConfig + +KVCache = Tuple[torch.Tensor, torch.Tensor] +FalconConfig = Union[HF_FalconConfig, RWConfig] + + +def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: + closest_power_of_2 = 2**math.floor(math.log2(total_num_heads)) + base = torch.tensor(2**(-(2**-(math.log2(closest_power_of_2) - 3))), + dtype=torch.float32) + powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32) + slopes = torch.pow(base, powers) + + if closest_power_of_2 != total_num_heads: + extra_base = torch.tensor( + 2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))), + dtype=torch.float32) + num_remaining_heads = min(closest_power_of_2, + total_num_heads - closest_power_of_2) + extra_powers = torch.arange(1, + 1 + 2 * num_remaining_heads, + 2, + dtype=torch.int32) + slopes = torch.cat( + [slopes, torch.pow(extra_base, extra_powers)], dim=0) + + return slopes + + +class FalconAttention(nn.Module): + + def __init__( + self, + config: FalconConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + + 
self.hidden_size = config.hidden_size + tp_size = get_tensor_model_parallel_world_size() + + self.total_num_heads = config.num_attention_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.head_dim = self.hidden_size // self.total_num_heads + assert self.head_dim * self.total_num_heads == self.hidden_size + + self.new_decoder_architecture = config.new_decoder_architecture + self.multi_query = config.multi_query + + if self.new_decoder_architecture: + self.total_num_kv_heads = config.num_kv_heads + elif self.multi_query: + self.total_num_kv_heads = 1 + else: + self.total_num_kv_heads = self.total_num_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + + self.query_key_value = QKVParallelLinear( + self.hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=config.bias, + skip_bias_add=True, + linear_method=linear_method, + ) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + + # Layer-wise attention scaling + self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim) + self.reduce_row_parallel_results = not (config.new_decoder_architecture + or config.parallel_attn) + self.dense = RowParallelLinear( + self.hidden_size, + self.hidden_size, + bias=config.bias, + skip_bias_add=True, + linear_method=linear_method, + reduce_results=self.reduce_row_parallel_results) + + self.use_rotary = config.rotary + self.use_alibi = config.alibi + assert not (self.use_rotary and self.use_alibi), ( + "Rotary and alibi are mutually exclusive.") + + if self.use_rotary: + rope_theta = getattr(config, "rope_theta", 10000) + max_position_embeddings = getattr(config, + "max_position_embeddings", 8192) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.inv_norm_factor, + num_kv_heads=self.num_kv_heads) + elif self.use_alibi: + tp_rank = get_tensor_model_parallel_rank() + head_start = tp_rank * self.num_heads + head_end = (tp_rank + 1) * self.num_heads + alibi_slopes = (_get_alibi_slopes(self.total_num_heads) * + self.inv_norm_factor) + alibi_slopes = alibi_slopes[head_start:head_end].tolist() + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.inv_norm_factor, + num_kv_heads=self.num_kv_heads, + alibi_slopes=alibi_slopes) + else: + self.attn = PagedAttention(self.num_heads, + self.head_dim, + scale=self.inv_norm_factor, + num_kv_heads=self.num_kv_heads) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, bias = self.query_key_value(hidden_states) + if bias is not None: + qkv += bias + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + if self.use_rotary: + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + attn_output, bias = self.dense(attn_output) + return attn_output, bias + + +class 
FalconMLP(nn.Module): + + def __init__( + self, + config: FalconConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + hidden_size = config.hidden_size + + self.dense_h_to_4h = ColumnParallelLinear(hidden_size, + 4 * hidden_size, + bias=config.bias, + skip_bias_add=True, + linear_method=linear_method) + quant_config = getattr(linear_method, "quant_config", None) + self.act = get_act_fn("gelu", quant_config, 4 * hidden_size) + self.reduce_row_parallel_results = not (config.new_decoder_architecture + or config.parallel_attn) + self.dense_4h_to_h = RowParallelLinear( + 4 * hidden_size, + hidden_size, + bias=config.bias, + skip_bias_add=True, + reduce_results=self.reduce_row_parallel_results, + linear_method=linear_method) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # NOTE(zhuohan): Following huggingface, we do not fuse bias add here. + x, bias = self.dense_h_to_4h(x) + if bias is not None: + x += bias + x = self.act(x) + x, bias = self.dense_4h_to_h(x) + return x, bias + + +class FalconDecoderLayer(nn.Module): + + def __init__( + self, + config: FalconConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.self_attention = FalconAttention(config, linear_method) + self.mlp = FalconMLP(config, linear_method) + self.config = config + + if config.new_decoder_architecture: + # The layer norm before self-attention + self.ln_attn = LayerNorm(hidden_size, + eps=config.layer_norm_epsilon) + # The layer norm before the MLP + self.ln_mlp = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + else: + self.input_layernorm = LayerNorm(hidden_size, + eps=config.layer_norm_epsilon) + if not config.parallel_attn: + self.post_attention_layernorm = LayerNorm( + hidden_size, eps=config.layer_norm_epsilon) + + self.reduce_row_parallel_results = not (config.new_decoder_architecture + or config.parallel_attn) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + residual = hidden_states + + if self.config.new_decoder_architecture: + attention_layernorm_out = self.ln_attn(hidden_states) + mlp_layernorm_out = self.ln_mlp(hidden_states) + else: + attention_layernorm_out = self.input_layernorm(hidden_states) + + # Self attention. + attention_output, attention_bias = self.self_attention( + positions=positions, + hidden_states=attention_layernorm_out, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + if self.reduce_row_parallel_results and attention_bias is not None: + attention_output += attention_bias + + if not self.config.new_decoder_architecture: + if self.config.parallel_attn: + mlp_layernorm_out = attention_layernorm_out + else: + residual += attention_output + mlp_layernorm_out = self.post_attention_layernorm(residual) + + # MLP. + mlp_output, mlp_bias = self.mlp(mlp_layernorm_out) + if self.reduce_row_parallel_results and mlp_bias is not None: + mlp_output += mlp_bias + + if not self.reduce_row_parallel_results: + # When MLP and Attention layers are parallel, we can use + # only one all-reduce operator to reduce the results from + # both MLP and Attention layers. 
+ mlp_output += attention_output + mlp_output = tensor_model_parallel_all_reduce(mlp_output) + if attention_bias is not None: + mlp_output += attention_bias + if mlp_bias is not None: + mlp_output += mlp_bias + + output = mlp_output + residual + return output + + +class FalconModel(nn.Module): + + def __init__( + self, + config: FalconConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.use_alibi = config.alibi + + # Embedding + LN Embedding + self.word_embeddings = VocabParallelEmbedding( + config.vocab_size, + self.embed_dim, + ) + + # Transformer blocks + self.h = nn.ModuleList([ + FalconDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + + # Final Layer Norm + self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + + def forward( + self, + input_ids: torch.LongTensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.word_embeddings(input_ids) + for i in range(len(self.h)): + layer = self.h[i] + hidden_states = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + ) + hidden_states = self.ln_f(hidden_states) + return hidden_states + + +class FalconForCausalLM(nn.Module): + + def __init__( + self, + config: FalconConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.linear_method = linear_method + self.transformer = FalconModel(config, linear_method) + self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + ) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.LongTensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.transformer( + input_ids, + positions, + kv_caches, + input_metadata, + ) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + total_num_heads = self.config.num_attention_heads + if self.config.new_decoder_architecture: + total_num_kv_heads = self.config.num_kv_heads + elif self.config.multi_query: + total_num_kv_heads = 1 + else: + total_num_kv_heads = total_num_heads + num_query_heads_per_kv_head = total_num_heads // total_num_kv_heads + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + if "query_key_value" in name: + output_dim = getattr(param, "output_dim", None) + loaded_weight_shape = loaded_weight.shape + if output_dim is not None: + loaded_weight = loaded_weight.view( + loaded_weight_shape[:output_dim] + + (total_num_kv_heads, num_query_heads_per_kv_head + 2, + -1) + loaded_weight_shape[output_dim + 1:]) + wq = loaded_weight.narrow( + output_dim + 1, 0, + num_query_heads_per_kv_head).reshape( + *loaded_weight_shape[:output_dim], -1, + *loaded_weight_shape[output_dim + 1:]) + wk = loaded_weight.narrow( + output_dim + 1, num_query_heads_per_kv_head, + 1).reshape(*loaded_weight_shape[:output_dim], -1, + *loaded_weight_shape[output_dim + 1:]) + wv = loaded_weight.narrow( + output_dim + 1, num_query_heads_per_kv_head + 1, + 1).reshape(*loaded_weight_shape[:output_dim], -1, + *loaded_weight_shape[output_dim + 1:]) + loaded_weight = torch.cat([wq, wk, wv], dim=output_dim) + + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py new file mode 100644 index 0000000..0394813 --- /dev/null +++ b/vllm/model_executor/models/gemma.py @@ -0,0 +1,346 @@ +# coding=utf-8 +# Copyright 2023 The vLLM team. +# Copyright (c) Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Gemma model compatible with HuggingFace weights.""" +from typing import List, Optional, Tuple + +import torch +from torch import nn +from transformers import GemmaConfig + +from vllm.config import LoRAConfig +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import GeluAndMul +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class GemmaMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) + self.act_fn = GeluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class GemmaAttention(nn.Module): + + def __init__(self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + head_dim: int, + max_position_embeddings: int = 8192, + rope_theta: float = 10000, + linear_method: Optional[LinearMethodBase] = None) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = head_dim + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + linear_method=linear_method, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=self.rope_theta, + is_neox_style=True, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class GemmaDecoderLayer(nn.Module): + + def __init__( + self, + config: GemmaConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = GemmaAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + head_dim=config.head_dim, + max_position_embeddings=config.max_position_embeddings, + rope_theta=config.rope_theta, + linear_method=linear_method, + ) + self.mlp = GemmaMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + linear_method=linear_method, + ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class GemmaModel(nn.Module): + + def __init__( + self, + config: GemmaConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + GemmaDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, 
+ ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + # Normalize the embedding by sqrt(hidden_size) + hidden_states *= self.config.hidden_size**0.5 + + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + residual, + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class GemmaForCausalLM(nn.Module): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + ] + # Gemma does not apply LoRA to the embedding layer. + embedding_modules = {} + embedding_padding_modules = [] + + def __init__( + self, + config: GemmaConfig, + linear_method: Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + del lora_config # Unused. + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = GemmaModel(config, linear_method) + self.sampler = Sampler(config.vocab_size) + + @torch.no_grad() + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.model.embed_tokens.weight, + hidden_states, sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + loaded_params = set() + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + for (param_name, shard_name, shard_id) in stacked_params_mapping: + if shard_name not in name: + continue + name = name.replace(shard_name, param_name) + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # GemmaRMSNorm is different from Llama's in that it multiplies + # (1 + weight) to the output, instead of just weight. + if "norm.weight" in name: + loaded_weight += 1.0 + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + unloaded_params = params_dict.keys() - loaded_params + if unloaded_params: + raise RuntimeError( + "Some weights are not initialized from checkpoints: " + f"{unloaded_params}") diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py new file mode 100644 index 0000000..661da0f --- /dev/null +++ b/vllm/model_executor/models/gpt2.py @@ -0,0 +1,273 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py +# Copyright 2023 The vLLM team. +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. 
team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only GPT-2 model compatible with HuggingFace weights.""" +from typing import List, Optional, Tuple + +import torch +from torch import nn +from transformers import GPT2Config + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + LinearMethodBase, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class GPT2Attention(nn.Module): + + def __init__( + self, + config: GPT2Config, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.hidden_size = config.hidden_size + total_num_heads = config.num_attention_heads + tensor_model_parallel_world_size = ( + get_tensor_model_parallel_world_size()) + assert total_num_heads % tensor_model_parallel_world_size == 0 + self.num_heads = total_num_heads // tensor_model_parallel_world_size + self.head_dim = self.hidden_size // total_num_heads + self.scale = self.head_dim**-0.5 + + self.c_attn = QKVParallelLinear( + self.hidden_size, + self.head_dim, + total_num_heads, + bias=True, + linear_method=linear_method, + ) + self.c_proj = RowParallelLinear( + self.hidden_size, + self.hidden_size, + bias=True, + linear_method=linear_method, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + scale=self.scale) + + def forward( + self, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.c_attn(hidden_states) + q, k, v = qkv.chunk(chunks=3, dim=-1) + key_cache, value_cache = kv_cache + attn_output = self.attn(q, k, v, key_cache, value_cache, + input_metadata) + attn_output, _ = self.c_proj(attn_output) + return attn_output + + +class GPT2MLP(nn.Module): + + def __init__( + self, + intermediate_size: int, + config: GPT2Config, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + hidden_size = config.hidden_size + self.c_fc = ColumnParallelLinear( + hidden_size, + intermediate_size, + bias=True, + linear_method=linear_method, + ) + self.c_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=True, + linear_method=linear_method, + ) + quant_config = getattr(linear_method, "quant_config", None) + self.act = get_act_fn(config.activation_function, quant_config, + intermediate_size) + + def 
forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states, _ = self.c_proj(hidden_states) + return hidden_states + + +class GPT2Block(nn.Module): + + def __init__( + self, + config: GPT2Config, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + hidden_size = config.hidden_size + inner_dim = (config.n_inner if config.n_inner is not None else 4 * + hidden_size) + + self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + self.attn = GPT2Attention(config, linear_method) + self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + self.mlp = GPT2MLP(inner_dim, config, linear_method) + + def forward( + self, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + attn_output = self.attn( + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + # residual connection + hidden_states = attn_output + residual + + residual = hidden_states + hidden_states = self.ln_2(hidden_states) + feed_forward_hidden_states = self.mlp(hidden_states) + # residual connection + hidden_states = residual + feed_forward_hidden_states + return hidden_states + + +class GPT2Model(nn.Module): + + def __init__( + self, + config: GPT2Config, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + assert not config.add_cross_attention + assert not config.scale_attn_by_inverse_layer_idx + assert not config.reorder_and_upcast_attn + self.embed_dim = config.hidden_size + self.wte = VocabParallelEmbedding(config.vocab_size, self.embed_dim) + self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) + self.h = nn.ModuleList([ + GPT2Block(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + inputs_embeds = self.wte(input_ids) + position_embeds = self.wpe(position_ids) + hidden_states = inputs_embeds + position_embeds + + for i in range(len(self.h)): + layer = self.h[i] + hidden_states = layer(hidden_states, kv_caches[i], input_metadata) + + hidden_states = self.ln_f(hidden_states) + return hidden_states + + +class GPT2LMHeadModel(nn.Module): + + def __init__( + self, + config: GPT2Config, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.linear_method = linear_method + self.transformer = GPT2Model(config, linear_method) + self.lm_head_weight = self.transformer.wte.weight + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.transformer(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head_weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = 
None): + params_dict = dict(self.named_parameters(remove_duplicate=False)) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "lm_head.weight" in name: + # GPT-2 ties the weights of the embedding layer and the final + # linear layer. + continue + if ".attn.bias" in name or ".attn.masked_bias" in name: + # Skip attention mask. + # NOTE: "c_attn.bias" should not be skipped. + continue + if not name.startswith("transformer."): + name = "transformer." + name + param = params_dict[name] + # The HF's GPT-2 implementation uses Conv1D instead of Linear. + # Because of this, we need to transpose the weights. + # Note(zhuohan): the logic below might break quantized models. + for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]: + if conv1d_weight_name not in name: + continue + if not name.endswith(".weight"): + continue + loaded_weight = loaded_weight.t() + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py new file mode 100644 index 0000000..ef4c1d4 --- /dev/null +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -0,0 +1,279 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py +# Copyright 2023 The vLLM team. +# Copyright 2023 CTranslate2, and Michael Feil +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only GPTBigCode model compatible with HuggingFace weights.""" +from typing import List, Optional, Tuple + +import torch +from torch import nn +from transformers import GPTBigCodeConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + LinearMethodBase, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class GPTBigCodeAttention(nn.Module): + + def __init__( + self, + config: GPTBigCodeConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.hidden_size = config.hidden_size + total_num_heads = config.num_attention_heads + self.tensor_model_parallel_world_size = ( + get_tensor_model_parallel_world_size()) + assert total_num_heads % self.tensor_model_parallel_world_size == 0 + self.num_heads = (total_num_heads // + self.tensor_model_parallel_world_size) + self.head_dim = self.hidden_size // total_num_heads + self.scale = self.head_dim**-0.5 + + self.multi_query = config.multi_query + if self.multi_query: + total_num_kv_heads = 1 + self.num_kv_heads = 1 + else: + total_num_kv_heads = total_num_heads + self.num_kv_heads = self.num_heads + self.kv_dim = self.head_dim * self.num_kv_heads + self.c_attn = QKVParallelLinear( + self.hidden_size, + self.head_dim, + total_num_heads, + total_num_kv_heads, + bias=True, + linear_method=linear_method, + ) + + self.c_proj = RowParallelLinear( + self.hidden_size, + self.hidden_size, + bias=True, + linear_method=linear_method, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + scale=self.scale, + num_kv_heads=self.num_kv_heads) + + def forward( + self, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.c_attn(hidden_states) + q, k, v = qkv.split( + [ + self.hidden_size // self.tensor_model_parallel_world_size, + self.kv_dim, self.kv_dim + ], + dim=-1, + ) + key_cache, value_cache = kv_cache + attn_output = self.attn(q, k, v, key_cache, value_cache, + input_metadata) + attn_output, _ = self.c_proj(attn_output) + return attn_output + + +class GPTBigMLP(nn.Module): + + def __init__( + self, + intermediate_size: int, + config: GPTBigCodeConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + hidden_size = config.hidden_size + self.c_fc = ColumnParallelLinear( + hidden_size, + intermediate_size, + bias=True, + linear_method=linear_method, + ) + self.c_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=True, + linear_method=linear_method, + ) + quant_config = getattr(linear_method, "quant_config", None) + self.act = get_act_fn(config.activation_function, quant_config, + intermediate_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states, _ = 
self.c_proj(hidden_states) + return hidden_states + + +class GPTBigCodeBlock(nn.Module): + + def __init__( + self, + config: GPTBigCodeConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + hidden_size = config.hidden_size + inner_dim = (config.n_inner if config.n_inner is not None else 4 * + hidden_size) + + self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + self.attn = GPTBigCodeAttention(config, linear_method) + self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + self.mlp = GPTBigMLP(inner_dim, config, linear_method) + + def forward( + self, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + attn_output = self.attn( + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + # residual connection + hidden_states = attn_output + residual + + residual = hidden_states + hidden_states = self.ln_2(hidden_states) + feed_forward_hidden_states = self.mlp(hidden_states) + # residual connection + hidden_states = residual + feed_forward_hidden_states + return hidden_states + + +class GPTBigCodeModel(nn.Module): + + def __init__( + self, + config: GPTBigCodeConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + assert not config.add_cross_attention + + self.embed_dim = config.hidden_size + + self.wte = VocabParallelEmbedding(config.vocab_size, self.embed_dim) + self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) + self.h = nn.ModuleList([ + GPTBigCodeBlock(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + inputs_embeds = self.wte(input_ids) + position_embeds = self.wpe(position_ids) + hidden_states = inputs_embeds + position_embeds + + for i in range(len(self.h)): + layer = self.h[i] + hidden_states = layer(hidden_states, kv_caches[i], input_metadata) + + hidden_states = self.ln_f(hidden_states) + return hidden_states + + +class GPTBigCodeForCausalLM(nn.Module): + + def __init__( + self, + config: GPTBigCodeConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.linear_method = linear_method + self.transformer = GPTBigCodeModel(config, linear_method) + self.lm_head_weight = self.transformer.wte.weight + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.transformer(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head_weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + params_dict = dict(self.named_parameters(remove_duplicate=False)) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if 
"lm_head.weight" in name: + continue + if ".attn.bias" in name: + # Skip attention mask. + # NOTE: "c_attn.bias" should not be skipped. + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py new file mode 100644 index 0000000..5bab30d --- /dev/null +++ b/vllm/model_executor/models/gpt_j.py @@ -0,0 +1,284 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gptj/modeling_gptj.py +# Copyright 2023 The vLLM team. +# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only GPT-J model compatible with HuggingFace weights.""" +from typing import List, Optional, Tuple + +import torch +from torch import nn +from transformers import GPTJConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + LinearMethodBase, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class GPTJAttention(nn.Module): + + def __init__( + self, + config: GPTJConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.total_num_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.head_size = self.hidden_size // self.total_num_heads + + self.qkv_proj = QKVParallelLinear( + config.hidden_size, + self.head_size, + self.total_num_heads, + bias=False, + linear_method=linear_method, + ) + self.out_proj = RowParallelLinear( + config.hidden_size, + config.hidden_size, + bias=False, + linear_method=linear_method, + ) + + tp_world_size = get_tensor_model_parallel_world_size() + assert self.total_num_heads % tp_world_size == 0 + self.num_heads = self.total_num_heads // tp_world_size + + scaling = self.head_size**-0.5 + assert getattr(config, "rotary", True) + assert config.rotary_dim % 2 == 0 + rope_theta = getattr(config, "rope_theta", 10000) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.rotary_emb = get_rope( + self.head_size, + rotary_dim=config.rotary_dim, + max_position=max_position_embeddings, + base=rope_theta, + 
is_neox_style=False, + ) + self.attn = PagedAttention(self.num_heads, self.head_size, scaling) + + def forward( + self, + position_ids: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.chunk(chunks=3, dim=-1) + q, k = self.rotary_emb(position_ids, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + attn_output, _ = self.out_proj(attn_output) + return attn_output + + +class GPTJMLP(nn.Module): + + def __init__( + self, + intermediate_size: int, + config: GPTJConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + hidden_size = config.n_embd + self.fc_in = ColumnParallelLinear( + hidden_size, + intermediate_size, + linear_method=linear_method, + ) + self.fc_out = RowParallelLinear( + intermediate_size, + hidden_size, + linear_method=linear_method, + ) + quant_config = getattr(linear_method, "quant_config", None) + self.act = get_act_fn(config.activation_function, quant_config, + intermediate_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.fc_in(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states, _ = self.fc_out(hidden_states) + return hidden_states + + +class GPTJBlock(nn.Module): + + def __init__( + self, + config: GPTJConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + inner_dim = 4 * config.n_embd if config.n_inner is None else config.n_inner + self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) + self.attn = GPTJAttention(config, linear_method) + self.mlp = GPTJMLP(inner_dim, config, linear_method) + + def forward( + self, + position_ids: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + attn_output = self.attn( + position_ids=position_ids, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + mlp_output = self.mlp(hidden_states) + hidden_states = attn_output + mlp_output + residual + return hidden_states + + +class GPTJModel(nn.Module): + + def __init__( + self, + config: GPTJConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.embed_dim = config.n_embd + self.wte = VocabParallelEmbedding( + config.vocab_size, + self.embed_dim, + ) + self.h = nn.ModuleList( + [GPTJBlock(config, linear_method) for _ in range(config.n_layer)]) + self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.wte(input_ids) + for i in range(len(self.h)): + layer = self.h[i] + hidden_states = layer( + position_ids, + hidden_states, + kv_caches[i], + input_metadata, + ) + hidden_states = self.ln_f(hidden_states) + return hidden_states + + +class GPTJForCausalLM(nn.Module): + + def __init__( + self, + config: GPTJConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.linear_method = linear_method + assert not config.tie_word_embeddings + self.transformer = GPTJModel(config, linear_method) + self.lm_head = ParallelLMHead( + config.vocab_size, + config.n_embd, + bias=True, + ) + 
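+        # NOTE: unlike GPT-2, GPT-J does not tie lm_head to the input
+        # embeddings (asserted above), and the head carries a bias that
+        # sample() below passes to the sampler.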
self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.transformer(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata, self.lm_head.bias) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "attn.bias" in name or "attn.masked_bias" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py new file mode 100644 index 0000000..8f7e106 --- /dev/null +++ b/vllm/model_executor/models/gpt_neox.py @@ -0,0 +1,294 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
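+# NOTE: GPTNeoXLayer below supports the parallel-residual form
+# x = x + attn(ln1(x)) + mlp(ln2(x)) when config.use_parallel_residual is
+# set. Checkpoints also store the fused QKV output dim head-major, i.e.
+# (num_heads, 3, head_size); load_weights permutes it to the
+# (3, num_heads, head_size) layout QKVParallelLinear expects, e.g. for two
+# heads: [q0 k0 v0 q1 k1 v1] -> [q0 q1 k0 k1 v0 v1].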
+"""Inference-only GPT-NeoX model compatible with HuggingFace weights.""" +from typing import List, Optional, Tuple + +import torch +from torch import nn +from transformers import GPTNeoXConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + LinearMethodBase, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class GPTNeoXAttention(nn.Module): + + def __init__( + self, + config: GPTNeoXConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.total_num_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.head_size = self.hidden_size // self.total_num_heads + self.bias = getattr(config, "attention_bias", True) + + tensor_model_parallel_world_size = ( + get_tensor_model_parallel_world_size()) + assert self.total_num_heads % tensor_model_parallel_world_size == 0 + self.num_heads = (self.total_num_heads // + tensor_model_parallel_world_size) + + self.query_key_value = QKVParallelLinear( + config.hidden_size, + self.head_size, + self.total_num_heads, + bias=self.bias, + linear_method=linear_method, + ) + self.dense = RowParallelLinear( + config.hidden_size, + config.hidden_size, + bias=self.bias, + linear_method=linear_method, + ) + scaling = self.head_size**-0.5 + rotary_dim = int(self.head_size * config.rotary_pct) + assert rotary_dim % 2 == 0 + rope_theta = getattr(config, "rope_theta", 10000) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.rotary_emb = get_rope( + self.head_size, + rotary_dim=rotary_dim, + max_position=max_position_embeddings, + base=rope_theta, + ) + self.attn = PagedAttention(self.num_heads, self.head_size, scaling) + + def forward( + self, + position_ids: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.query_key_value(hidden_states) + q, k, v = qkv.chunk(chunks=3, dim=-1) + q, k = self.rotary_emb(position_ids, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.dense(attn_output) + return output + + +class GPTNeoXMLP(nn.Module): + + def __init__( + self, + config: GPTNeoXConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.dense_h_to_4h = ColumnParallelLinear( + config.hidden_size, + config.intermediate_size, + linear_method=linear_method, + ) + self.dense_4h_to_h = RowParallelLinear( + config.intermediate_size, + config.hidden_size, + linear_method=linear_method, + ) + quant_config = getattr(linear_method, "quant_config", None) + self.act = get_act_fn(config.hidden_act, quant_config, + config.intermediate_size) + + def forward(self, hidden_states): + hidden_states, _ = 
self.dense_h_to_4h(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states, _ = self.dense_4h_to_h(hidden_states) + return hidden_states + + +class GPTNeoXLayer(nn.Module): + + def __init__( + self, + config: GPTNeoXConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.use_parallel_residual = config.use_parallel_residual + self.input_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.attention = GPTNeoXAttention(config, linear_method) + self.mlp = GPTNeoXMLP(config, linear_method) + + def forward( + self, + position_ids: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + attn_input = self.input_layernorm(hidden_states) + attn_output = self.attention( + position_ids=position_ids, + hidden_states=attn_input, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + if self.use_parallel_residual: + # pseudocode: + # x = x + attn(ln1(x)) + mlp(ln2(x)) + mlp_input = self.post_attention_layernorm(hidden_states) + mlp_output = self.mlp(mlp_input) + hidden_states = mlp_output + attn_output + hidden_states + else: + # pseudocode: + # x = x + attn(ln1(x)) + # x = x + mlp(ln2(x)) + attn_output = attn_output + hidden_states + mlp_input = self.post_attention_layernorm(attn_output) + mlp_output = self.mlp(mlp_input) + hidden_states = mlp_output + attn_output + return hidden_states + + +class GPTNeoXModel(nn.Module): + + def __init__( + self, + config: GPTNeoXConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + + self.embed_in = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + GPTNeoXLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_in(input_ids) + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states = layer( + position_ids, + hidden_states, + kv_caches[i], + input_metadata, + ) + hidden_states = self.final_layer_norm(hidden_states) + return hidden_states + + +class GPTNeoXForCausalLM(nn.Module): + + def __init__( + self, + config, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.linear_method = linear_method + self.gpt_neox = GPTNeoXModel(config, linear_method) + self.embed_out = ParallelLMHead( + config.vocab_size, + config.hidden_size, + ) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.gpt_neox(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.embed_out.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + params_dict = 
dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if ("attention.bias" in name or "attention.masked_bias" in name + or "rotary_emb.inv_freq" in name): + continue + param = params_dict[name] + + if "query_key_value" in name: + # NOTE: GPT-NeoX's fused QKV's output_dim has the shape of + # (num_heads * 3 * head_size), while the + # required shape is (3 * num_heads * head_size). + # Thus, we need weight conversion. + output_dim = getattr(param, "output_dim", None) + num_heads = self.config.num_attention_heads + if output_dim is not None: + loaded_weight_shape = loaded_weight.shape + loaded_weight = loaded_weight.view( + loaded_weight_shape[:output_dim] + (num_heads, 3, -1) + + loaded_weight_shape[output_dim + 1:]) + loaded_weight = loaded_weight.transpose( + output_dim, output_dim + 1) + loaded_weight = loaded_weight.reshape(loaded_weight_shape) + + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py new file mode 100644 index 0000000..ebf1d8a --- /dev/null +++ b/vllm/model_executor/models/internlm2.py @@ -0,0 +1,325 @@ +# -*- coding: utf-8 -*- +from typing import Any, Dict, List, Optional, Tuple + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class InternLM2MLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.w2 = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.w2(x) + return x + + +class InternLM2Attention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.wqkv = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + linear_method=linear_method, + ) + self.wo = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.wqkv(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.wo(attn_output) + return output + + +class InternLMDecoderLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.attention = InternLM2Attention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + linear_method=linear_method, + ) + self.feed_forward = InternLM2MLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + ) + self.attention_norm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.ffn_norm = RMSNorm(config.hidden_size, 
eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.attention_norm(hidden_states) + else: + hidden_states, residual = self.attention_norm( + hidden_states, residual) + hidden_states = self.attention( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Fully Connected + hidden_states, residual = self.ffn_norm(hidden_states, residual) + hidden_states = self.feed_forward(hidden_states) + return hidden_states, residual + + +class InternLM2Model(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.tok_embeddings = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + InternLMDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.tok_embeddings(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + residual, + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class InternLM2ForCausalLM(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = InternLM2Model(config, linear_method) + self.output = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.output.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("gate_up_proj", "w1", 0), + ("gate_up_proj", "w3", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + if "wqkv" in name: + config = self.config + kv_groups = config.num_attention_heads // config.num_key_value_heads + head_dim = config.hidden_size // config.num_attention_heads + loaded_weight = loaded_weight.view(-1, 2 + kv_groups, + head_dim, + loaded_weight.shape[-1]) + wq, wk, wv = torch.split(loaded_weight, [kv_groups, 1, 1], + dim=1) + wq = wq.reshape(-1, wq.shape[-1]) + wk = wk.reshape(-1, wk.shape[-1]) + wv = wv.reshape(-1, wv.shape[-1]) + weight_loader = param.weight_loader + weight_loader(param, wq, 'q') + weight_loader(param, wk, 'k') + weight_loader(param, wv, 'v') + else: + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py new file mode 100644 index 0000000..d35887c --- /dev/null +++ b/vllm/model_executor/models/llama.py @@ -0,0 +1,391 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only LLaMA model compatible with HuggingFace weights.""" +from typing import Any, Dict, List, Optional, Tuple + +import torch +from torch import nn +from transformers import LlamaConfig + +from vllm.config import LoRAConfig +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class LlamaMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. " + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class LlamaAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + linear_method: Optional[LinearMethodBase] = None, + bias: bool = False, + sliding_window: Optional[int] = None, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=bias, + linear_method=linear_method, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=bias, + linear_method=linear_method, + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=sliding_window) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class LlamaDecoderLayer(nn.Module): + + def __init__( + self, + config: LlamaConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + sliding_window = getattr(config, "sliding_window", None) + self.self_attn = LlamaAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=getattr(config, "num_key_value_heads", + config.num_attention_heads), + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + linear_method=linear_method, + bias=getattr(config, "bias", False), + sliding_window=sliding_window, + ) + self.mlp = LlamaMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class LlamaModel(nn.Module): + + def __init__( + self, + config: LlamaConfig, + linear_method: 
Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + ) + self.layers = nn.ModuleList([ + LlamaDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + residual, + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class LlamaForCausalLM(nn.Module): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + "embed_tokens", + "lm_head", + ] + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] + + def __init__( + self, + config: LlamaConfig, + linear_method: Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = LlamaModel(config, linear_method, lora_config=lora_config) + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + ) + self.sampler = Sampler(self.unpadded_vocab_size, config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + 
continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/llama_smooth.py b/vllm/model_executor/models/llama_smooth.py new file mode 100644 index 0000000..14707c8 --- /dev/null +++ b/vllm/model_executor/models/llama_smooth.py @@ -0,0 +1,409 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
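+# NOTE: this SmoothQuant variant runs W8A8 inference: RMSNormQuant emits
+# int8 activations, the int8 GEMMs accumulate in int32, and the dequant
+# scales threaded through each layer bring results back to half precision
+# (with tensor parallelism the dequant happens before the all-reduce).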
+"""Inference-only LLaMA model compatible with HuggingFace weights.""" +from typing import Any, Dict, List, Optional, Tuple + +import torch +from torch import nn +from transformers import LlamaConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import DequantSiluAndMulQuant +from vllm.model_executor.layers.attention import DequantPagedAttention +from vllm.model_executor.layers.layernorm import (RMSNorm, + RMSNormQuant, + AddResidualRMSNormQuant, + DequantAddResidualRMSNormQuant) + +from vllm.model_executor.layers.quantization.smoothquant import SmoothLinearMethod + +from vllm.model_executor.layers.linear import (LinearMethodBase, + QuantMergedColumnParallelLinear, + QuantQKVParallelLinear, + QuantRowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_dequant_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, + ParallelLMHead) +from vllm.model_executor.layers.layernorm import DequantAddResidual, AddResidual +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class QuantLlamaMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.gate_up_proj = QuantMergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method, + skip_bias_add=True) + self.down_proj = QuantRowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method, + skip_bias_add=True) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. " + "Only silu is supported for now.") + self.act_fn = DequantSiluAndMulQuant() + + def forward(self, x): + scale = None + # int, half -> int32 + gate_up, _ = self.gate_up_proj(x) + # int32 -> int, scale + x, *scale = self.act_fn(gate_up) + scale = scale[0] if scale is not None else None + # int8, scale -> int32(when tp > 1, to half, scale for dequant before all reduce) + x, _ = self.down_proj(x, scale) + return x, scale + + +class QuantLlamaAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + linear_method: Optional[LinearMethodBase] = None, + + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QuantQKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            linear_method=linear_method,
+            skip_bias_add=True,
+        )
+        self.o_proj = QuantRowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            linear_method=linear_method,
+            skip_bias_add=True,
+        )
+
+        self.rotary_emb = get_dequant_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = DequantPagedAttention(self.num_heads,
+                                          self.head_dim,
+                                          self.scaling,
+                                          num_kv_heads=self.num_kv_heads)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        # int8 -> int32
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        # int32 -> half
+        q, k, v = self.rotary_emb(positions, q, k, v,
+                                  self.qkv_proj.q_dequant_scale.item(),
+                                  self.qkv_proj.k_dequant_scale.item(),
+                                  self.qkv_proj.v_dequant_scale.item())
+        k_cache, v_cache = kv_cache
+        # half -> int8 + scale: apply a per-channel quantization to the
+        # attention output and return the collected scale.
+        attn_output, *scale = self.attn(q, k, v, k_cache, v_cache,
+                                        input_metadata)
+        scale = scale[0] if scale else None
+        # int8, scale -> int32 (when tp > 1: to half; the scale dequantizes
+        # before the all-reduce)
+        output, _ = self.o_proj(attn_output, scale)
+        return output, scale
+
+
+class QuantLlamaDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: LlamaConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings",
+                                          8192)
+        self.self_attn = QuantLlamaAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            linear_method=linear_method,
+        )
+        self.mlp = QuantLlamaMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            linear_method=linear_method,
+        )
+        self.apply_dequant_in_post = not linear_method.apply_dequant_after_row
+        self.input_layernorm = RMSNormQuant(config.hidden_size,
+                                            eps=config.rms_norm_eps)
+        if self.apply_dequant_in_post:
+            self.post_attention_layernorm = DequantAddResidualRMSNormQuant(
+                config.hidden_size, eps=config.rms_norm_eps)
+            self.finally_add_residual = DequantAddResidual()
+        else:
+            self.post_attention_layernorm = AddResidualRMSNormQuant(
+                config.hidden_size, eps=config.rms_norm_eps)
+            self.finally_add_residual = AddResidual()
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        # half
+        residual = hidden_states
+        # half -> int8
+        hidden_states = self.input_layernorm(hidden_states)
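+        # From here on the layer stays in the quantized domain: int8
+        # activations feed each GEMM, int32 accumulators come out, and the
+        # fused norm/residual ops re-quantize between attention and MLP.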
+        # int8 -> int32 + scale (when tp > 1: to half; this scale is unused)
+        hidden_states, scale = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+        )
+
+        # tp = 1: int32, half, scale -> int8, half (scale used for dequant)
+        # tp > 1: half, half, scale -> int8, half
+        hidden_states, residual = self.post_attention_layernorm(
+            hidden_states, residual, scale)
+        # int8 -> int32 + scale (when tp > 1: to half; this scale is unused)
+        hidden_states, scale = self.mlp(hidden_states)
+        # int32, half, scale -> half (when tp > 1: half, half, scale -> half)
+        hidden_states = self.finally_add_residual(hidden_states, residual,
+                                                  scale)
+        return hidden_states
+
+
+class QuantLlamaModel(nn.Module):
+
+    def __init__(
+        self,
+        config: LlamaConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.layers = nn.ModuleList([
+            QuantLlamaDecoderLayer(config, linear_method)
+            for _ in range(config.num_hidden_layers)
+        ])
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        # half
+        hidden_states = self.embed_tokens(input_ids)
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states = layer(
+                positions,
+                hidden_states,
+                kv_caches[i],
+                input_metadata,
+            )
+        # half -> half (the final norm is a plain RMSNorm on dequantized
+        # activations)
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class LlamaForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: LlamaConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.linear_method = linear_method
+        self.model = QuantLlamaModel(config, linear_method)
+        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, kv_caches,
+                                   input_metadata)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> SamplerOutput:
+        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
+                                   sampling_metadata)
+        return next_tokens
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     load_format: str = "auto",
+                     revision: Optional[str] = None):
+        stacked_params_mapping = [
+            # process special params first
+            ("qkv_proj.q_dequant_scale", "q_proj.dequant_scale", "-1"),
+            ("qkv_proj.k_dequant_scale", "k_proj.dequant_scale", "-1"),
+            ("qkv_proj.v_dequant_scale", "v_proj.dequant_scale", "-1"),
+            ("act_fn.gate_dequant_scale", "gate_proj.dequant_scale", "-1"),
+            ("act_fn.up_dequant_scale", "up_proj.dequant_scale", "-1"),
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        special_params_mapping = [
+            ("post_attention_layernorm.dequant_scale",
+             "self_attn.o_proj.dequant_scale"),
("finally_add_residual.dequant_scale","mlp.down_proj.dequant_scale") + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + if 'bias' in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader is default_weight_loader: + weight_loader(param, loaded_weight) + else: + weight_loader(param, loaded_weight,shard_id) + break + else: + for (param_name, weight_name) in special_params_mapping: + if weight_name not in name: + continue + # used in o_prof and down_proj when world_size > 1 + if get_tensor_model_parallel_world_size() > 1: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader is default_weight_loader: + weight_loader(param, loaded_weight) + else: + weight_loader(param, loaded_weight,shard_id) + else: + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader is default_weight_loader: + weight_loader(param, loaded_weight) + else: + weight_loader(param, loaded_weight,shard_id) + break + else: + if 'bias' not in name: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py new file mode 100644 index 0000000..0100624 --- /dev/null +++ b/vllm/model_executor/models/mixtral.py @@ -0,0 +1,454 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Mixtral model.""" +from typing import List, Optional, Tuple + +import torch +from torch import nn +from transformers import MixtralConfig + +from vllm.config import LoRAConfig +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearMethodBase, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) +from vllm.model_executor.parallel_utils.communication_op import ( + tensor_model_parallel_all_reduce) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class MixtralMoE(nn.Module): + """A tensor-parallel MoE implementation for Mixtral that shards each expert + across all ranks. + + Each expert's weights are sharded across all ranks and a fused MoE + kernel is used for the forward pass, and finally we reduce the outputs + across ranks. + """ + + def __init__( + self, + num_experts: int, + top_k: int, + hidden_size: int, + intermediate_size: int, + params_dtype: Optional[torch.dtype] = None, + tp_size: Optional[int] = None, + ): + super().__init__() + self.tp_size = tp_size or get_tensor_model_parallel_world_size() + self.num_total_experts = num_experts + self.top_k = top_k + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size // self.tp_size + + if params_dtype is None: + params_dtype = torch.get_default_dtype() + self.params_dtype = params_dtype + + self.gate = ReplicatedLinear(self.hidden_size, + self.num_total_experts, + bias=False, + params_dtype=self.params_dtype, + linear_method=None) + + self.ws = nn.Parameter( + torch.empty(self.num_total_experts, + 2 * self.intermediate_size, + self.hidden_size, + device="cuda", + dtype=self.params_dtype)) + self.w2s = nn.Parameter( + torch.empty(self.num_total_experts, + self.hidden_size, + self.intermediate_size, + device="cuda", + dtype=self.params_dtype)) + + set_weight_attrs(self.ws, { + "weight_loader": self.weight_loader, + }) + set_weight_attrs(self.w2s, { + "weight_loader": self.weight_loader, + }) + + def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, + weight_name: str, expert_id: int): + tp_rank = get_tensor_model_parallel_rank() + param_data = param.data + shard_size = self.intermediate_size + shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size) + if weight_name.endswith("w1.weight"): + param_data[expert_id, 0:shard_size, :] = loaded_weight[shard, :] + if weight_name.endswith("w3.weight"): + param_data[expert_id, + shard_size:2 * shard_size, :] = loaded_weight[shard, :] + if weight_name.endswith("w2.weight"): + param_data[expert_id, :, :] = loaded_weight[:, shard] + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, sequence_length, 
hidden_size = hidden_states.shape + hidden_states = hidden_states.view(-1, self.hidden_size) + # router_logits: (batch * sequence_length, n_experts) + router_logits, _ = self.gate(hidden_states) + final_hidden_states = fused_moe(hidden_states, + self.ws, + self.w2s, + router_logits, + self.top_k, + renormalize=True, + inplace=True) + + if self.tp_size > 1: + final_hidden_states = tensor_model_parallel_all_reduce( + final_hidden_states) + + return final_hidden_states.view(batch_size, sequence_length, + hidden_size) + + +class MixtralAttention(nn.Module): + + def __init__(self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + max_position: int = 4096 * 32, + rope_theta: float = 10000, + linear_method: Optional[LinearMethodBase] = None, + sliding_window: Optional[int] = None) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.sliding_window = sliding_window + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + linear_method=linear_method, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position, + base=int(self.rope_theta), + is_neox_style=True, + ) + self.attn = PagedAttention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=self.sliding_window, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class MixtralDecoderLayer(nn.Module): + + def __init__( + self, + config: MixtralConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + # Requires transformers > 4.32.0 + rope_theta = getattr(config, "rope_theta", 10000) + self.self_attn = MixtralAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + max_position=config.max_position_embeddings, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + sliding_window=config.sliding_window, + linear_method=linear_method) + self.block_sparse_moe = MixtralMoE( + 
num_experts=config.num_local_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> torch.Tensor: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.block_sparse_moe(hidden_states) + return hidden_states, residual + + +class MixtralModel(nn.Module): + + def __init__( + self, + config: MixtralConfig, + linear_method: Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + super().__init__() + self.padding_idx = config.pad_token_id + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + ) + self.layers = nn.ModuleList([ + MixtralDecoderLayer(config, linear_method=linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer(positions, hidden_states, + kv_caches[i], input_metadata, + residual) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class MixtralForCausalLM(nn.Module): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "embed_tokens", + "lm_head", + ] + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] + + def __init__( + self, + config: MixtralConfig, + linear_method: Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = MixtralModel(config, + linear_method, + lora_config=lora_config) + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + ) + self.sampler = Sampler(self.unpadded_vocab_size, config.vocab_size) + + 
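+    # NOTE: for illustration, with num_local_experts = 8 the
+    # expert_params_mapping built in load_weights below expands to entries
+    # such as
+    #     ("ws",  "experts.0.w1.weight", 0)
+    #     ("w2s", "experts.0.w2.weight", 0)
+    #     ("ws",  "experts.0.w3.weight", 0)
+    #     ...
+    # i.e. each expert's w1/w3 shards load into the stacked ``ws`` tensor and
+    # its w2 shard into ``w2s`` (see MixtralMoE.weight_loader above).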
def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: Optional[torch.Tensor], + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + + expert_params_mapping = [ + # (param_name, weight_name, expert_id) + ("ws" if weight_name in ["w1", "w3"] else "w2s", + f"experts.{expert_id}.{weight_name}.weight", expert_id) + for expert_id in range(self.config.num_local_experts) + for weight_name in ["w1", "w2", "w3"] + ] + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, + cache_dir, + load_format, + revision, + fall_back_to_pt=False): + if "rotary_emb.inv_freq" in name: + continue + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for param_name, weight_name, expert_id in expert_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, + loaded_weight, + weight_name, + expert_id=expert_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py new file mode 100644 index 0000000..a8dadce --- /dev/null +++ b/vllm/model_executor/models/mixtral_quant.py @@ -0,0 +1,412 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only Mixtral model.""" +from typing import List, Optional, Tuple + +import numpy as np + +import torch +import torch.nn.functional as F + +from torch import nn +from transformers import MixtralConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearMethodBase, + ReplicatedLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.communication_op import ( + tensor_model_parallel_all_reduce) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class MixtralMLP(nn.Module): + + def __init__( + self, + num_experts: int, + hidden_size: int, + intermediate_size: int, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.num_experts = num_experts + self.ffn_dim = intermediate_size + self.hidden_dim = hidden_size + + self.w1 = ReplicatedLinear(self.hidden_dim, + self.ffn_dim, + bias=False, + linear_method=linear_method) + self.w2 = ReplicatedLinear(self.ffn_dim, + self.hidden_dim, + bias=False, + linear_method=linear_method) + self.w3 = ReplicatedLinear(self.hidden_dim, + self.ffn_dim, + bias=False, + linear_method=linear_method) + + # TODO: Use vllm's SiluAndMul + self.act_fn = nn.SiLU() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + w1_out, _ = self.w1(hidden_states) + w1_out = self.act_fn(w1_out) + w3_out, _ = self.w3(hidden_states) + current_hidden_states = w1_out * w3_out + current_hidden_states, _ = self.w2(current_hidden_states) + return current_hidden_states + + +class MixtralMoE(nn.Module): + + def __init__( + self, + config: MixtralConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + self.num_total_experts = config.num_local_experts + self.top_k = config.num_experts_per_tok + if self.tp_size > self.num_total_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of experts {self.num_total_experts}.") + # Split experts equally between ranks + self.expert_indicies = np.array_split(range( + self.num_total_experts), self.tp_size)[self.rank].tolist() + if not self.expert_indicies: + raise ValueError( + f"Rank {self.rank} has no experts assigned to it.") + + self.experts = nn.ModuleList([ + MixtralMLP(self.num_total_experts, + config.hidden_size, + config.intermediate_size, + linear_method=linear_method) + if idx in self.expert_indicies else None + for idx in range(self.num_total_experts) + ]) + self.gate = ReplicatedLinear(config.hidden_size, + self.num_total_experts, + bias=False, + linear_method=None) + + def forward(self, 
hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, sequence_length, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + # router_logits: (batch * sequence_length, n_experts) + router_logits, _ = self.gate(hidden_states) + + routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, + self.top_k, + dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + + final_hidden_states = None + for expert_idx in self.expert_indicies: + expert_layer = self.experts[expert_idx] + expert_mask = (selected_experts == expert_idx) + expert_weights = (routing_weights * expert_mask).sum(dim=-1, + keepdim=True) + + current_hidden_states = expert_layer(hidden_states).mul_( + expert_weights) + if final_hidden_states is None: + final_hidden_states = current_hidden_states + else: + final_hidden_states.add_(current_hidden_states) + + return tensor_model_parallel_all_reduce(final_hidden_states).view( + batch_size, sequence_length, hidden_dim) + + +class MixtralAttention(nn.Module): + + def __init__(self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + max_position: int = 4096 * 32, + rope_theta: float = 10000, + linear_method: Optional[LinearMethodBase] = None, + sliding_window: Optional[int] = None) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
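+            # For example, with total_num_kv_heads = 2 and tp_size = 4, each
+            # KV head is replicated on tp_size // total_num_kv_heads = 2
+            # ranks, and num_kv_heads below evaluates to max(1, 2 // 4) = 1
+            # KV head per rank.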
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.sliding_window = sliding_window + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + linear_method=linear_method, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position, + base=int(self.rope_theta), + is_neox_style=True, + ) + self.attn = PagedAttention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=self.sliding_window, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class MixtralDecoderLayer(nn.Module): + + def __init__( + self, + config: MixtralConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + # Requires transformers > 4.32.0 + rope_theta = getattr(config, "rope_theta", 10000) + self.self_attn = MixtralAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + max_position=config.max_position_embeddings, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + sliding_window=config.sliding_window, + linear_method=linear_method) + self.block_sparse_moe = MixtralMoE(config=config, + linear_method=linear_method) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> torch.Tensor: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.block_sparse_moe(hidden_states) + return hidden_states, residual + + +class MixtralModel(nn.Module): + + def __init__( + self, + config: MixtralConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + MixtralDecoderLayer(config, linear_method=linear_method) + for _ in range(config.num_hidden_layers) + ]) + 
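+        # NOTE: vLLM's RMSNorm also supports a fused form, norm(x, residual),
+        # which returns (norm(x + residual), x + residual); forward() below
+        # relies on it to thread the skip connection through the layers.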
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer(positions, hidden_states, + kv_caches[i], input_metadata, + residual) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class MixtralForCausalLM(nn.Module): + + def __init__( + self, + config: MixtralConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = MixtralModel(config, linear_method) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: Optional[torch.Tensor], + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, + cache_dir, + load_format, + revision, + fall_back_to_pt=False): + if "rotary_emb.inv_freq" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip experts that are not assigned to this worker. + if ("block_sparse_moe.experts." 
in name + and name not in params_dict): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py new file mode 100644 index 0000000..22a876e --- /dev/null +++ b/vllm/model_executor/models/mpt.py @@ -0,0 +1,298 @@ +# coding=utf-8 +# Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main +import math +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + LinearMethodBase, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput +from vllm.transformers_utils.configs.mpt import MPTConfig + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +def _get_alibi_slopes( + total_num_heads: int, + alibi_bias_max: int, +) -> torch.Tensor: + next_power_of_2 = 2**math.ceil(math.log2(total_num_heads)) + m = torch.arange(1, next_power_of_2 + 1, dtype=torch.float32) + m = m.mul(alibi_bias_max / next_power_of_2) + slopes = 1.0 / torch.pow(2, m) + if next_power_of_2 != total_num_heads: + slopes = torch.concat([slopes[1::2], slopes[::2]])[:total_num_heads] + return slopes + + +class MPTAttention(nn.Module): + + def __init__( + self, + config: MPTConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.d_model = config.d_model + self.total_num_heads = config.n_heads + self.head_dim = self.d_model // self.total_num_heads + self.clip_qkv = config.attn_config["clip_qkv"] + self.qk_ln = config.attn_config["qk_ln"] + self.alibi_bias_max = config.attn_config["alibi_bias_max"] + if "kv_n_heads" in config.attn_config: + self.total_num_kv_heads = config.attn_config['kv_n_heads'] + else: + self.total_num_kv_heads = self.total_num_heads + assert not config.attn_config["prefix_lm"] + assert config.attn_config["alibi"] + + # pylint: disable=invalid-name + self.Wqkv = QKVParallelLinear( + self.d_model, + self.d_model // self.total_num_heads, + self.total_num_heads, + self.total_num_kv_heads, + bias=not config.no_bias, + linear_method=linear_method, + ) + if self.qk_ln: + self.q_ln = nn.LayerNorm(self.d_model) + self.k_ln = nn.LayerNorm(self.d_model) + self.out_proj = RowParallelLinear( + self.d_model, + self.d_model, + bias=not config.no_bias, + linear_method=linear_method, + ) + + tp_world_size = get_tensor_model_parallel_world_size() + assert self.total_num_heads % tp_world_size == 0 + self.num_heads = self.total_num_heads // tp_world_size + + if self.total_num_kv_heads >= tp_world_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. 
+ assert self.total_num_kv_heads % tp_world_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_world_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_world_size) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + # Create the alibi slopes and slice them. + tp_rank = get_tensor_model_parallel_rank() + head_start = tp_rank * self.num_heads + head_end = (tp_rank + 1) * self.num_heads + alibi_slopes = _get_alibi_slopes(self.total_num_heads, + self.alibi_bias_max) + alibi_slopes = alibi_slopes[head_start:head_end].tolist() + + self.head_dim = self.d_model // self.total_num_heads + scaling = self.head_dim**-0.5 + self.attn = PagedAttention(self.num_heads, + self.head_dim, + scaling, + alibi_slopes=alibi_slopes, + num_kv_heads=self.num_kv_heads) + + def forward( + self, + position_ids: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + del position_ids # unused. + qkv, _ = self.Wqkv(hidden_states) + if self.clip_qkv is not None: + qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + if self.qk_ln: + q = self.q_ln(q) + k = self.k_ln(k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.out_proj(attn_output) + return output + + +class MPTMLP(nn.Module): + + def __init__( + self, + config: MPTConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + hidden_size = config.d_model + expansion_ratio = config.expansion_ratio + intermediate_size = expansion_ratio * hidden_size + self.up_proj = ColumnParallelLinear( + hidden_size, + intermediate_size, + bias=not config.no_bias, + linear_method=linear_method, + ) + quant_config = getattr(linear_method, "quant_config", None) + self.act = get_act_fn("gelu", quant_config, intermediate_size) + self.down_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=not config.no_bias, + linear_method=linear_method, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x, _ = self.up_proj(x) + x = self.act(x) + x, _ = self.down_proj(x) + return x + + +class MPTBlock(nn.Module): + + def __init__( + self, + config: MPTConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + hidden_size = config.d_model + self.norm_1 = nn.LayerNorm(hidden_size) + self.attn = MPTAttention(config, linear_method) + self.norm_2 = nn.LayerNorm(hidden_size) + self.ffn = MPTMLP(config, linear_method) + + def forward( + self, + position_ids: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + x = self.norm_1(hidden_states) + x = self.attn( + position_ids=position_ids, + hidden_states=x, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + hidden_states = hidden_states + x + x = self.norm_2(hidden_states) + x = self.ffn(x) + hidden_states = hidden_states + x + return hidden_states + + +class MPTModel(nn.Module): + + def __init__( + self, + config: MPTConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + assert config.embedding_fraction == 1.0 + assert config.norm_type == "low_precision_layernorm" + + self.wte = VocabParallelEmbedding( + config.vocab_size, + config.d_model, + ) + self.blocks = nn.ModuleList( + 
[MPTBlock(config, linear_method) for _ in range(config.n_layers)]) + self.norm_f = nn.LayerNorm(config.d_model) + if config.no_bias: + for module in self.modules(): + if hasattr(module, "bias") and isinstance( + module.bias, nn.Parameter): + # Remove the bias term in Linear and LayerNorm. + module.register_parameter("bias", None) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.wte(input_ids) + for i in range(len(self.blocks)): + block = self.blocks[i] + hidden_states = block( + position_ids, + hidden_states, + kv_caches[i], + input_metadata, + ) + hidden_states = self.norm_f(hidden_states) + return hidden_states + + +class MPTForCausalLM(nn.Module): + + def __init__( + self, + config: MPTConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + assert config.tie_word_embeddings + self.linear_method = linear_method + + self.transformer = MPTModel(config, linear_method) + self.lm_head_weight = self.transformer.wte.weight + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.transformer(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head_weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + params_dict = dict(self.named_parameters(remove_duplicate=False)) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py new file mode 100644 index 0000000..9d56303 --- /dev/null +++ b/vllm/model_executor/models/olmo.py @@ -0,0 +1,380 @@ +# coding=utf-8 +# Adapted from +# https://github.com/allenai/OLMo/blob/v0.2.4/olmo/model.py and +# https://github.com/allenai/OLMo/blob/v0.2.4/hf_olmo/modeling_olmo.py +# Copyright 2023 The vLLM team. +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. +# +# BSD 3-Clause License +# +# Copyright (c) 2022, Tri Dao, trid@cs.stanford.edu. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""Inference-only OLMo model compatible with HuggingFace weights."""
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.layers.attention import PagedAttention
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    LinearMethodBase,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding)
+from vllm.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_world_size, )
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.model_executor.weight_utils import (
+    default_weight_loader,
+    hf_model_weights_iterator,
+)
+from vllm.sequence import SamplerOutput

+# This model requires the hf_olmo package to be installed.
+from hf_olmo import OLMoConfig
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+class SwiGLU(nn.Module):
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, gate = x.chunk(2, dim=-1)
+        return F.silu(gate) * x
+
+    @property
+    def output_multiplier(self) -> float:
+        return 0.5
+
+
+class OlmoAttention(nn.Module):
+    """
+    This is the attention block where the output is computed as
+    ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: OLMoConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.d_model
+        assert config.d_model % config.n_heads == 0
+        tensor_model_parallel_world_size = (
+            get_tensor_model_parallel_world_size())
+        self.total_num_heads = self.config.n_heads
+        assert self.total_num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = (self.total_num_heads //
+                          tensor_model_parallel_world_size)
+        self.head_dim = self.hidden_size // self.total_num_heads
+
+        # Layer norms.
+        self.attn_norm = nn.LayerNorm(config.d_model,
+                                      elementwise_affine=False,
+                                      bias=False)
+        # Attention input projection. Projects x -> (q, k, v).
+        self.att_proj = QKVParallelLinear(
+            config.d_model,
+            self.head_dim,
+            self.total_num_heads,
+            bias=config.include_bias,
+            linear_method=linear_method,
+        )
+
+        # Rotary embeddings.
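+        # NOTE: rotary embeddings rotate each (even, odd) channel pair of
+        # q/k by a position-dependent angle; conceptually, for pair index i
+        # and position p:
+        #     inv_freq = 1.0 / base ** (2 * i / rotary_dim)
+        #     angle = p * inv_freq
+        # get_rope constructs vLLM's rotary embedding module for this.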
+ if self.config.rope: + rope_theta = getattr(config, "rope_theta", 10000) + max_position_embeddings = getattr(config, + "max_position_embeddings", 8192) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + ) + self.scaling = self.head_dim**-0.5 + self.attn = PagedAttention(self.num_heads, + self.head_dim, + scale=self.scaling) + + # Attention output projection. + self.attn_out = RowParallelLinear( + config.d_model, + config.d_model, + bias=config.include_bias, + linear_method=linear_method, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.attn_norm(hidden_states) + qkv, _ = self.att_proj(hidden_states) + q, k, v = qkv.chunk(chunks=3, dim=-1) + if self.config.rope: + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.attn_out(attn_output) + return output + + +class OlmoMLP(nn.Module): + """ + This is the MLP block where the output is computed as ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` + (plus another skip connection). + """ + + def __init__( + self, + config: OLMoConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.hidden_size = (config.mlp_hidden_size if config.mlp_hidden_size + is not None else config.mlp_ratio * config.d_model) + + # Layer norms. + self.ff_norm = nn.LayerNorm(config.d_model, + elementwise_affine=False, + bias=False) + + # Feed-forward input projection. + self.ff_proj = ColumnParallelLinear( + config.d_model, + self.hidden_size, + bias=config.include_bias, + linear_method=linear_method, + ) + + # Activation function. + # self.act = SiluAndMul() + # self.act.output_multiplier = 0.5 + self.act = SwiGLU() + assert (self.act.output_multiplier * self.hidden_size) % 1 == 0 + + # Feed-forward output projection. + self.ff_out = RowParallelLinear( + int(self.act.output_multiplier * self.hidden_size), + config.d_model, + bias=config.include_bias, + linear_method=linear_method, + ) + + def forward( + self, + x: torch.Tensor, + ) -> torch.Tensor: + # Add feed-forward projection. + # shape: (batch_size, seq_len, d_model) + og_x = x + x = self.ff_norm(x) + x, _ = self.ff_proj(x) + x = self.act(x) + x, _ = self.ff_out(x) + x = og_x + x + + return x + + +class OlmoBlock(nn.Module): + """ + This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))`` + (plus another skip connection). + """ + + def __init__(self, + config: OLMoConfig, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + # Attention block. + self.attn = OlmoAttention(config, linear_method) + + # MLP block. + self.mlp = OlmoMLP(config, linear_method) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]: + # Attention block. + og_x = hidden_states + x = self.attn(positions, hidden_states, kv_cache, input_metadata) + x = x + og_x + + # MLP block. 
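+        # NOTE: the second residual add happens inside OlmoMLP.forward
+        # (``x = og_x + x``), so the block does not apply another skip
+        # connection here.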
+ hidden_states = self.mlp(x) + return hidden_states + + +class OlmoModel(nn.Module): + + def __init__(self, + config: OLMoConfig, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.config = config + + self.transformer = nn.ModuleDict( + dict( + wte=VocabParallelEmbedding( + config.embedding_size or config.vocab_size, + config.d_model, + ), + ln_f=nn.LayerNorm(config.d_model, + elementwise_affine=False, + bias=False), + )) + + blocks = [ + OlmoBlock(config, linear_method) for i in range(config.n_layers) + ] + if self.config.block_group_size > 1: + raise NotImplementedError("Block group size > 1 not supported yet") + else: + self.transformer.update({"blocks": nn.ModuleList(blocks)}) + + if not config.weight_tying: + self.transformer.update({ + "ff_out": + ColumnParallelLinear( + config.d_model, + config.embedding_size or config.vocab_size, + bias=config.include_bias, + linear_method=linear_method, + ) + }) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + """ + :param input_ids: A tensor of shape `(batch_size, seq_len)`. + """ + # Get embeddings of input. + # shape: (batch_size, seq_len, d_model) + x = self.transformer.wte(input_ids) # type: ignore + + # Apply blocks one-by-one. + for block_idx, block in enumerate(self.transformer.blocks): + # shape: (batch_size, seq_len, d_model) + x = block( + positions, + x, + kv_caches[block_idx], + input_metadata, + ) + + # Apply final layer norm. + # shape: (batch_size, seq_len or 1, d_model) + x = self.transformer.ln_f(x) # type: ignore + return x + + +class OLMoForCausalLM(nn.Module): + """ + Extremely barebones HF model wrapper. + """ + + def __init__(self, + config: OLMoConfig, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = OlmoModel(config, linear_method) + self.lm_head_weight = (self.model.transformer.wte.weight + if config.weight_tying else + self.model.transformer.ff_out.weight) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model( + input_ids=input_ids, + positions=positions, + kv_caches=kv_caches, + input_metadata=input_metadata, + ) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head_weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights( + self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None, + ): + params_dict = dict(self.named_parameters(remove_duplicate=False)) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + # attention + if ".att" in name: + name = name.replace(".att", ".attn.att") + # mlp + if ".ff" in name and "transformer.ff_out" not in name: + name = name.replace(".ff", ".mlp.ff") + # there is no bias in olmo + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py new file mode 100644 index 0000000..393b2dc --- /dev/null +++ 
b/vllm/model_executor/models/opt.py @@ -0,0 +1,354 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py +# Copyright 2023 The vLLM team. +# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights +# reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only OPT model compatible with HuggingFace weights.""" +from typing import List, Optional, Tuple + +import torch +from torch import nn +from transformers import OPTConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + LinearMethodBase, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class OPTLearnedPositionalEmbedding(nn.Embedding): + + def __init__(self, num_embeddings: int, embedding_dim: int): + # OPT is set up so that if padding_idx is specified then offset the + # embedding ids by 2 and adjust num_embeddings appropriately. 
Other + # models don't have this hack + self.offset = 2 + super().__init__(num_embeddings + self.offset, embedding_dim) + + def forward(self, positions: torch.Tensor): + return super().forward(positions + self.offset) + + +class OPTAttention(nn.Module): + + def __init__( + self, + embed_dim: int, + num_heads: int, + bias: bool = True, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.embed_dim = embed_dim + tensor_model_parallel_world_size = ( + get_tensor_model_parallel_world_size()) + total_num_heads = num_heads + assert num_heads % tensor_model_parallel_world_size == 0 + self.num_heads = total_num_heads // tensor_model_parallel_world_size + self.head_dim = embed_dim // total_num_heads + self.scaling = self.head_dim**-0.5 + + self.qkv_proj = QKVParallelLinear( + embed_dim, + self.head_dim, + total_num_heads, + bias=bias, + linear_method=linear_method, + ) + self.out_proj = RowParallelLinear( + embed_dim, + embed_dim, + bias=bias, + linear_method=linear_method, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + scale=self.scaling) + + def forward( + self, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.chunk(chunks=3, dim=-1) + key_cache, value_cache = kv_cache + attn_output = self.attn(q, k, v, key_cache, value_cache, + input_metadata) + output, _ = self.out_proj(attn_output) + return output + + +class OPTDecoderLayer(nn.Module): + + def __init__( + self, + config: OPTConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.self_attn = OPTAttention( + embed_dim=self.embed_dim, + num_heads=config.num_attention_heads, + bias=config.enable_bias, + linear_method=linear_method, + ) + self.do_layer_norm_before = config.do_layer_norm_before + + self.self_attn_layer_norm = nn.LayerNorm( + self.embed_dim, + elementwise_affine=config.layer_norm_elementwise_affine) + self.fc1 = ColumnParallelLinear( + self.embed_dim, + config.ffn_dim, + bias=config.enable_bias, + linear_method=linear_method, + ) + quant_config = getattr(linear_method, "quant_config", None) + self.activation_fn = get_act_fn(config.activation_function, + quant_config, config.ffn_dim) + self.fc2 = RowParallelLinear( + config.ffn_dim, + self.embed_dim, + bias=config.enable_bias, + linear_method=linear_method, + ) + self.final_layer_norm = nn.LayerNorm( + self.embed_dim, + elementwise_affine=config.layer_norm_elementwise_affine) + + def forward( + self, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + # Self Attention + residual = hidden_states + # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention + if self.do_layer_norm_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states = self.self_attn(hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata) + hidden_states = residual + hidden_states + # 350m applies layer norm AFTER attention + if not self.do_layer_norm_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention + if self.do_layer_norm_before: + hidden_states = self.final_layer_norm(hidden_states) + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states, _ = 
self.fc2(hidden_states) + hidden_states = residual + hidden_states + # 350m applies layer norm AFTER attention + if not self.do_layer_norm_before: + hidden_states = self.final_layer_norm(hidden_states) + return hidden_states + + +class OPTDecoder(nn.Module): + + def __init__( + self, + config: OPTConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + self.vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.word_embed_proj_dim, + ) + # Positional embeddings are replicated (not sharded). + self.embed_positions = OPTLearnedPositionalEmbedding( + config.max_position_embeddings, config.hidden_size) + + # Project out & in will be replicated if they exist. + if config.word_embed_proj_dim != config.hidden_size: + self.project_out = ReplicatedLinear(config.hidden_size, + config.word_embed_proj_dim, + bias=False, + linear_method=linear_method) + else: + self.project_out = None + + if config.word_embed_proj_dim != config.hidden_size: + self.project_in = ReplicatedLinear(config.word_embed_proj_dim, + config.hidden_size, + bias=False, + linear_method=linear_method) + else: + self.project_in = None + + # Note that the only purpose of `config._remove_final_layer_norm` is to + # keep backward compatibility with checkpoints that have been fine-tuned + # before transformers v4.20.1 + # see https://github.com/facebookresearch/metaseq/pull/164 + if config.do_layer_norm_before and not config._remove_final_layer_norm: + self.final_layer_norm = nn.LayerNorm( + config.hidden_size, + elementwise_affine=config.layer_norm_elementwise_affine) + else: + self.final_layer_norm = None + + self.layers = nn.ModuleList([ + OPTDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + inputs_embeds = self.embed_tokens(input_ids) + pos_embeds = self.embed_positions(positions) + if self.project_in is not None: + inputs_embeds, _ = self.project_in(inputs_embeds) + hidden_states = inputs_embeds + pos_embeds + + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states = layer(hidden_states, kv_caches[i], input_metadata) + + if self.final_layer_norm is not None: + hidden_states = self.final_layer_norm(hidden_states) + if self.project_out is not None: + hidden_states, _ = self.project_out(hidden_states) + return hidden_states + + +class OPTModel(nn.Module): + + def __init__( + self, + config: OPTConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.decoder = OPTDecoder(config, linear_method) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + return self.decoder(input_ids, positions, kv_caches, input_metadata) + + +class OPTForCausalLM(nn.Module): + + def __init__( + self, + config, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = OPTModel(config, linear_method) + self.lm_head_weight = self.model.decoder.embed_tokens.weight + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: 
List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head_weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + params_dict = dict(self.named_parameters(remove_duplicate=False)) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "lm_head.weight" in name: + continue + if name.startswith("decoder."): + name = "model." + name + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py new file mode 100644 index 0000000..d143261 --- /dev/null +++ b/vllm/model_executor/models/phi.py @@ -0,0 +1,305 @@ +# coding=utf-8 +# Adapted from +# https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_phi.py +# Copyright 2023 The vLLM team. +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. +# +# BSD 3-Clause License +# +# Copyright (c) 2022, Tri Dao, trid@cs.stanford.edu. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +"""Inference-only Phi-1.5 model compatible with HuggingFace weights.""" +from typing import List, Optional, Tuple + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + LinearMethodBase, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class PhiAttention(nn.Module): + + def __init__(self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.total_num_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.head_size = self.hidden_size // self.total_num_heads + + tensor_model_parallel_world_size = ( + get_tensor_model_parallel_world_size()) + assert self.total_num_heads % tensor_model_parallel_world_size == 0 + self.num_heads = (self.total_num_heads // + tensor_model_parallel_world_size) + + # pylint: disable=C0103 + self.qkv_proj = QKVParallelLinear( + self.hidden_size, + self.head_size, + self.total_num_heads, + bias=True, + linear_method=linear_method, + ) + self.dense = RowParallelLinear( + self.hidden_size, + self.hidden_size, + linear_method=linear_method, + ) + + scaling = self.head_size**-0.5 + rotary_dim = int(config.partial_rotary_factor * + (config.hidden_size // config.num_attention_heads)) + assert rotary_dim % 2 == 0 + + # pylint: disable=C0301 + # Refer to: + # https://huggingface.co/microsoft/phi-1_5/blob/d212a789620c380ff32ca1d1ee9943a777360987/modeling_phi.py#L518 + rope_theta = 10000 + max_position_embeddings = getattr(config, "n_positions", 2048) + self.rotary_emb = get_rope( + self.head_size, + rotary_dim=rotary_dim, + max_position=max_position_embeddings, + base=rope_theta, + ) + self.attn = PagedAttention(self.num_heads, self.head_size, scaling) + + def forward( + self, + position_ids: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.chunk(chunks=3, dim=-1) + q, k = self.rotary_emb(position_ids, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.dense(attn_output) + return output + + +class 
PhiMLP(nn.Module): + + def __init__(self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + + n_inner = getattr(config, "n_inner", None) + n_inner = n_inner if n_inner is not None else 4 * config.hidden_size + + self.fc1 = ColumnParallelLinear( + config.hidden_size, + n_inner, + linear_method=linear_method, + ) + self.fc2 = RowParallelLinear( + n_inner, + config.hidden_size, + linear_method=linear_method, + ) + quant_config = getattr(linear_method, "quant_config", None) + self.act = get_act_fn(config.hidden_act, quant_config, n_inner) + + def forward(self, hidden_states): + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states, _ = self.fc2(hidden_states) + return hidden_states + + +class PhiLayer(nn.Module): + + def __init__(self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.input_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.self_attn = PhiAttention(config, linear_method) + self.mlp = PhiMLP(config, linear_method) + + def forward( + self, + position_ids: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + attn_outputs = self.self_attn( + position_ids=position_ids, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + feed_forward_hidden_states = self.mlp(hidden_states) + hidden_states = attn_outputs + feed_forward_hidden_states + residual + return hidden_states + + +class PhiModel(nn.Module): + + def __init__(self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.config = config + self.linear_method = linear_method + self.embed_tokens = VocabParallelEmbedding(config.vocab_size, + config.hidden_size) + self.layers = nn.ModuleList([ + PhiLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.final_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + for i in range(self.config.num_hidden_layers): + layer = self.layers[i] + hidden_states = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + ) + + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states + + +class PhiForCausalLM(nn.Module): + + def __init__(self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.config = config + self.linear_method = linear_method + + self.model = PhiModel(config, linear_method) + + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + bias=True) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + head = self.lm_head + next_tokens = self.sampler(head.weight, hidden_states, + sampling_metadata, head.bias) + return 
next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v") + ] + params_dict = dict(self.named_parameters()) + + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # pylint: disable=E1136 + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py new file mode 100644 index 0000000..37af84c --- /dev/null +++ b/vllm/model_executor/models/qwen.py @@ -0,0 +1,288 @@ +# coding=utf-8 +# Adapted from +# https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py +# Copyright (c) Alibaba Cloud. +# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE +"""Inference-only QWen model compatible with HuggingFace weights.""" +from typing import Any, Dict, List, Optional, Tuple + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class QWenMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str = "silu", + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.c_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.c_proj(x) + return x + + +class QWenAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + max_position_embeddings: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.hidden_size = hidden_size + tensor_model_parallel_world_size = get_tensor_model_parallel_world_size( + ) + self.total_num_heads = num_heads + assert self.total_num_heads % tensor_model_parallel_world_size == 0 + self.num_heads = (self.total_num_heads // + tensor_model_parallel_world_size) + self.head_dim = hidden_size // self.total_num_heads + self.c_attn = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + bias=True, + linear_method=linear_method, + ) + self.c_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + self.scaling = self.head_dim**-0.5 + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.c_attn(hidden_states) + q, k, v = qkv.chunk(chunks=3, dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + + output, _ = self.c_proj(attn_output) + return output + + +class QWenBlock(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + self.attn = QWenAttention(config.hidden_size, + config.num_attention_heads, + config.max_position_embeddings, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + linear_method=linear_method) + + self.ln_2 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + + self.mlp = QWenMLP(config.hidden_size, + config.intermediate_size // 2, + linear_method=linear_method) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + else: + hidden_states, residual = self.ln_1(hidden_states, residual) + hidden_states = self.attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Fully Connected + hidden_states, residual = self.ln_2(hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class QWenModel(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.vocab_size = config.vocab_size + + self.wte = VocabParallelEmbedding( + config.vocab_size, + 
config.hidden_size, + ) + self.h = nn.ModuleList([ + QWenBlock(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.wte(input_ids) + residual = None + for i in range(len(self.h)): + layer = self.h[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + residual, + ) + hidden_states, _ = self.ln_f(hidden_states, residual) + return hidden_states + + +class QWenLMHeadModel(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.linear_method = linear_method + self.transformer = QWenModel(config, linear_method) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.transformer(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("gate_up_proj", "w2", 0), + ("gate_up_proj", "w1", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py new file mode 100644 index 0000000..4d0e822 --- /dev/null +++ b/vllm/model_executor/models/qwen2.py @@ -0,0 +1,340 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py +# Copyright 2024 The Qwen team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only Qwen2 model compatible with HuggingFace weights.""" +from typing import List, Optional, Tuple + +import torch +from torch import nn +from transformers import Qwen2Config + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class Qwen2MLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. " + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class Qwen2Attention(nn.Module): + + def __init__(self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + max_position: int = 4096 * 32, + rope_theta: float = 10000, + use_sliding_window: bool = False, + linear_method: Optional[LinearMethodBase] = None, + sliding_window: Optional[int] = None) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.sliding_window = sliding_window if use_sliding_window else None + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=True, + linear_method=linear_method, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position, + base=self.rope_theta, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=self.sliding_window) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class Qwen2DecoderLayer(nn.Module): + + def __init__( + self, + config: Qwen2Config, + layer_idx: int, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + # Requires transformers > 4.32.0 + rope_theta = getattr(config, "rope_theta", 1000000) + use_sliding_window = config.use_sliding_window and layer_idx < config.max_window_layers + self.self_attn = Qwen2Attention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + max_position=config.max_position_embeddings, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + use_sliding_window=use_sliding_window, + linear_method=linear_method, + sliding_window=config.sliding_window) + self.mlp = Qwen2MLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class Qwen2Model(nn.Module): + + def __init__( + self, + config: Qwen2Config, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = 
config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.layers = nn.ModuleList([
+            Qwen2DecoderLayer(config, layer_idx, linear_method)
+            for layer_idx in range(config.num_hidden_layers)
+        ])
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                kv_caches[i],
+                input_metadata,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class Qwen2ForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: Qwen2Config,
+        linear_method: Optional[LinearMethodBase] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.linear_method = linear_method
+        self.model = Qwen2Model(config, linear_method)
+        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, kv_caches,
+                                   input_metadata)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
+                                   sampling_metadata)
+        return next_tokens
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     load_format: str = "auto",
+                     revision: Optional[str] = None):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, load_format, revision):
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    # Checkpoints with tied embeddings (e.g. Qwen1.5-0.5B)
+                    # carry a redundant lm_head.weight; skip it.
+                    assert name == "lm_head.weight"
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
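Each load_weights above walks the HuggingFace checkpoint and remaps per-projection tensors (q_proj/k_proj/v_proj, gate_proj/up_proj) onto the fused parameters (qkv_proj, gate_up_proj) via stacked_params_mapping. Below is a minimal standalone sketch of the underlying slice-copy, with toy shapes and no tensor parallelism; load_shard and the sizes are illustrative only, since in the model code the copy is performed by each parallel layer's weight_loader.

import torch

hidden_size, num_heads, num_kv_heads, head_dim = 16, 4, 2, 4
q_size = num_heads * head_dim
kv_size = num_kv_heads * head_dim
offsets = {"q": 0, "k": q_size, "v": q_size + kv_size}
sizes = {"q": q_size, "k": kv_size, "v": kv_size}

# One fused parameter, as owned by a fused QKV projection.
fused_qkv = torch.empty(q_size + 2 * kv_size, hidden_size)

def load_shard(shard_id: str, loaded_weight: torch.Tensor) -> None:
    # Copy one projection's checkpoint tensor into its slice of the
    # fused matrix.
    start = offsets[shard_id]
    fused_qkv[start:start + sizes[shard_id]].copy_(loaded_weight)

# The checkpoint still stores q_proj/k_proj/v_proj separately.
for shard_id in ("q", "k", "v"):
    load_shard(shard_id, torch.randn(sizes[shard_id], hidden_size))

In the actual layers, the weight_loader additionally narrows loaded_weight to the rows owned by the current tensor-parallel rank before copying.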
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py
new file mode 100644
index 0000000..44c57e5
--- /dev/null
+++ b/vllm/model_executor/models/stablelm.py
@@ -0,0 +1,303 @@
+# coding=utf-8
+# Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This code is based on the following work:
+# https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/modeling_stablelm_epoch.py
+# https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json
+"""Inference-only StableLM (https://github.com/Stability-AI/StableLM) model compatible with HuggingFace weights."""
+from typing import List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import PagedAttention
+from vllm.model_executor.layers.linear import (LinearMethodBase,
+                                               MergedColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding, ParallelLMHead)
+from vllm.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_world_size)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.model_executor.weight_utils import (default_weight_loader,
+                                              hf_model_weights_iterator)
+from vllm.sequence import SamplerOutput
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+class StablelmMLP(nn.Module):
+
+    def __init__(self,
+                 config: PretrainedConfig,
+                 linear_method: Optional[LinearMethodBase] = None) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_up_proj = MergedColumnParallelLinear(
+            config.hidden_size, [config.intermediate_size] * 2,
+            bias=False,
+            linear_method=linear_method)
+        self.down_proj = RowParallelLinear(config.intermediate_size,
+                                           config.hidden_size,
+                                           bias=False,
+                                           linear_method=linear_method)
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class StablelmAttention(nn.Module):
+
+    def __init__(self,
+                 config: PretrainedConfig,
+                 linear_method: Optional[LinearMethodBase] = None) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        self.num_heads = self.total_num_heads // tp_size
+
+        self.total_num_key_value_heads = config.num_key_value_heads
+        if self.total_num_key_value_heads >= tp_size:
+            # Number of KV heads is greater than or equal to TP size, so we
+            # partition the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_key_value_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+ assert tp_size % self.total_num_key_value_heads == 0 + self.num_key_value_heads = max( + 1, self.total_num_key_value_heads // tp_size) + self.head_dim = self.hidden_size // self.total_num_heads + self.max_position_embeddings = config.max_position_embeddings + rope_pct = getattr(config, "rope_pct", + getattr(config, "partial_rotary_factor", 1)) + self.rotary_ndims = int(self.head_dim * rope_pct) + self.scaling = self.head_dim**-0.5 + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_key_value_heads * self.head_dim + self.qkv_bias = getattr(config, "use_qkv_bias", False) + if (self.head_dim * self.num_heads * tp_size) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads}).") + + self.qkv_proj = QKVParallelLinear(self.hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_key_value_heads, + self.qkv_bias, + linear_method=linear_method) + self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, + self.hidden_size, + bias=False, + linear_method=linear_method) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.rotary_ndims, + max_position=self.config.max_position_embeddings, + base=self.config.rope_theta, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_key_value_heads) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class StablelmDecoderLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.self_attn = StablelmAttention(config) + self.mlp = StablelmMLP(config, linear_method) + norm_eps = getattr(config, "norm_eps", + getattr(config, "layer_norm_eps", 1e-05)) + self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps) + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, + eps=norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states, residual + + +class StableLMEpochModel(nn.Module): + + def __init__(self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None) -> None: + super().__init__() + # self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + StablelmDecoderLayer(config, linear_method) + for _ 
in range(config.num_hidden_layers) + ]) + norm_eps = getattr(config, "norm_eps", + getattr(config, "layer_norm_eps", 1e-05)) + self.norm = nn.LayerNorm(config.hidden_size, eps=norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + ) + hidden_states = self.norm(hidden_states) + return hidden_states + + +class StablelmForCausalLM(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = StableLMEpochModel(config, linear_method) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py new file mode 100644 index 0000000..1eda07b --- /dev/null +++ b/vllm/model_executor/models/starcoder2.py @@ -0,0 +1,310 @@ +# coding=utf-8 +# Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. 
It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Starcoder2 model.""" +from typing import List, Optional, Tuple + +import torch +from torch import nn + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + LinearMethodBase, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) +from vllm.model_executor.parallel_utils.parallel_state import get_tensor_model_parallel_world_size +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +try: + from transformers import Starcoder2Config +except ImportError: + # fallback to PretrainedConfig + # NOTE: Please install transformers from source or use transformers>=4.39.0 + from transformers import PretrainedConfig as Starcoder2Config + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class Starcoder2Attention(nn.Module): + + def __init__(self, + config: Starcoder2Config, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.config = config + + self.hidden_size = config.hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = config.num_attention_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = config.num_key_value_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = self.hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = config.rope_theta + self.max_position_embeddings = config.max_position_embeddings + self.use_bias = config.use_bias + self.sliding_window = config.sliding_window + + self.qkv_proj = QKVParallelLinear( + self.hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=self.use_bias, + linear_method=linear_method, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + self.hidden_size, + bias=self.use_bias, + linear_method=linear_method, + ) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=self.max_position_embeddings, + base=int(self.rope_theta), + is_neox_style=True, + ) + self.attn = PagedAttention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=self.sliding_window, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class Starcoder2MLP(nn.Module): + + def __init__(self, + config: Starcoder2Config, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.c_fc = ColumnParallelLinear( + config.hidden_size, + config.intermediate_size, + bias=config.use_bias, + linear_method=linear_method, + ) + self.c_proj = RowParallelLinear( + config.intermediate_size, + config.hidden_size, + bias=config.use_bias, + linear_method=linear_method, + ) + self.act = get_act_fn(config.hidden_act, + intermediate_size=config.intermediate_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states, _ = self.c_proj(hidden_states) + return hidden_states + + +class Starcoder2DecoderLayer(nn.Module): + + def __init__(self, + config: Starcoder2Config, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = Starcoder2Attention(config, + linear_method=linear_method) + self.mlp = Starcoder2MLP(config, linear_method=linear_method) + self.input_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.norm_epsilon) + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.norm_epsilon) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + # Self Attention + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + 
hidden_states
+
+        return hidden_states
+
+
+class Starcoder2Model(nn.Module):
+
+    def __init__(self,
+                 config: Starcoder2Config,
+                 linear_method: Optional[LinearMethodBase] = None):
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        # TODO: consider padding_idx (currently removed)
+        self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
+                                                   config.hidden_size)
+        self.layers = nn.ModuleList([
+            Starcoder2DecoderLayer(config, linear_method=linear_method)
+            for _ in range(config.num_hidden_layers)
+        ])
+        self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states = layer(positions, hidden_states, kv_caches[i],
+                                  input_metadata)
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class Starcoder2ForCausalLM(nn.Module):
+
+    def __init__(self,
+                 config: Starcoder2Config,
+                 linear_method: Optional[LinearMethodBase] = None):
+        super().__init__()
+        self.config = config
+        self.model = Starcoder2Model(config, linear_method=linear_method)
+        self.vocab_size = config.vocab_size
+        self.unpadded_vocab_size = config.vocab_size
+        if config.tie_word_embeddings:
+            self.lm_head_weight = self.model.embed_tokens.weight
+        else:
+            self.lm_head = ParallelLMHead(
+                self.unpadded_vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+            )
+            self.lm_head_weight = self.lm_head.weight
+        self.sampler = Sampler(self.unpadded_vocab_size, config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, kv_caches,
+                                   input_metadata)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: Optional[torch.Tensor],
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
+                                   sampling_metadata)
+        return next_tokens
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     load_format: str = "auto",
+                     revision: Optional[str] = None):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, load_format, revision):
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
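Starcoder2ForCausalLM above either reuses the token-embedding matrix as the output projection (when tie_word_embeddings is set) or allocates a separate, vocab-padded ParallelLMHead. A minimal sketch of that tying decision with toy dimensions; a plain nn.Embedding stands in for the parallel embedding, and the shapes are illustrative only.

import torch
import torch.nn as nn

vocab_size, hidden_size = 100, 16
embed_tokens = nn.Embedding(vocab_size, hidden_size)

tie_word_embeddings = True
if tie_word_embeddings:
    # Tied: logits are computed against the input embedding matrix.
    lm_head_weight = embed_tokens.weight
else:
    # Untied: a separate output projection is allocated.
    lm_head_weight = nn.Parameter(torch.randn(vocab_size, hidden_size))

hidden_states = torch.randn(2, hidden_size)
logits = hidden_states @ lm_head_weight.t()
assert logits.shape == (2, vocab_size)

Tying saves a vocab_size x hidden_size parameter matrix, which is why the tied branch also skips the checkpoint's redundant lm_head.weight during loading.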
diff --git a/vllm/model_executor/neuron_model_loader.py b/vllm/model_executor/neuron_model_loader.py
new file mode 100644
index 0000000..8d5fd82
--- /dev/null
+++ b/vllm/model_executor/neuron_model_loader.py
@@ -0,0 +1,66 @@
+"""Utilities for selecting and loading models."""
+from typing import Type
+
+import torch
+import torch.nn as nn
+from transformers import PretrainedConfig
+
+from vllm.config import ModelConfig, DeviceConfig
+from vllm.model_executor.models import ModelRegistry
+
+TORCH_DTYPE_TO_NEURON_AMP = {
+    "auto": "f32",
+    "half": "f16",
+    "float16": "f16",
+    "bfloat16": "bf16",
+    "float": "f32",
+    "float32": "f32",
+    torch.float16: "f16",
+    torch.bfloat16: "bf16",
+    torch.float32: "f32",
+}
+
+
+def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
+    architectures = getattr(config, "architectures", [])
+    for arch in architectures:
+        model_cls = ModelRegistry.load_model_cls(arch)
+        if model_cls is not None:
+            return model_cls
+    raise ValueError(
+        f"Model architectures {architectures} are not supported for now. "
+        f"Supported architectures: {ModelRegistry.get_supported_archs()}")
+
+
+def get_model(model_config: ModelConfig, device_config: DeviceConfig,
+              **kwargs) -> nn.Module:
+    from transformers_neuronx.config import NeuronConfig, ContinuousBatchingConfig
+
+    parallel_config = kwargs.get("parallel_config")
+    scheduler_config = kwargs.get("scheduler_config")
+
+    model_class = _get_model_architecture(model_config.hf_config)
+    linear_method = None
+
+    # Create a model instance.
+    model = model_class(model_config.hf_config, linear_method)
+
+    continuous_batching_config = ContinuousBatchingConfig(
+        batch_size_for_shared_caches=scheduler_config.max_num_seqs)
+    neuron_config = NeuronConfig(
+        continuous_batching=continuous_batching_config)
+
+    # Load the weights from the cached or downloaded files.
+    model.load_weights(
+        model_config.model,
+        model_config.download_dir,
+        model_config.load_format,
+        model_config.revision,
+        tp_degree=parallel_config.neuron_tp_degree,
+        amp=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
+        neuron_config=neuron_config,
+        context_length_estimate=[scheduler_config.max_model_len],
+        n_positions=[scheduler_config.max_model_len],
+        batch_size=scheduler_config.max_num_seqs)
+
+    return model.eval()
\ No newline at end of file
diff --git a/vllm/model_executor/parallel_utils/__init__.py b/vllm/model_executor/parallel_utils/__init__.py
new file mode 100644
index 0000000..e69de29
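get_model() above selects the model class by matching the HF config's architectures list against ModelRegistry. A minimal sketch of that registry-dispatch pattern follows; _MiniRegistry and TinyModel are illustrative stand-ins, not vLLM's actual ModelRegistry API.

from typing import Dict, Optional, Type

import torch.nn as nn

class _MiniRegistry:
    """Illustrative stand-in for a model-architecture registry."""
    _models: Dict[str, Type[nn.Module]] = {}

    @classmethod
    def register(cls, arch: str, model_cls: Type[nn.Module]) -> None:
        cls._models[arch] = model_cls

    @classmethod
    def load_model_cls(cls, arch: str) -> Optional[Type[nn.Module]]:
        # Returns None for unknown architectures, letting the caller
        # fall through and raise a descriptive error.
        return cls._models.get(arch)

class TinyModel(nn.Module):
    pass

# The HF config's `architectures` field drives the lookup, as in
# _get_model_architecture() above.
_MiniRegistry.register("Qwen2ForCausalLM", TinyModel)
assert _MiniRegistry.load_model_cls("Qwen2ForCausalLM") is TinyModel
assert _MiniRegistry.load_model_cls("UnknownArch") is None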
diff --git a/vllm/model_executor/parallel_utils/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/parallel_utils/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..771a6f392b31c6f7673f88fa5243093696923e80
GIT binary patch
literal 179
[base85-encoded bytecode payload omitted]
diff --git a/vllm/model_executor/parallel_utils/__pycache__/custom_all_reduce.cpython-310.pyc b/vllm/model_executor/parallel_utils/__pycache__/custom_all_reduce.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..614bbddf0ae7fe184a8fd2375c4567107a134a4e
GIT binary patch
literal 4980
[base85-encoded bytecode payload omitted]
diff --git a/vllm/model_executor/parallel_utils/__pycache__/parallel_state.cpython-310.pyc b/vllm/model_executor/parallel_utils/__pycache__/parallel_state.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab5952c35b795ba819959566436c67031d1fb7c8
GIT binary patch
literal 7106
[base85-encoded bytecode payload omitted]
diff --git a/vllm/model_executor/parallel_utils/__pycache__/utils.cpython-310.pyc b/vllm/model_executor/parallel_utils/__pycache__/utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..81faf3efb09fa11cb8e9e680069136088309fe15
GIT binary patch
literal 1635
[base85-encoded bytecode payload omitted]
diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py
new file mode 100644
index 0000000..ce69f91
--- /dev/null
+++ b/vllm/model_executor/parallel_utils/communication_op.py
@@ -0,0 +1,213 @@
+from collections import namedtuple
+from typing import Any, Dict, List, Optional, Union
+
+import torch
+from torch.distributed import ProcessGroup
+
+from vllm.model_executor.parallel_utils import cupy_utils
+from
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    get_tensor_model_parallel_group,
+    is_cupy_nccl_enabled_for_all_reduce,
+)
+from vllm.model_executor.parallel_utils.custom_all_reduce import custom_all_reduce
+from ixformer.contrib.torch.extension.ixformer_torch.distributed import (
+    create_ixformer_group_from_pg,
+)
+from ixformer.distributed import all_reduce
+
+_IXFORMER_TENSOR_MODEL_PARALLEL_GROUP = None
+
+
+def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor:
+    """All-reduce the input tensor across model parallel group.
+
+    NOTE: This operation will be applied in-place on the input tensor if
+    disable_custom_all_reduce is set to True. Otherwise, this operation may or
+    may not be applied in place depending on whether custom all reduce is
+    invoked for a particular tensor, which further depends on the tensor size
+    and GPU topology.
+
+    TLDR: always assume this function modifies its input, but use the return
+    value as the output.
+    """
+    # Bypass the function if we are using only 1 GPU.
+    if get_tensor_model_parallel_world_size() == 1:
+        return input_
+    # Lazily wrap the torch ProcessGroup in an ixformer group on first use.
+    global _IXFORMER_TENSOR_MODEL_PARALLEL_GROUP
+    if _IXFORMER_TENSOR_MODEL_PARALLEL_GROUP is None:
+        _IXFORMER_TENSOR_MODEL_PARALLEL_GROUP = create_ixformer_group_from_pg(
+            get_tensor_model_parallel_group())
+    out = custom_all_reduce(input_)
+    if out is not None:
+        return out
+    if is_cupy_nccl_enabled_for_all_reduce():
+        # TODO: support multiple parallel groups.
+        cupy_utils.all_reduce(input_)
+    else:
+        # Use the ixformer all-reduce on the wrapped group; this replaces
+        # torch.distributed.all_reduce(input_,
+        #     group=get_tensor_model_parallel_group()).
+        all_reduce(input_,
+                   group=_IXFORMER_TENSOR_MODEL_PARALLEL_GROUP,
+                   async_op=True)
+    return input_
+
+
+def tensor_model_parallel_all_gather(input_: torch.Tensor,
+                                     dim: int = -1) -> torch.Tensor:
+    """All-gather the input tensor across model parallel group."""
+    world_size = get_tensor_model_parallel_world_size()
+    # Bypass the function if we are using only 1 GPU.
+    if world_size == 1:
+        return input_
+    assert -input_.dim() <= dim < input_.dim(), (
+        f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
+    if dim < 0:
+        # Convert negative dim to positive.
+        dim += input_.dim()
+    input_size = input_.size()
+    # Allocate output tensor.
+    output_tensor = torch.empty((world_size, ) + input_size,
+                                dtype=input_.dtype,
+                                device=input_.device)
+    # All-gather.
+    torch.distributed.all_gather_into_tensor(
+        output_tensor, input_, group=get_tensor_model_parallel_group())
+    # Reshape.
+    output_tensor = output_tensor.movedim(0, dim)
+    output_tensor = output_tensor.reshape(input_size[:dim] +
+                                          (world_size * input_size[dim], ) +
+                                          input_size[dim + 1:])
+    return output_tensor
+
+
+def tensor_model_parallel_gather(input_: torch.Tensor,
+                                 dst: int = 0,
+                                 dim: int = -1) -> torch.Tensor:
+    """Gather the input tensor across model parallel group.
+
+    NOTE: We assume that the input tensor is on the same device across
+    all the ranks.
+    """
+    world_size = get_tensor_model_parallel_world_size()
+    # Bypass the function if we are using only 1 GPU.
+    if world_size == 1:
+        return input_
+    assert -input_.dim() <= dim < input_.dim(), (
+        f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
+    if dim < 0:
+        # Convert negative dim to positive.
+        dim += input_.dim()
+    # Allocate output tensor.
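+    # Only the destination rank materializes the gather list;
+    # torch.distributed.gather expects gather_list=None on all other ranks.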
+ if get_tensor_model_parallel_rank() == dst: + gather_list = [torch.empty_like(input_) for _ in range(world_size)] + else: + gather_list = None + # Gather. + torch.distributed.gather(input_, + gather_list, + dst=dst, + group=get_tensor_model_parallel_group()) + if get_tensor_model_parallel_rank() == dst: + output_tensor = torch.cat(gather_list, dim=dim) + else: + output_tensor = None + return output_tensor + + +def broadcast(input_: torch.Tensor, + src: int = 0, + group: Optional[ProcessGroup] = None): + """Broadcast the input tensor.""" + group = group or torch.distributed.group.WORLD + ranks = torch.distributed.get_process_group_ranks(group) + assert src in ranks, f"Invalid src rank ({src})" + + # Bypass the function if we are using only 1 GPU. + world_size = torch.distributed.get_world_size(group=group) + if world_size == 1: + return input_ + # Broadcast. + torch.distributed.broadcast(input_, src=src, group=group) + return input_ + + +def broadcast_object_list(obj_list: List[Any], + src: int = 0, + group: Optional[ProcessGroup] = None): + """Broadcast the input object list.""" + group = group or torch.distributed.group.WORLD + ranks = torch.distributed.get_process_group_ranks(group) + assert src in ranks, f"Invalid src rank ({src})" + + # Bypass the function if we are using only 1 GPU. + world_size = torch.distributed.get_world_size(group=group) + if world_size == 1: + return obj_list + # Broadcast. + torch.distributed.broadcast_object_list(obj_list, src=src, group=group) + return obj_list + + +TensorMetadata = namedtuple("TensorMetadata", ["dtype", "size"]) + + +def broadcast_tensor_dict( + tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, + src: int = 0, + group: Optional[ProcessGroup] = None, +) -> Dict[Any, Union[torch.Tensor, Any]]: + """Broadcast the input tensor dictionary.""" + group = group or torch.distributed.group.WORLD + ranks = torch.distributed.get_process_group_ranks(group) + assert src in ranks, f"Invalid src rank ({src})" + + # Bypass the function if we are using only 1 GPU. + world_size = torch.distributed.get_world_size(group=group) + if world_size == 1: + return tensor_dict + + rank = torch.distributed.get_rank() + if rank == src: + assert isinstance( + tensor_dict, + dict), (f"Expecting a dictionary, got {type(tensor_dict)}") + metadata_list = [] + for key, value in tensor_dict.items(): + if isinstance(value, torch.Tensor): + assert value.is_cuda, ( + f"Tensor {key}: {value} is not on cuda. 
Currently we only "
+                    f"support broadcasting tensors on cuda.")
+                metadata_list.append(
+                    (key, TensorMetadata(value.dtype, value.size())))
+            else:
+                metadata_list.append((key, value))
+        torch.distributed.broadcast_object_list([metadata_list],
+                                                src=src,
+                                                group=group)
+        for key, value in metadata_list:
+            if isinstance(value, TensorMetadata):
+                tensor = tensor_dict[key]
+                # Broadcast on the same group as the metadata; omitting the
+                # group here would fall back to WORLD and desynchronize with
+                # the receiving ranks below.
+                torch.distributed.broadcast(tensor, src=src, group=group)
+    else:
+        recv_metadata_list = [None]
+        torch.distributed.broadcast_object_list(recv_metadata_list,
+                                                src=src,
+                                                group=group)
+        metadata_list = recv_metadata_list[0]
+        tensor_dict = {}
+        async_handles = []
+        for key, value in metadata_list:
+            if isinstance(value, TensorMetadata):
+                tensor = torch.empty(value.size,
+                                     dtype=value.dtype,
+                                     device="cuda")
+                async_handle = torch.distributed.broadcast(tensor,
+                                                           src=src,
+                                                           async_op=True,
+                                                           group=group)
+                async_handles.append(async_handle)
+                tensor_dict[key] = tensor
+            else:
+                tensor_dict[key] = value
+        for async_handle in async_handles:
+            async_handle.wait()
+    return tensor_dict
diff --git a/vllm/model_executor/parallel_utils/cupy_utils.py b/vllm/model_executor/parallel_utils/cupy_utils.py
new file mode 100644
index 0000000..f8cffc0
--- /dev/null
+++ b/vllm/model_executor/parallel_utils/cupy_utils.py
@@ -0,0 +1,130 @@
+"""CuPy utilities for all-reduce.
+
+We use CuPy all-reduce instead of torch.distributed.all_reduce when capturing
+CUDA graphs, because torch.distributed.all_reduce causes errors when capturing
+CUDA graphs.
+
+NOTE: We use CuPy 12.3 since CuPy 13.0 does not support Python 3.8.
+TODO: Remove this file when torch.distributed.all_reduce is fixed.
+"""
+import contextlib
+
+import torch
+from torch.distributed import ReduceOp
+
+try:
+    import cupy
+    from cupy.cuda import nccl
+    from cupyx.distributed import NCCLBackend
+except ImportError as e:
+    cupy = e
+    nccl = None
+
+    class NCCLBackend:
+        ...
+
+
+_OP_MAPPING = {
+    ReduceOp.SUM: "sum",
+    ReduceOp.PRODUCT: "prod",
+    ReduceOp.MIN: "min",
+    ReduceOp.MAX: "max",
+}
+
+
+class NCCLBackendWithBFloat16(NCCLBackend):
+    # This is enough to add bfloat16 support for most operations,
+    # but broadcast will fail (will require changes in compiled
+    # cupy code).
+    def _get_nccl_dtype_and_count(self, array, count=None):
+        nccl_dtype, count = super()._get_nccl_dtype_and_count(array, count)
+        torch_dtype = getattr(array, "_torch_dtype", None)
+        if torch_dtype is torch.bfloat16:
+            nccl_dtype = nccl.NCCL_BFLOAT16
+        return nccl_dtype, count
+
+    def barrier(self) -> None:
+        raise RuntimeError(
+            "Currently, CuPy NCCL barrier is not supported since the TCP "
+            "store is immediately stopped after the initialization.")
+
+
+_NCCL_BACKEND = None
+_WORLD_SIZE = 0
+
+
+def is_initialized() -> bool:
+    """Returns whether the NCCL backend is initialized."""
+    return _NCCL_BACKEND is not None
+
+
+@contextlib.contextmanager
+def set_cupy_stream(stream: torch.cuda.Stream):
+    """Set the cuda stream for communication."""
+    cupy_stream = cupy.cuda.ExternalStream(stream.cuda_stream,
+                                           stream.device_index)
+    with cupy_stream:
+        yield
+
+
+def init_process_group(world_size: int, rank: int, host: str,
+                       port: int) -> None:
+    """Initializes the CuPy NCCL backend.
+
+    # TODO: handle NCCL timeouts.
+    """
+    assert not is_initialized()
+
+    if isinstance(cupy, Exception):
+        raise ImportError(
+            "NCCLBackend is not available. Please install cupy.") from cupy
+
+    # TODO(woosuk): Create TP and PP process groups for CuPy.
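+    # A typical bootstrap call, with hypothetical host/port values picked by
+    # the driver process:
+    #   init_process_group(world_size=2, rank=0,
+    #                      host="127.0.0.1", port=29500)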
+    global _NCCL_BACKEND
+    global _WORLD_SIZE
+    assert world_size > 0, f"{world_size=} should be a positive integer"
+    assert 0 <= rank < world_size, (
+        f"{rank=} should be an integer between [0, {world_size})")
+
+    cupy.cuda.runtime.setDevice(torch.cuda.current_device())
+    _NCCL_BACKEND = NCCLBackendWithBFloat16(world_size, rank, host, port)
+    _WORLD_SIZE = world_size
+
+    # Stop the TCP store to prevent the deadlock issues at termination time.
+    # FIXME(woosuk): This is hacky. Find a more robust solution.
+    if rank == 0 and hasattr(_NCCL_BACKEND, "_store"):
+        _NCCL_BACKEND._store.stop()
+
+
+def all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None:
+    """All-reduces the input tensor across the process group."""
+    assert input_.is_cuda, f"{input_} should be a cuda tensor"
+    # Hack to support bfloat16.
+    torch_dtype = input_.dtype
+    if torch_dtype is torch.bfloat16:
+        # We need to view as float16, otherwise
+        # cupy will fail. This will not change
+        # the underlying data.
+        input_ = input_.view(torch.float16)
+    cupy_input = cupy.asarray(input_)
+    cupy_input._torch_dtype = torch_dtype  # pylint: disable=protected-access
+    _NCCL_BACKEND.all_reduce(in_array=cupy_input,
+                             out_array=cupy_input,
+                             op=_OP_MAPPING[op])
+
+
+def destroy_process_group() -> None:
+    """Destroys the NCCL backend."""
+    global _NCCL_BACKEND
+    global _WORLD_SIZE
+    _NCCL_BACKEND = None
+    _WORLD_SIZE = 0
+
+
+def get_world_size() -> int:
+    """Returns the world size."""
+    return _WORLD_SIZE
+
+
+def get_nccl_backend():
+    return _NCCL_BACKEND
diff --git a/vllm/model_executor/parallel_utils/custom_all_reduce.py b/vllm/model_executor/parallel_utils/custom_all_reduce.py
new file mode 100644
index 0000000..9b5a5c3
--- /dev/null
+++ b/vllm/model_executor/parallel_utils/custom_all_reduce.py
@@ -0,0 +1,247 @@
+from contextlib import contextmanager
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+from vllm.logger import init_logger
+from vllm.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank)
+
+try:
+    from vllm._C import custom_ar
+except ImportError:
+    # For AMD GPUs
+    custom_ar = None
+
+try:
+    # pynvml is only needed by the NVLink-detection helpers below and may be
+    # unavailable on non-NVIDIA platforms; import it separately so a missing
+    # pynvml does not disable custom_ar.
+    import pynvml
+except ImportError:
+    pynvml = None
+
+logger = init_logger(__name__)
+
+_CA_HANDLE = None
+_IS_CAPTURING = False
+_SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]
+
+
+def init_custom_ar() -> None:
+    global _CA_HANDLE
+    if _CA_HANDLE is not None:
+        return
+    rank = get_tensor_model_parallel_rank()
+    world_size = get_tensor_model_parallel_world_size()
+    if world_size == 1:
+        # No need to initialize custom allreduce for single GPU case.
+        return
+
+    if world_size not in _SUPPORTED_WORLD_SIZES:
+        logger.warn(
+            "Custom allreduce is disabled due to an unsupported world size: "
+            "%d. Supported world sizes: %s. To silence this warning, specify "
+            "disable_custom_all_reduce=True explicitly.", world_size,
+            str(_SUPPORTED_WORLD_SIZES))
+        return
+    if not _can_p2p(rank, world_size):
+        logger.warn(
+            "Custom allreduce is disabled because your platform lacks GPU P2P"
+            " capability. To silence this warning, specify "
+            "disable_custom_all_reduce=True explicitly.")
+        return
+    _CA_HANDLE = CustomAllreduce(rank, world_size)
+
+
+def begin_capture() -> None:
+    global _IS_CAPTURING
+    _IS_CAPTURING = True
+
+
+def end_capture() -> None:
+    global _IS_CAPTURING
+    _IS_CAPTURING = False
+
+
+def is_capturing() -> bool:
+    return _IS_CAPTURING and _CA_HANDLE is not None
+
+
+def get_handle() -> Optional["CustomAllreduce"]:
+    return _CA_HANDLE
+
+
+def is_initialized() -> bool:
+    return _CA_HANDLE is not None
+
+
+@contextmanager
+def capture():
+    try:
+        begin_capture()
+        yield
+    finally:
+        end_capture()
+        handle = get_handle()
+        if handle is not None:
+            handle.register_graph_buffers()
+
+
+def custom_all_reduce(input: torch.Tensor) -> Optional[torch.Tensor]:
+    ca_handle = get_handle()
+    # When custom allreduce is disabled, this will be None.
+    if ca_handle is None:
+        return
+    if is_capturing():
+        if torch.cuda.is_current_stream_capturing():
+            if ca_handle.should_custom_ar(input):
+                return ca_handle.all_reduce_reg(input)
+        else:
+            if ca_handle.should_custom_ar(input):
+                # If warming up, mimic the allocation pattern
+                # since custom allreduce is out-of-place.
+                return torch.empty_like(input)
+    else:
+        # NOTE: Outside of cuda graph context,
+        # custom allreduce incurs a cost of cudaMemcpy, which should
+        # be small (<=1% of overall latency) compared to the performance
+        # gains of using custom kernels.
+        if ca_handle.should_custom_ar(input):
+            return ca_handle.all_reduce_unreg(input)
+
+
+@contextmanager
+def _nvml():
+    try:
+        pynvml.nvmlInit()
+        yield
+    finally:
+        pynvml.nvmlShutdown()
+
+
+# Query whether the set of gpus is fully connected by nvlink (1 hop).
+@_nvml()
+def _is_full_nvlink(rank, world_size):
+    handle = pynvml.nvmlDeviceGetHandleByIndex(rank)
+    for i in range(world_size):
+        if i != rank:
+            try:
+                link_state = pynvml.nvmlDeviceGetNvLinkState(handle, i)
+                if not link_state:
+                    return False
+            except pynvml.NVMLError as error:
+                logger.info(
+                    f"NVLink detection failed with message \"{str(error)}\". "
+                    "This is normal if your machine has no NVLink equipped")
+                return False
+    return True
+
+
+def _can_p2p(rank: int, world_size: int) -> bool:
+    for i in range(world_size):
+        if i == rank:
+            continue
+        if not torch.cuda.can_device_access_peer(rank, i):
+            return False
+    return True
+
+
+class CustomAllreduce:
+
+    # max_size: max supported allreduce size
+    def __init__(self, rank, world_size, max_size=8192 * 1024) -> None:
+        self.max_size = max_size
+        self.world_size = world_size
+        self.full_nvlink = False
+        self._ptr = None
+        self.buffer = None
+        if not custom_ar.is_init():
+            custom_ar.init_cumtom_ar()
+        # TODO: align with the upstream buffer registration below.
+        """
+        # buffers memory are owned by this Python class and passed to C++
+        # meta data composes of two parts: meta data for synchronization
+        # (256 bytes) and a temporary buffer for storing intermediate
+        # allreduce results.
+        self.meta = torch.zeros(custom_ar.meta_size() + max_size,
+                                dtype=torch.uint8,
+                                device="cuda")
+        # This is a pre-registered IPC buffer. In eager mode, input tensors
+        # are first copied into this buffer before allreduce is performed.
+        self.buffer = torch.empty(max_size, dtype=torch.uint8, device="cuda")
+        # This is a buffer for storing the tuples of pointers pointing to
+        # IPC buffers from all ranks. Each registered tuple has size of
+        # 8*world_size bytes where world_size is at most 8. Allocating 8MB
+        # is enough for 131072 such tuples. The largest model I've seen only
+        # needs fewer than 10000 registered tuples.
+        self.rank_data = torch.empty(8 * 1024 * 1024,
+                                     dtype=torch.uint8,
+                                     device="cuda")
+        self.max_size = max_size
+        self.world_size = world_size
+        handles, offsets = self._get_ipc_meta(self.meta)
+        self.full_nvlink = _is_full_nvlink(rank, world_size)
+        self._ptr = custom_ar.init_custom_ar(self.meta, self.rank_data,
+                                             handles, offsets, rank,
+                                             self.full_nvlink)
+        self.fast_cond = self.full_nvlink or world_size <= 2
+        self.register_buffer(self.buffer)
+        """
+
+    # TODO: align with the upstream IPC helpers below.
+    """
+    def _get_ipc_meta(self, inp: torch.Tensor):
+        data = inp.untyped_storage()._share_cuda_()
+        shard_data = (
+            data[1],  # ipc handle to base ptr
+            data[3],  # offset of base ptr
+        )
+        return self._gather_ipc_meta(shard_data)

+    def _gather_ipc_meta(self, shard_data):
+        all_data = [None] * self.world_size
+        dist.all_gather_object(all_data, shard_data)
+
+        handles = []
+        offsets = []
+        for i in range(len(all_data)):
+            handles.append(all_data[i][0])
+            offsets.append(all_data[i][1])
+        return handles, offsets
+
+    def register_buffer(self, inp: torch.Tensor):
+        handles, offsets = self._get_ipc_meta(inp)
+        custom_ar.register_buffer(self._ptr, inp, handles, offsets)
+
+    def register_graph_buffers(self):
+        handle, offset = custom_ar.get_graph_buffer_ipc_meta(self._ptr)
+        handles, offsets = self._gather_ipc_meta((bytes(handle), offset))
+        logger.info("Registering %d cuda graph addresses", len(offset))
+        custom_ar.register_graph_buffers(self._ptr, handles, offsets)
+    """
+
+    def should_custom_ar(self, inp: torch.Tensor):
+        return custom_ar.should_custom_ar(inp, self.max_size, self.world_size,
+                                          self.full_nvlink)
+
+    # all reduce, assuming inp tensor is IPC registered with register_buffer,
+    # or, in the context of cuda graphs, register_graph_buffers
+    def all_reduce_reg(self, inp: torch.Tensor, out: torch.Tensor = None):
+        if out is None:
+            out = torch.empty_like(inp)
+        custom_ar.all_reduce_reg(self._ptr, inp, out)
+        return out
+
+    # all reduce, assuming inp tensor is NOT IPC registered
+    def all_reduce_unreg(self, inp: torch.Tensor, out: torch.Tensor = None):
+        if out is None:
+            out = torch.empty_like(inp)
+        custom_ar.all_reduce_unreg(self._ptr, inp, self.buffer, out)
+        return out
+
+    def close(self):
+        # Only dispose a handle that was actually created; with the
+        # registration path above stubbed out, self._ptr may still be None.
+        if self._ptr is not None:
+            custom_ar.dispose(self._ptr)
+            self._ptr = None
+
+    def __del__(self):
+        self.close()
diff --git a/vllm/model_executor/parallel_utils/parallel_state.py b/vllm/model_executor/parallel_utils/parallel_state.py
new file mode 100644
index 0000000..c821936
--- /dev/null
+++ b/vllm/model_executor/parallel_utils/parallel_state.py
@@ -0,0 +1,245 @@
+# Copyright 2023 The vLLM team.
+# Adapted from
+# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+"""Tensor and pipeline parallel groups."""
+import contextlib
+
+import torch
+
+from vllm.model_executor.parallel_utils import cupy_utils
+
+# Tensor model parallel group that the current rank belongs to.
+_TENSOR_MODEL_PARALLEL_GROUP = None
+# Pipeline model parallel group that the current rank belongs to.
+_PIPELINE_MODEL_PARALLEL_GROUP = None
+
+# A list of global ranks for each pipeline group to ease calculation of the
+# source rank when broadcasting from the first or last pipeline stage.
+_PIPELINE_GLOBAL_RANKS = None
+
+
+def initialize_model_parallel(
+    tensor_model_parallel_size: int = 1,
+    pipeline_model_parallel_size: int = 1,
+) -> None:
+    """
+    Initialize model parallel groups.
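+    Both tensor- and pipeline-parallel groups are created with
+    torch.distributed.new_group, so torch.distributed must already be
+    initialized by the caller.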
+ + Arguments: + tensor_model_parallel_size: number of GPUs used for tensor model + parallelism. + pipeline_model_parallel_size: number of GPUs used for pipeline model + parallelism. + + Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we + use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize + the model pipeline. The present function will + create 4 tensor model-parallel groups and 2 pipeline model-parallel groups: + 4 tensor model-parallel groups: + [g0, g1], [g2, g3], [g4, g5], [g6, g7] + 2 pipeline model-parallel groups: + [g0, g2, g4, g6], [g1, g3, g5, g7] + Note that for efficiency, the caller should make sure adjacent ranks + are on the same DGX box. For example if we are using 2 DGX-1 boxes + with a total of 16 GPUs, rank 0 to 7 belong to the first box and + ranks 8 to 15 belong to the second box. + """ + # Get world size and rank. Ensure some consistencies. + assert torch.distributed.is_initialized() + world_size: int = torch.distributed.get_world_size() + + if (world_size != + tensor_model_parallel_size * pipeline_model_parallel_size): + raise RuntimeError( + f"world_size ({world_size}) is not equal to " + f"tensor_model_parallel_size ({tensor_model_parallel_size}) x " + f"pipeline_model_parallel_size ({pipeline_model_parallel_size})") + + num_tensor_model_parallel_groups: int = (world_size // + tensor_model_parallel_size) + num_pipeline_model_parallel_groups: int = (world_size // + pipeline_model_parallel_size) + rank = torch.distributed.get_rank() + + # Build the tensor model-parallel groups. + global _TENSOR_MODEL_PARALLEL_GROUP + assert _TENSOR_MODEL_PARALLEL_GROUP is None, ( + "tensor model parallel group is already initialized") + for i in range(num_tensor_model_parallel_groups): + ranks = range(i * tensor_model_parallel_size, + (i + 1) * tensor_model_parallel_size) + group = torch.distributed.new_group(ranks) + if rank in ranks: + _TENSOR_MODEL_PARALLEL_GROUP = group + + # Build the pipeline model-parallel groups. + global _PIPELINE_MODEL_PARALLEL_GROUP + global _PIPELINE_GLOBAL_RANKS + assert _PIPELINE_MODEL_PARALLEL_GROUP is None, ( + "pipeline model parallel group is already initialized") + for i in range(num_pipeline_model_parallel_groups): + ranks = range(i, world_size, num_pipeline_model_parallel_groups) + group = torch.distributed.new_group(ranks) + if rank in ranks: + _PIPELINE_MODEL_PARALLEL_GROUP = group + _PIPELINE_GLOBAL_RANKS = ranks + + +def ensure_model_parallel_initialized( + tensor_model_parallel_size: int, + pipeline_model_parallel_size: int, +) -> None: + """Helper to initialize model parallel groups if they are not initialized, + or ensure tensor-parallel and pipeline-parallel sizes are equal to expected + values if the model parallel groups are initialized. + """ + if not model_parallel_is_initialized(): + initialize_model_parallel(tensor_model_parallel_size, + pipeline_model_parallel_size) + return + + assert ( + get_tensor_model_parallel_world_size() == tensor_model_parallel_size + ), ("tensor parallel group already initialized, but of unexpected size: " + f"{get_tensor_model_parallel_world_size()=} vs. " + f"{tensor_model_parallel_size=}") + assert (get_pipeline_model_parallel_world_size( + ) == pipeline_model_parallel_size), ( + "pipeline parallel group already initialized, but of unexpected size: " + f"{get_pipeline_model_parallel_world_size()=} vs. 
" + f"{pipeline_model_parallel_size=}") + + +def model_parallel_is_initialized(): + """Check if tensor and pipeline parallel groups are initialized.""" + return (_TENSOR_MODEL_PARALLEL_GROUP is not None + and _PIPELINE_MODEL_PARALLEL_GROUP is not None) + + +def get_tensor_model_parallel_group(): + """Get the tensor model parallel group the caller rank belongs to.""" + assert _TENSOR_MODEL_PARALLEL_GROUP is not None, ( + "tensor model parallel group is not initialized") + return _TENSOR_MODEL_PARALLEL_GROUP + + +def get_pipeline_model_parallel_group(): + """Get the pipeline model parallel group the caller rank belongs to.""" + assert _PIPELINE_MODEL_PARALLEL_GROUP is not None, ( + "pipeline model parallel group is not initialized") + return _PIPELINE_MODEL_PARALLEL_GROUP + + +def get_tensor_model_parallel_world_size(): + """Return world size for the tensor model parallel group.""" + return torch.distributed.get_world_size( + group=get_tensor_model_parallel_group()) + + +def get_pipeline_model_parallel_world_size(): + """Return world size for the pipeline model parallel group.""" + return torch.distributed.get_world_size( + group=get_pipeline_model_parallel_group()) + + +def get_tensor_model_parallel_rank(): + """Return my rank for the tensor model parallel group.""" + return torch.distributed.get_rank(group=get_tensor_model_parallel_group()) + + +def get_pipeline_model_parallel_rank(): + """Return my rank for the pipeline model parallel group.""" + return torch.distributed.get_rank( + group=get_pipeline_model_parallel_group()) + + +def get_tensor_model_parallel_src_rank(): + """Calculate the global rank corresponding to the first local rank + in the tensor model parallel group.""" + global_rank = torch.distributed.get_rank() + local_world_size = get_tensor_model_parallel_world_size() + return (global_rank // local_world_size) * local_world_size + + +def get_pipeline_model_parallel_first_rank(): + """Return the global rank of the first process in the pipeline for the + current tensor parallel group""" + assert _PIPELINE_GLOBAL_RANKS is not None, ( + "Pipeline parallel group is not initialized") + return _PIPELINE_GLOBAL_RANKS[0] + + +def get_pipeline_model_parallel_last_rank(): + """Return the global rank of the last process in the pipeline for the + current tensor parallel group""" + assert _PIPELINE_GLOBAL_RANKS is not None, ( + "Pipeline parallel group is not initialized") + last_rank_local = get_pipeline_model_parallel_world_size() - 1 + return _PIPELINE_GLOBAL_RANKS[last_rank_local] + + +def get_pipeline_model_parallel_next_rank(): + """Return the global rank that follows the caller in the pipeline""" + assert _PIPELINE_GLOBAL_RANKS is not None, ( + "Pipeline parallel group is not initialized") + rank_in_pipeline = get_pipeline_model_parallel_rank() + world_size = get_pipeline_model_parallel_world_size() + return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline + 1) % world_size] + + +def get_pipeline_model_parallel_prev_rank(): + """Return the global rank that precedes the caller in the pipeline""" + assert _PIPELINE_GLOBAL_RANKS is not None, ( + "Pipeline parallel group is not initialized") + rank_in_pipeline = get_pipeline_model_parallel_rank() + world_size = get_pipeline_model_parallel_world_size() + return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline - 1) % world_size] + + +def destroy_model_parallel(): + """Set the groups to none and destroy them.""" + global _TENSOR_MODEL_PARALLEL_GROUP + if _TENSOR_MODEL_PARALLEL_GROUP: + torch.distributed.destroy_process_group(_TENSOR_MODEL_PARALLEL_GROUP) 
+ _TENSOR_MODEL_PARALLEL_GROUP = None + global _PIPELINE_MODEL_PARALLEL_GROUP + if _PIPELINE_MODEL_PARALLEL_GROUP: + torch.distributed.destroy_process_group(_PIPELINE_MODEL_PARALLEL_GROUP) + _PIPELINE_MODEL_PARALLEL_GROUP = None + global _PIPELINE_GLOBAL_RANKS + _PIPELINE_GLOBAL_RANKS = None + + # Destroy the cupy states if any. + cupy_utils.destroy_process_group() + + +# Whether to use cupy for nccl all reduce. +# We use cupy for all reduce when using CUDA graph, because torch.distributed +# is not well supported by CUDA graph. +_ENABLE_CUPY_FOR_ALL_REDUCE = False + + +@contextlib.contextmanager +def with_cupy_nccl_for_all_reduce(): + """use CuPy nccl instead of torch.distributed for all reduce""" + tp_size = get_tensor_model_parallel_world_size() + if tp_size == 1: + # No-op. + # NOTE(woosuk): We don't initialize CuPy when tp_size is 1. + yield + else: + global _ENABLE_CUPY_FOR_ALL_REDUCE + old = _ENABLE_CUPY_FOR_ALL_REDUCE + _ENABLE_CUPY_FOR_ALL_REDUCE = True + + stream = torch.cuda.current_stream() + with cupy_utils.set_cupy_stream(stream): + yield + _ENABLE_CUPY_FOR_ALL_REDUCE = old + + +def is_cupy_nccl_enabled_for_all_reduce(): + """check if CuPy nccl is enabled for all reduce""" + global _ENABLE_CUPY_FOR_ALL_REDUCE + return _ENABLE_CUPY_FOR_ALL_REDUCE diff --git a/vllm/model_executor/parallel_utils/utils.py b/vllm/model_executor/parallel_utils/utils.py new file mode 100644 index 0000000..0cd420c --- /dev/null +++ b/vllm/model_executor/parallel_utils/utils.py @@ -0,0 +1,48 @@ +# Copyright 2023 The vLLM team. +# Adapted from +# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +from typing import Sequence + +import torch + + +def ensure_divisibility(numerator, denominator): + """Ensure that numerator is divisible by the denominator.""" + assert numerator % denominator == 0, "{} is not divisible by {}".format( + numerator, denominator) + + +def divide(numerator, denominator): + """Ensure that numerator is divisible by the denominator and return + the division value.""" + ensure_divisibility(numerator, denominator) + return numerator // denominator + + +def split_tensor_along_last_dim( + tensor: torch.Tensor, + num_partitions: int, + contiguous_split_chunks: bool = False, +) -> Sequence[torch.Tensor]: + """ Split a tensor along its last dimension. + + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + + Returns: + A list of Tensors + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + last_dim_size = divide(tensor.size()[last_dim], num_partitions) + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # NOTE: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py new file mode 100644 index 0000000..7deb808 --- /dev/null +++ b/vllm/model_executor/sampling_metadata.py @@ -0,0 +1,239 @@ +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch + +from vllm.sampling_params import SamplingParams, SamplingType +from vllm.sequence import SequenceData +from vllm.utils import in_wsl, is_neuron + +_SAMPLING_EPS = 1e-5 + + +class SamplingMetadata: + """Metadata for input sequences. 
Used in sampler.
+
+    Args:
+        seq_groups: List of (seq_ids, sampling_params).
+        seq_data: Seq_id -> SequenceData.
+        prompt_lens: Lengths of prompts.
+        selected_token_indices: Token indices selected for sampling.
+        categorized_sample_indices: SamplingType -> token indices to sample.
+        generators: List of torch.Generators to use for seeded sampling.
+        perform_sampling: Whether to perform sampling. This option is used to
+            make sampling happen only in the driver worker and to disable
+            sampling in the other worker processes.
+    """
+
+    def __init__(
+        self,
+        seq_groups: Optional[List[Tuple[List[int], SamplingParams]]],
+        seq_data: Optional[Dict[int, SequenceData]],
+        prompt_lens: Optional[List[int]],
+        selected_token_indices: torch.Tensor,
+        categorized_sample_indices: Optional[Dict[SamplingType,
+                                                  torch.Tensor]],
+        generators: Optional[List[torch.Generator]] = None,
+        perform_sampling: bool = True,
+    ) -> None:
+        self.seq_groups = seq_groups
+        self.seq_data = seq_data
+        self.prompt_lens = prompt_lens
+        self.selected_token_indices = selected_token_indices
+        self.categorized_sample_indices = categorized_sample_indices
+        self.generators = generators
+        self.perform_sampling = perform_sampling
+
+        self.num_prompts = len(prompt_lens) if prompt_lens is not None else 0
+
+    def __repr__(self) -> str:
+        return (
+            "SamplingMetadata("
+            f"seq_groups={self.seq_groups}, "
+            f"seq_data={self.seq_data}, "
+            f"prompt_lens={self.prompt_lens}, "
+            f"selected_token_indices={self.selected_token_indices}, "
+            f"categorized_sample_indices={self.categorized_sample_indices}, "
+            f"perform_sampling={self.perform_sampling})")
+
+
+@dataclass
+class SamplingTensors:
+    """Tensors for sampling."""
+
+    temperatures: torch.Tensor
+    top_ps: torch.Tensor
+    top_ks: torch.Tensor
+    min_ps: torch.Tensor
+    presence_penalties: torch.Tensor
+    frequency_penalties: torch.Tensor
+    repetition_penalties: torch.Tensor
+    prompt_tokens: torch.Tensor
+    output_tokens: torch.Tensor
+
+    @classmethod
+    def from_sampling_metadata(
+            cls, sampling_metadata: "SamplingMetadata", vocab_size: int,
+            device: torch.device,
+            dtype: torch.dtype) -> Tuple["SamplingTensors", bool, bool, bool]:
+        prompt_tokens: List[List[int]] = []
+        output_tokens: List[List[int]] = []
+        top_ks: List[int] = []
+        temperatures: List[float] = []
+        top_ps: List[float] = []
+        min_ps: List[float] = []
+        presence_penalties: List[float] = []
+        frequency_penalties: List[float] = []
+        repetition_penalties: List[float] = []
+        do_penalties = False
+        do_top_p_top_k = False
+        do_min_p = False
+        for i, seq_group in enumerate(sampling_metadata.seq_groups):
+            seq_ids, sampling_params = seq_group
+            temperature = sampling_params.temperature
+            p = sampling_params.presence_penalty
+            f = sampling_params.frequency_penalty
+            r = sampling_params.repetition_penalty
+            top_p = sampling_params.top_p
+            min_p = sampling_params.min_p
+            # k should not be greater than the vocab size.
+            top_k = min(sampling_params.top_k, vocab_size)
+            top_k = vocab_size if top_k == -1 else top_k
+            if temperature < _SAMPLING_EPS:
+                # NOTE: Zero temperature means deterministic sampling
+                # (i.e., greedy sampling or beam search).
+                # Set the temperature to 1 to avoid division by zero.
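+                # (Greedy and beam-search decoding pick tokens by argmax, so
+                # dividing the logits by 1.0 leaves the choice unchanged.)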
+                temperature = 1.0
+            if not do_top_p_top_k and (top_p < 1.0 - _SAMPLING_EPS
+                                       or top_k != vocab_size):
+                do_top_p_top_k = True
+            if not do_min_p and min_p > _SAMPLING_EPS:
+                do_min_p = True
+            if not do_penalties and (abs(p) >= _SAMPLING_EPS
+                                     or abs(f) >= _SAMPLING_EPS
+                                     or abs(r - 1.0) >= _SAMPLING_EPS):
+                do_penalties = True
+            if (i < sampling_metadata.num_prompts
+                    and sampling_params.prompt_logprobs is not None):
+                # When prompt logprobs are requested, every prompt position
+                # also goes through the sampler, so replicate the sampling
+                # parameters (with neutral penalties) for the first
+                # prompt_len - 1 tokens.
+                prompt_len = sampling_metadata.prompt_lens[i]
+                temperatures += [temperature] * (prompt_len - 1)
+                top_ps += [top_p] * (prompt_len - 1)
+                top_ks += [top_k] * (prompt_len - 1)
+                min_ps += [min_p] * (prompt_len - 1)
+                presence_penalties += [0] * (prompt_len - 1)
+                frequency_penalties += [0] * (prompt_len - 1)
+                repetition_penalties += [1] * (prompt_len - 1)
+                prompt_tokens.extend([] for _ in range(prompt_len - 1))
+                output_tokens.extend([] for _ in range(prompt_len - 1))
+            for seq_id in seq_ids:
+                seq_data = sampling_metadata.seq_data[seq_id]
+                prompt_tokens.append(seq_data.prompt_token_ids)
+                output_tokens.append(seq_data.output_token_ids)
+            temperatures += [temperature] * len(seq_ids)
+            top_ps += [top_p] * len(seq_ids)
+            top_ks += [top_k] * len(seq_ids)
+            min_ps += [min_p] * len(seq_ids)
+            presence_penalties += [p] * len(seq_ids)
+            frequency_penalties += [f] * len(seq_ids)
+            repetition_penalties += [r] * len(seq_ids)
+
+        sampling_tensors = SamplingTensors.from_lists(
+            temperatures, top_ps, top_ks, min_ps, presence_penalties,
+            frequency_penalties, repetition_penalties, prompt_tokens,
+            output_tokens, vocab_size, device, dtype)
+        return (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p)
+
+    @classmethod
+    def from_lists(cls, temperatures: List[float], top_ps: List[float],
+                   top_ks: List[int], min_ps: List[float],
+                   presence_penalties: List[float],
+                   frequency_penalties: List[float],
+                   repetition_penalties: List[float],
+                   prompt_tokens: List[List[int]],
+                   output_tokens: List[List[int]], vocab_size: int,
+                   device: torch.device,
+                   dtype: torch.dtype) -> "SamplingTensors":
+        # Note that the performance will be very bad without
+        # pinned memory.
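+        # Pinned (page-locked) host memory is what makes the non_blocking
+        # copies below truly asynchronous; it is not well supported under
+        # WSL or on Neuron, hence the check.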
+ pin_memory = not in_wsl() and not is_neuron() + prompt_max_len = max(len(tokens) for tokens in prompt_tokens) + prompt_padded_tokens = [ + tokens + [vocab_size] * (prompt_max_len - len(tokens)) + for tokens in prompt_tokens + ] + output_max_len = max(len(tokens) for tokens in output_tokens) + output_padded_tokens = [ + tokens + [vocab_size] * (output_max_len - len(tokens)) + for tokens in output_tokens + ] + + temperatures_t = torch.tensor( + temperatures, + device="cpu", + dtype=dtype, + pin_memory=pin_memory, + ) + top_ps_t = torch.tensor( + top_ps, + device="cpu", + dtype=dtype, + pin_memory=pin_memory, + ) + min_ps_t = torch.tensor( + min_ps, + device="cpu", + dtype=dtype, + pin_memory=pin_memory, + ) + presence_penalties_t = torch.tensor( + presence_penalties, + device="cpu", + dtype=dtype, + pin_memory=pin_memory, + ) + frequency_penalties_t = torch.tensor( + frequency_penalties, + device="cpu", + dtype=dtype, + pin_memory=pin_memory, + ) + repetition_penalties_t = torch.tensor( + repetition_penalties, + device="cpu", + dtype=dtype, + pin_memory=pin_memory, + ) + top_ks_t = torch.tensor( + top_ks, + device="cpu", + dtype=torch.int, + pin_memory=pin_memory, + ) + prompt_tensor = torch.tensor( + prompt_padded_tokens, + device="cpu", + dtype=torch.long, + pin_memory=pin_memory, + ) + output_tensor = torch.tensor( + output_padded_tokens, + device="cpu", + dtype=torch.long, + pin_memory=pin_memory, + ) + # Because the memory is pinned, we can do non-blocking + # transfer to device. + return cls( + temperatures=temperatures_t.to(device=device, non_blocking=True), + top_ps=top_ps_t.to(device=device, non_blocking=True), + top_ks=top_ks_t.to(device=device, non_blocking=True), + min_ps=min_ps_t.to(device=device, non_blocking=True), + presence_penalties=presence_penalties_t.to(device=device, + non_blocking=True), + frequency_penalties=frequency_penalties_t.to(device=device, + non_blocking=True), + repetition_penalties=repetition_penalties_t.to(device=device, + non_blocking=True), + prompt_tokens=prompt_tensor.to(device=device, non_blocking=True), + output_tokens=output_tensor.to(device=device, non_blocking=True), + ) diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py new file mode 100644 index 0000000..0113e3e --- /dev/null +++ b/vllm/model_executor/utils.py @@ -0,0 +1,52 @@ +"""Utils for model executor.""" +import random +import importlib +from typing import Any, Dict, Optional + +import numpy as np +import torch + +from vllm.config import DeviceConfig, ModelConfig + +DEVICE_TO_MODEL_LOADER_MAP = { + "cuda": "model_loader", + "neuron": "neuron_model_loader", +} + + +def set_random_seed(seed: int) -> None: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + + +def set_weight_attrs( + weight: torch.Tensor, + weight_attrs: Optional[Dict[str, Any]], +): + """Set attributes on a weight tensor. + + This method is used to set attributes on a weight tensor. This method + will not overwrite existing attributes. + + Args: + weight: The weight tensor. + weight_attrs: A dictionary of attributes to set on the weight tensor. 
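+            For example, vLLM's parallel linear layers attach a custom
+            "weight_loader" callable to each sharded weight this way.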
+ """ + if weight_attrs is None: + return + for key, value in weight_attrs.items(): + assert not hasattr( + weight, key), (f"Overwriting existing tensor attribute: {key}") + setattr(weight, key, value) + + +def get_model(model_config: ModelConfig, device_config: DeviceConfig, + **kwargs) -> torch.nn.Module: + model_loader_module = DEVICE_TO_MODEL_LOADER_MAP[device_config.device_type] + imported_model_loader = importlib.import_module( + f"vllm.model_executor.{model_loader_module}") + get_model_fn = imported_model_loader.get_model + return get_model_fn(model_config, device_config, **kwargs) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py new file mode 100644 index 0000000..bf90fd3 --- /dev/null +++ b/vllm/model_executor/weight_utils.py @@ -0,0 +1,300 @@ +"""Utilities for downloading and initializing model weights.""" +import filelock +import glob +import fnmatch +import json +import os +from collections import defaultdict +from typing import Any, Iterator, List, Optional, Tuple + +from huggingface_hub import snapshot_download, HfFileSystem +import numpy as np +from safetensors.torch import load_file, save_file, safe_open +import torch +from tqdm.auto import tqdm + +from vllm.config import ModelConfig +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization import (get_quantization_config, + QuantizationConfig) + +logger = init_logger(__name__) + + +class Disabledtqdm(tqdm): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs, disable=True) + + +def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None): + lock_dir = cache_dir if cache_dir is not None else "/tmp" + lock_file_name = model_name_or_path.replace("/", "-") + ".lock" + lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name)) + return lock + + +def _shared_pointers(tensors): + ptrs = defaultdict(list) + for k, v in tensors.items(): + ptrs[v.data_ptr()].append(k) + failing = [] + for _, names in ptrs.items(): + if len(names) > 1: + failing.append(names) + return failing + + +def convert_bin_to_safetensor_file( + pt_filename: str, + sf_filename: str, +) -> None: + loaded = torch.load(pt_filename, map_location="cpu") + if "state_dict" in loaded: + loaded = loaded["state_dict"] + shared = _shared_pointers(loaded) + for shared_weights in shared: + for name in shared_weights[1:]: + loaded.pop(name) + + # For tensors to be contiguous + loaded = {k: v.contiguous() for k, v in loaded.items()} + + dirname = os.path.dirname(sf_filename) + os.makedirs(dirname, exist_ok=True) + save_file(loaded, sf_filename, metadata={"format": "pt"}) + + # check file size + sf_size = os.stat(sf_filename).st_size + pt_size = os.stat(pt_filename).st_size + if (sf_size - pt_size) / pt_size > 0.01: + raise RuntimeError(f"""The file size different is more than 1%: + - {sf_filename}: {sf_size} + - {pt_filename}: {pt_size} + """) + + # check if the tensors are the same + reloaded = load_file(sf_filename) + for k in loaded: + pt_tensor = loaded[k] + sf_tensor = reloaded[k] + if not torch.equal(pt_tensor, sf_tensor): + raise RuntimeError(f"The output tensors do not match for key {k}") + + +# TODO(woosuk): Move this to other place. +def get_quant_config(model_config: ModelConfig) -> QuantizationConfig: + quant_cls = get_quantization_config(model_config.quantization) + # Read the quantization config from the HF model config, if available. 
+    hf_quant_config = getattr(model_config.hf_config, "quantization_config",
+                              None)
+    if hf_quant_config is not None:
+        return quant_cls.from_config(hf_quant_config)
+    model_name_or_path = model_config.model
+    is_local = os.path.isdir(model_name_or_path)
+    if not is_local:
+        # Download the config files.
+        with get_lock(model_name_or_path, model_config.download_dir):
+            hf_folder = snapshot_download(model_name_or_path,
+                                          revision=model_config.revision,
+                                          allow_patterns="*.json",
+                                          cache_dir=model_config.download_dir,
+                                          tqdm_class=Disabledtqdm)
+    else:
+        hf_folder = model_name_or_path
+    config_files = glob.glob(os.path.join(hf_folder, "*.json"))
+
+    quant_config_files = [
+        f for f in config_files if any(
+            f.endswith(x) for x in quant_cls.get_config_filenames())
+    ]
+    if len(quant_config_files) == 0:
+        raise ValueError(
+            f"Cannot find the config file for {model_config.quantization}")
+    if len(quant_config_files) > 1:
+        raise ValueError(
+            f"Found multiple config files for {model_config.quantization}: "
+            f"{quant_config_files}")
+
+    quant_config_file = quant_config_files[0]
+    with open(quant_config_file, "r") as f:
+        config = json.load(f)
+    return quant_cls.from_config(config)
+
+
+def prepare_hf_model_weights(
+    model_name_or_path: str,
+    cache_dir: Optional[str] = None,
+    load_format: str = "auto",
+    fall_back_to_pt: bool = True,
+    revision: Optional[str] = None,
+) -> Tuple[str, List[str], bool]:
+    # Download model weights from huggingface.
+    is_local = os.path.isdir(model_name_or_path)
+    use_safetensors = False
+    # Some quantized models use .pt files for storing the weights.
+    if load_format == "auto":
+        allow_patterns = ["*.safetensors", "*.bin"]
+    elif load_format == "safetensors":
+        use_safetensors = True
+        allow_patterns = ["*.safetensors"]
+    elif load_format == "pt":
+        allow_patterns = ["*.pt"]
+    elif load_format == "npcache":
+        allow_patterns = ["*.bin"]
+    else:
+        raise ValueError(f"Unknown load_format: {load_format}")
+
+    if fall_back_to_pt:
+        allow_patterns += ["*.pt"]
+
+    if not is_local:
+        # Before downloading, check what is actually available on the Hub.
+        fs = HfFileSystem()
+        file_list = fs.ls(model_name_or_path, detail=False, revision=revision)
+
+        # Depending on what is available, we download different things.
+        for pattern in allow_patterns:
+            matching = fnmatch.filter(file_list, pattern)
+            if len(matching) > 0:
+                allow_patterns = [pattern]
+                break
+
+        logger.info(f"Using model weights format {allow_patterns}")
+        # Use file lock to prevent multiple processes from
+        # downloading the same model weights at the same time.
+        with get_lock(model_name_or_path, cache_dir):
+            hf_folder = snapshot_download(model_name_or_path,
+                                          allow_patterns=allow_patterns,
+                                          cache_dir=cache_dir,
+                                          tqdm_class=Disabledtqdm,
+                                          revision=revision)
+    else:
+        hf_folder = model_name_or_path
+    hf_weights_files: List[str] = []
+    for pattern in allow_patterns:
+        hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
+        if len(hf_weights_files) > 0:
+            if pattern == "*.safetensors":
+                use_safetensors = True
+            break
+    if not use_safetensors:
+        # Exclude files that are not needed for inference.
+ # https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233 + blacklist = [ + "training_args.bin", + "optimizer.bin", + "optimizer.pt", + "scheduler.pt", + "scaler.pt", + ] + hf_weights_files = [ + f for f in hf_weights_files + if not any(f.endswith(x) for x in blacklist) + ] + + if len(hf_weights_files) == 0: + raise RuntimeError( + f"Cannot find any model weights with `{model_name_or_path}`") + + return hf_folder, hf_weights_files, use_safetensors + + +def hf_model_weights_iterator( + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None, + fall_back_to_pt: Optional[bool] = True, +) -> Iterator[Tuple[str, torch.Tensor]]: + hf_folder, hf_weights_files, use_safetensors = prepare_hf_model_weights( + model_name_or_path, + cache_dir=cache_dir, + load_format=load_format, + fall_back_to_pt=fall_back_to_pt, + revision=revision) + + if load_format == "npcache": + # Currently np_cache only support *.bin checkpoints + assert use_safetensors is False + + # Convert the model weights from torch tensors to numpy arrays for + # faster loading. + np_folder = os.path.join(hf_folder, "np") + os.makedirs(np_folder, exist_ok=True) + weight_names_file = os.path.join(np_folder, "weight_names.json") + # Use file lock to prevent multiple processes from + # dumping the same model weights to numpy at the same time. + with get_lock(model_name_or_path, cache_dir): + if not os.path.exists(weight_names_file): + weight_names = [] + for bin_file in hf_weights_files: + state = torch.load(bin_file, map_location="cpu") + for name, param in state.items(): + param_path = os.path.join(np_folder, name) + with open(param_path, "wb") as f: + np.save(f, param.cpu().detach().numpy()) + weight_names.append(name) + with open(weight_names_file, "w") as f: + json.dump(weight_names, f) + + with open(weight_names_file, "r") as f: + weight_names = json.load(f) + + for name in weight_names: + param_path = os.path.join(np_folder, name) + with open(param_path, "rb") as f: + param = np.load(f) + yield name, torch.from_numpy(param) + elif use_safetensors: + for st_file in hf_weights_files: + with safe_open(st_file, framework="pt") as f: + for name in f.keys(): # noqa: SIM118 + param = f.get_tensor(name) + yield name, param + else: + for bin_file in hf_weights_files: + state = torch.load(bin_file, map_location="cpu") + for name, param in state.items(): + yield name, param + del state + torch.cuda.empty_cache() + + +def convert_pyslice_to_tensor(x: Any) -> torch.Tensor: + """convert PySafeSlice object from safetensors to torch.Tensor + + PySafeSlice object supports indexing, which is done before loading the + actual tensor and can reduce the amount of memory being read into the + memory. However, it does not support more advanced functionalities + like `.view()` or `.t()`. Therefore, if we need to modify the loaded + tensor with these more complicated operators, we need to convert to + tensor first. + """ + if not isinstance(x, torch.Tensor): + x = x[:] + return x + + +def default_weight_loader(param: torch.Tensor, + loaded_weight: torch.Tensor) -> None: + """Default weight loader.""" + assert param.size() == loaded_weight.size() + param.data.copy_(loaded_weight) + + +def initialize_dummy_weights( + model: torch.nn.Module, + low: float = -1e-3, + high: float = 1e-3, +) -> None: + """Initialize model weights with random values. + + The model weights must be randomly initialized for accurate performance + measurements. 
Additionally, the model weights should not cause NaNs in the + forward pass. We empirically found that initializing the weights with + values between -1e-3 and 1e-3 works well for most models. + """ + for param in model.state_dict().values(): + if torch.is_floating_point(param): + param.data.uniform_(low, high) \ No newline at end of file diff --git a/vllm/outputs.py b/vllm/outputs.py new file mode 100644 index 0000000..a6de2a5 --- /dev/null +++ b/vllm/outputs.py @@ -0,0 +1,141 @@ +from typing import List, Optional +import time + +from vllm.sequence import (PromptLogprobs, SampleLogprobs, SequenceGroup, + SequenceStatus, RequestMetrics) +from vllm.lora.request import LoRARequest + + +class CompletionOutput: + """The output data of one completion output of a request. + + Args: + index: The index of the output in the request. + text: The generated output text. + token_ids: The token IDs of the generated output text. + cumulative_logprob: The cumulative log probability of the generated + output text. + logprobs: The log probabilities of the top probability words at each + position if the logprobs are requested. + finish_reason: The reason why the sequence is finished. + lora_request: The LoRA request that was used to generate the output. + """ + + def __init__( + self, + index: int, + text: str, + token_ids: List[int], + cumulative_logprob: float, + logprobs: Optional[SampleLogprobs], + finish_reason: Optional[str] = None, + lora_request: Optional[LoRARequest] = None, + ) -> None: + self.index = index + self.text = text + self.token_ids = token_ids + self.cumulative_logprob = cumulative_logprob + self.logprobs = logprobs + self.finish_reason = finish_reason + self.lora_request = lora_request + + def finished(self) -> bool: + return self.finish_reason is not None + + def __repr__(self) -> str: + return (f"CompletionOutput(index={self.index}, " + f"text={self.text!r}, " + f"token_ids={self.token_ids}, " + f"cumulative_logprob={self.cumulative_logprob}, " + f"logprobs={self.logprobs}, " + f"finish_reason={self.finish_reason})") + + +class RequestOutput: + """The output data of a request to the LLM. + + Args: + request_id: The unique ID of the request. + prompt: The prompt string of the request. + prompt_token_ids: The token IDs of the prompt. + prompt_logprobs: The log probabilities to return per prompt token. + outputs: The output sequences of the request. + finished: Whether the whole request is finished. + metrics: Metrics associated with the request. + lora_request: The LoRA request that was used to generate the output. + """ + + def __init__( + self, + request_id: str, + prompt: str, + prompt_token_ids: List[int], + prompt_logprobs: Optional[PromptLogprobs], + outputs: List[CompletionOutput], + finished: bool, + metrics: Optional[RequestMetrics] = None, + lora_request: Optional[LoRARequest] = None, + ) -> None: + self.request_id = request_id + self.prompt = prompt + self.prompt_token_ids = prompt_token_ids + self.prompt_logprobs = prompt_logprobs + self.outputs = outputs + self.finished = finished + self.metrics = metrics + self.lora_request = lora_request + + @classmethod + def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": + # Get the top-n sequences. 
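+        # A request may have best_of > n candidate sequences; only the n
+        # best (by beam-search score or cumulative logprob) are returned.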
+        n = seq_group.sampling_params.n
+        seqs = seq_group.get_seqs()
+        if seq_group.sampling_params.use_beam_search:
+            sorting_key = lambda seq: seq.get_beam_search_score(
+                seq_group.sampling_params.length_penalty)
+        else:
+            sorting_key = lambda seq: seq.get_cumulative_logprob()
+        sorted_seqs = sorted(seqs, key=sorting_key, reverse=True)
+        top_n_seqs = sorted_seqs[:n]
+
+        # Create the outputs.
+        outputs: List[CompletionOutput] = []
+        for seq in top_n_seqs:
+            logprobs = seq.output_logprobs
+            if seq_group.sampling_params.logprobs is None:
+                # NOTE: We need to take care of this case because the sequence
+                # always has the logprobs of the sampled tokens even if the
+                # logprobs are not requested.
+                logprobs = None
+            finished_reason = SequenceStatus.get_finished_reason(seq.status)
+            output = CompletionOutput(seqs.index(seq), seq.output_text,
+                                      seq.get_output_token_ids(),
+                                      seq.get_cumulative_logprob(), logprobs,
+                                      finished_reason)
+            outputs.append(output)
+
+        # Every sequence in the sequence group should have the same prompt.
+        prompt = seq_group.prompt
+        prompt_token_ids = seq_group.prompt_token_ids
+        prompt_logprobs = seq_group.prompt_logprobs
+        finished = seq_group.is_finished()
+        finished_time = time.time() if finished else None
+        seq_group.set_finished_time(finished_time)
+        return cls(seq_group.request_id,
+                   prompt,
+                   prompt_token_ids,
+                   prompt_logprobs,
+                   outputs,
+                   finished,
+                   seq_group.metrics,
+                   lora_request=seq_group.lora_request)
+
+    def __repr__(self) -> str:
+        return (f"RequestOutput(request_id={self.request_id}, "
+                f"prompt={self.prompt!r}, "
+                f"prompt_token_ids={self.prompt_token_ids}, "
+                f"prompt_logprobs={self.prompt_logprobs}, "
+                f"outputs={self.outputs}, "
+                f"finished={self.finished}, "
+                f"metrics={self.metrics}, "
+                f"lora_request={self.lora_request})")
diff --git a/vllm/prefix.py b/vllm/prefix.py
new file mode 100644
index 0000000..5b6e8e4
--- /dev/null
+++ b/vllm/prefix.py
@@ -0,0 +1,87 @@
+from typing import Dict, List, Sequence, Tuple, Optional
+
+from vllm.block import BlockTable
+
+
+class Prefix:
+    """Data and states associated with a prefix of prompt tokens for multiple
+    sequence groups.
+
+    NOTE: This feature is experimental and may be replaced with automatic
+    prefix caching in the future.
+
+    Args:
+        token_ids: The token ids of the prefix.
+        block_size: The block size of the executed model.
+    """
+
+    def __init__(
+        self,
+        token_ids: Sequence[int],
+        block_size: int,
+    ) -> None:
+        self.token_ids = tuple(token_ids)
+        self.block_size = block_size
+        self.length = len(token_ids)
+        self.hash = hash(token_ids)
+        assert self.length % block_size == 0
+        self.block_table: Optional[BlockTable] = None
+        self.computed = False
+
+    @property
+    def allocated(self) -> bool:
+        return self.block_table is not None
+
+    def get_num_blocks(self) -> int:
+        return self.length // self.block_size
+
+    def get_block_numbers(self) -> List[int]:
+        return [block.block_number for block in self.block_table]
+
+    def get_length(self) -> int:
+        return self.length
+
+    def __hash__(self) -> int:
+        return self.hash
+
+    def set_block_table(self, block_table: BlockTable) -> None:
+        self.block_table = block_table.copy()
+
+
+class PrefixPool:
+    """Manages all the prompt prefixes.
+
+    NOTE: This feature is experimental and may be replaced with automatic
+    prefix caching in the future.
+
+    Args:
+        block_size: The block size of the executed model.
+
+    Attributes:
+        prefixes: A dict mapping prefix hashes to the corresponding prefixes.
+        block_size: The block size of the executed model.
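+
+    Example (illustrative; the block size is chosen arbitrarily):
+        >>> pool = PrefixPool(block_size=4)
+        >>> prefix = pool.add_or_get_prefix(list(range(9)), lora_int_id=0)
+        >>> prefix.get_length()  # 9 tokens truncated to a multiple of 4
+        8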
+ """ + + def __init__( + self, + block_size: int, + ) -> None: + # TODO(zhuohan): Add a capacity limit to the prefix pool. + self.prefixes: Dict[int, Prefix] = {} + self.block_size = block_size + + def _truncate_token_ids(self, token_ids: Sequence[int]) -> Tuple[int]: + new_length = len(token_ids) // self.block_size * self.block_size + return tuple(token_ids[:new_length]) + + def add_or_get_prefix(self, token_ids: Sequence[int], + lora_int_id: int) -> Optional[Prefix]: + token_ids = self._truncate_token_ids(token_ids) + if len(token_ids) == 0: + # Prefix is empty. + return None + prefix = Prefix(token_ids, self.block_size) + prefix_hash = hash((prefix, lora_int_id)) + if prefix_hash not in self.prefixes: + self.prefixes[prefix_hash] = prefix + return self.prefixes[prefix_hash] diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py new file mode 100644 index 0000000..8103f3c --- /dev/null +++ b/vllm/sampling_params.py @@ -0,0 +1,279 @@ +"""Sampling parameters for text generation.""" +import copy +from enum import IntEnum +from functools import cached_property +from typing import Callable, List, Optional, Union + +import torch + +_SAMPLING_EPS = 1e-5 + + +class SamplingType(IntEnum): + GREEDY = 0 + RANDOM = 1 + RANDOM_SEED = 2 + BEAM = 3 + + +LogitsProcessor = Callable[[List[int], torch.Tensor], torch.Tensor] +"""LogitsProcessor is a function that takes a list of previously generated +tokens and a tensor of the logits for the next token, and returns a modified +tensor of logits to sample from.""" + + +class SamplingParams: + """Sampling parameters for text generation. + + Overall, we follow the sampling parameters from the OpenAI text completion + API (https://platform.openai.com/docs/api-reference/completions/create). + In addition, we support beam search, which is not supported by OpenAI. + + Args: + n: Number of output sequences to return for the given prompt. + best_of: Number of output sequences that are generated from the prompt. + From these `best_of` sequences, the top `n` sequences are returned. + `best_of` must be greater than or equal to `n`. This is treated as + the beam width when `use_beam_search` is True. By default, `best_of` + is set to `n`. + presence_penalty: Float that penalizes new tokens based on whether they + appear in the generated text so far. Values > 0 encourage the model + to use new tokens, while values < 0 encourage the model to repeat + tokens. + frequency_penalty: Float that penalizes new tokens based on their + frequency in the generated text so far. Values > 0 encourage the + model to use new tokens, while values < 0 encourage the model to + repeat tokens. + repetition_penalty: Float that penalizes new tokens based on whether + they appear in the prompt and the generated text so far. Values > 1 + encourage the model to use new tokens, while values < 1 encourage + the model to repeat tokens. + temperature: Float that controls the randomness of the sampling. Lower + values make the model more deterministic, while higher values make + the model more random. Zero means greedy sampling. + top_p: Float that controls the cumulative probability of the top tokens + to consider. Must be in (0, 1]. Set to 1 to consider all tokens. + top_k: Integer that controls the number of top tokens to consider. Set + to -1 to consider all tokens. + min_p: Float that represents the minimum probability for a token to be + considered, relative to the probability of the most likely token. + Must be in [0, 1]. Set to 0 to disable this. 
+        seed: Random seed to use for the generation.
+        use_beam_search: Whether to use beam search instead of sampling.
+        length_penalty: Float that penalizes sequences based on their length.
+            Used in beam search.
+        early_stopping: Controls the stopping condition for beam search. It
+            accepts the following values: `True`, where the generation stops as
+            soon as there are `best_of` complete candidates; `False`, where a
+            heuristic is applied and the generation stops when it is very
+            unlikely to find better candidates; `"never"`, where the beam
+            search procedure only stops when there cannot be better candidates
+            (canonical beam search algorithm).
+        stop: List of strings that stop the generation when they are generated.
+            The returned output will not contain the stop strings.
+        stop_token_ids: List of tokens that stop the generation when they are
+            generated. The returned output will contain the stop tokens unless
+            the stop tokens are special tokens.
+        include_stop_str_in_output: Whether to include the stop strings in
+            the output text. Defaults to False.
+        ignore_eos: Whether to ignore the EOS token and continue generating
+            tokens after the EOS token is generated.
+        max_tokens: Maximum number of tokens to generate per output sequence.
+        logprobs: Number of log probabilities to return per output token.
+            Note that the implementation follows the OpenAI API: The returned
+            result includes the log probabilities on the `logprobs` most likely
+            tokens, as well as the chosen tokens. The API will always return
+            the log probability of the sampled token, so there may be up to
+            `logprobs+1` elements in the response.
+        prompt_logprobs: Number of log probabilities to return per prompt
+            token.
+        skip_special_tokens: Whether to skip special tokens in the output.
+        spaces_between_special_tokens: Whether to add spaces between special
+            tokens in the output. Defaults to True.
+        logits_processors: List of functions that modify logits based on
+            previously generated tokens.
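+
+    Example (illustrative; the parameter values are arbitrary):
+        >>> params = SamplingParams(n=2, temperature=0.8, top_p=0.95,
+        ...                         max_tokens=64)
+        >>> params.sampling_type
+        <SamplingType.RANDOM: 1>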
+ """ + + def __init__( + self, + n: int = 1, + best_of: Optional[int] = None, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repetition_penalty: float = 1.0, + temperature: float = 1.0, + top_p: float = 1.0, + top_k: int = -1, + min_p: float = 0.0, + seed: Optional[int] = None, + use_beam_search: bool = False, + length_penalty: float = 1.0, + early_stopping: Union[bool, str] = False, + stop: Optional[Union[str, List[str]]] = None, + stop_token_ids: Optional[List[int]] = None, + include_stop_str_in_output: bool = False, + ignore_eos: bool = False, + max_tokens: Optional[int] = 16, + logprobs: Optional[int] = None, + prompt_logprobs: Optional[int] = None, + skip_special_tokens: bool = True, + spaces_between_special_tokens: bool = True, + logits_processors: Optional[List[LogitsProcessor]] = None, + ) -> None: + self.n = n + self.best_of = best_of if best_of is not None else n + self.presence_penalty = presence_penalty + self.frequency_penalty = frequency_penalty + self.repetition_penalty = repetition_penalty + self.temperature = temperature + self.top_p = top_p + self.top_k = top_k + self.min_p = min_p + self.seed = seed + self.use_beam_search = use_beam_search + self.length_penalty = length_penalty + self.early_stopping = early_stopping + if stop is None: + self.stop = [] + elif isinstance(stop, str): + self.stop = [stop] + else: + self.stop = list(stop) + if stop_token_ids is None: + self.stop_token_ids = [] + else: + self.stop_token_ids = list(stop_token_ids) + self.ignore_eos = ignore_eos + self.max_tokens = max_tokens + self.logprobs = logprobs + self.prompt_logprobs = prompt_logprobs + self.skip_special_tokens = skip_special_tokens + self.spaces_between_special_tokens = spaces_between_special_tokens + self.logits_processors = logits_processors + self.include_stop_str_in_output = include_stop_str_in_output + self._verify_args() + if self.use_beam_search: + self._verify_beam_search() + else: + self._verify_non_beam_search() + if self.temperature < _SAMPLING_EPS: + # Zero temperature means greedy sampling. 
+            self.top_p = 1.0
+            self.top_k = -1
+            self.min_p = 0.0
+            self._verify_greedy_sampling()
+
+    def _verify_args(self) -> None:
+        if self.n < 1:
+            raise ValueError(f"n must be at least 1, got {self.n}.")
+        if self.best_of < self.n:
+            raise ValueError(f"best_of must be greater than or equal to n, "
+                             f"got n={self.n} and best_of={self.best_of}.")
+        if not -2.0 <= self.presence_penalty <= 2.0:
+            raise ValueError("presence_penalty must be in [-2, 2], got "
+                             f"{self.presence_penalty}.")
+        if not -2.0 <= self.frequency_penalty <= 2.0:
+            raise ValueError("frequency_penalty must be in [-2, 2], got "
+                             f"{self.frequency_penalty}.")
+        if not 0.0 < self.repetition_penalty <= 2.0:
+            raise ValueError("repetition_penalty must be in (0, 2], got "
+                             f"{self.repetition_penalty}.")
+        if self.temperature < 0.0:
+            raise ValueError(
+                f"temperature must be non-negative, got {self.temperature}.")
+        if not 0.0 < self.top_p <= 1.0:
+            raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
+        if self.top_k < -1 or self.top_k == 0:
+            raise ValueError(f"top_k must be -1 (disable), or at least 1, "
+                             f"got {self.top_k}.")
+        if not 0.0 <= self.min_p <= 1.0:
+            raise ValueError("min_p must be in [0, 1], got "
+                             f"{self.min_p}.")
+        if self.max_tokens is not None and self.max_tokens < 1:
+            raise ValueError(
+                f"max_tokens must be at least 1, got {self.max_tokens}.")
+        if self.logprobs is not None and self.logprobs < 0:
+            raise ValueError(
+                f"logprobs must be non-negative, got {self.logprobs}.")
+        if self.prompt_logprobs is not None and self.prompt_logprobs < 0:
+            raise ValueError(f"prompt_logprobs must be non-negative, got "
+                             f"{self.prompt_logprobs}.")
+
+    def _verify_beam_search(self) -> None:
+        if self.best_of == 1:
+            raise ValueError("best_of must be greater than 1 when using beam "
+                             f"search. Got {self.best_of}.")
+        if self.temperature > _SAMPLING_EPS:
+            raise ValueError("temperature must be 0 when using beam search.")
+        if self.top_p < 1.0 - _SAMPLING_EPS:
+            raise ValueError("top_p must be 1 when using beam search.")
+        if self.top_k != -1:
+            raise ValueError("top_k must be -1 when using beam search.")
+        if self.early_stopping not in [True, False, "never"]:
+            raise ValueError(
+                f"early_stopping must be True, False, or 'never', "
+                f"got {self.early_stopping}.")
+
+    def _verify_non_beam_search(self) -> None:
+        if self.early_stopping is not False:
+            raise ValueError("early_stopping is not effective and must be "
+                             "False when not using beam search.")
+        if (self.length_penalty < 1.0 - _SAMPLING_EPS
+                or self.length_penalty > 1.0 + _SAMPLING_EPS):
+            raise ValueError(
+                "length_penalty is not effective and must be the "
+                "default value of 1.0 when not using beam search.")
+
+    def _verify_greedy_sampling(self) -> None:
+        if self.best_of > 1:
+            raise ValueError("best_of must be 1 when using greedy sampling. "
+                             f"Got {self.best_of}.")
+
+    @cached_property
+    def sampling_type(self) -> SamplingType:
+        if self.use_beam_search:
+            return SamplingType.BEAM
+        if self.temperature < _SAMPLING_EPS:
+            return SamplingType.GREEDY
+        if self.seed is not None:
+            return SamplingType.RANDOM_SEED
+        return SamplingType.RANDOM
+
+    def clone(self) -> "SamplingParams":
+        """Deep copy excluding LogitsProcessor objects.
+
+        LogitsProcessor objects are excluded because they may contain an
+        arbitrary, nontrivial amount of data.
+ See https://github.com/vllm-project/vllm/issues/3087 + """ + + logit_processor_refs = None if self.logits_processors is None else { + id(lp): lp + for lp in self.logits_processors + } + return copy.deepcopy(self, memo=logit_processor_refs) + + def __repr__(self) -> str: + return ( + f"SamplingParams(n={self.n}, " + f"best_of={self.best_of}, " + f"presence_penalty={self.presence_penalty}, " + f"frequency_penalty={self.frequency_penalty}, " + f"repetition_penalty={self.repetition_penalty}, " + f"temperature={self.temperature}, " + f"top_p={self.top_p}, " + f"top_k={self.top_k}, " + f"min_p={self.min_p}, " + f"seed={self.seed}, " + f"use_beam_search={self.use_beam_search}, " + f"length_penalty={self.length_penalty}, " + f"early_stopping={self.early_stopping}, " + f"stop={self.stop}, " + f"stop_token_ids={self.stop_token_ids}, " + f"include_stop_str_in_output={self.include_stop_str_in_output}, " + f"ignore_eos={self.ignore_eos}, " + f"max_tokens={self.max_tokens}, " + f"logprobs={self.logprobs}, " + f"prompt_logprobs={self.prompt_logprobs}, " + f"skip_special_tokens={self.skip_special_tokens}, " + "spaces_between_special_tokens=" + f"{self.spaces_between_special_tokens})") diff --git a/vllm/sequence.py b/vllm/sequence.py new file mode 100644 index 0000000..040e975 --- /dev/null +++ b/vllm/sequence.py @@ -0,0 +1,497 @@ +"""Sequence and its related classes.""" +import copy +import enum +from dataclasses import dataclass +from typing import Dict, List, Optional, Union + +from vllm.block import LogicalTokenBlock +from vllm.prefix import Prefix +from vllm.sampling_params import SamplingParams +from vllm.lora.request import LoRARequest + +PromptLogprobs = List[Optional[Dict[int, float]]] +SampleLogprobs = List[Dict[int, float]] + + +class SequenceStatus(enum.Enum): + """Status of a sequence.""" + WAITING = enum.auto() + RUNNING = enum.auto() + SWAPPED = enum.auto() + FINISHED_STOPPED = enum.auto() + FINISHED_LENGTH_CAPPED = enum.auto() + FINISHED_ABORTED = enum.auto() + FINISHED_IGNORED = enum.auto() + + @staticmethod + def is_finished(status: "SequenceStatus") -> bool: + return status in [ + SequenceStatus.FINISHED_STOPPED, + SequenceStatus.FINISHED_LENGTH_CAPPED, + SequenceStatus.FINISHED_ABORTED, + SequenceStatus.FINISHED_IGNORED, + ] + + @staticmethod + def get_finished_reason(status: "SequenceStatus") -> Union[str, None]: + if status == SequenceStatus.FINISHED_STOPPED: + finish_reason = "stop" + elif status == SequenceStatus.FINISHED_LENGTH_CAPPED: + finish_reason = "length" + elif status == SequenceStatus.FINISHED_ABORTED: + finish_reason = "abort" + elif status == SequenceStatus.FINISHED_IGNORED: + # The ignored sequences are the sequences whose prompt lengths + # are longer than the model's length cap. Therefore, the stop + # reason should also be "length" as in OpenAI API. + finish_reason = "length" + else: + finish_reason = None + return finish_reason + + +@dataclass +class RequestMetrics: + """Metrics associated with a request. + + Args: + arrival_time: The time when the request arrived. + first_scheduled_time: The time when the request was first scheduled. + first_token_time: The time when the first token was generated. + time_in_queue: The time the request spent in the queue. + finished_time: The time when the request was finished. 
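+        last_token_time: The time when the most recent token was generated
+            (used to compute per-token latency).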
+ """ + arrival_time: float + last_token_time: float + first_scheduled_time: Optional[float] + first_token_time: Optional[float] + time_in_queue: Optional[float] + finished_time: Optional[float] = None + + +class SequenceData: + """Data associated with a sequence. + + Args: + prompt_token_ids: The token IDs of the prompt. + + Attributes: + prompt_token_ids: The token IDs of the prompt. + output_token_ids: The token IDs of the output. + cumulative_logprob: The cumulative log probability of the output. + """ + + def __init__( + self, + prompt_token_ids: List[int], + ) -> None: + self.prompt_token_ids = prompt_token_ids + self.output_token_ids: List[int] = [] + self.cumulative_logprob = 0.0 + + def append_token_id(self, token_id: int, logprob: float) -> None: + self.output_token_ids.append(token_id) + self.cumulative_logprob += logprob + + def get_len(self) -> int: + return len(self.output_token_ids) + len(self.prompt_token_ids) + + def get_prompt_len(self) -> int: + return len(self.prompt_token_ids) + + def get_output_len(self) -> int: + return len(self.output_token_ids) + + def get_token_ids(self) -> List[int]: + return self.prompt_token_ids + self.output_token_ids + + def get_last_token_id(self) -> int: + if not self.output_token_ids: + return self.prompt_token_ids[-1] + return self.output_token_ids[-1] + + def __repr__(self) -> str: + return (f"SequenceData(" + f"prompt_token_ids={self.prompt_token_ids}, " + f"output_token_ids={self.output_token_ids}, " + f"cumulative_logprob={self.cumulative_logprob})") + + +class Sequence: + """Stores the data, status, and block information of a sequence. + + Args: + seq_id: The ID of the sequence. + prompt: The prompt of the sequence. + prompt_token_ids: The token IDs of the prompt. + block_size: The block size of the sequence. Should be the same as the + block size used by the block manager and cache engine. + lora_request: LoRA request. + """ + + def __init__( + self, + seq_id: int, + prompt: str, + prompt_token_ids: List[int], + block_size: int, + lora_request: Optional[LoRARequest] = None, + ) -> None: + self.seq_id = seq_id + self.prompt = prompt + self.block_size = block_size + self.lora_request = lora_request + + self.data = SequenceData(prompt_token_ids) + self.output_logprobs: SampleLogprobs = [] + self.output_text = "" + + self.logical_token_blocks: List[LogicalTokenBlock] = [] + # Initialize the logical token blocks with the prompt token ids. 
+ self._append_tokens_to_blocks(prompt_token_ids) + self.status = SequenceStatus.WAITING + + # Used for incremental detokenization + self.prefix_offset = 0 + self.read_offset = 0 + # Input + output tokens + self.tokens: Optional[List[str]] = None + + @property + def lora_int_id(self) -> int: + return self.lora_request.lora_int_id if self.lora_request else 0 + + def _append_logical_block(self) -> None: + block = LogicalTokenBlock( + block_number=len(self.logical_token_blocks), + block_size=self.block_size, + ) + self.logical_token_blocks.append(block) + + def _append_tokens_to_blocks(self, token_ids: List[int]) -> None: + cursor = 0 + while cursor < len(token_ids): + if not self.logical_token_blocks: + self._append_logical_block() + + last_block = self.logical_token_blocks[-1] + if last_block.is_full(): + self._append_logical_block() + last_block = self.logical_token_blocks[-1] + + num_empty_slots = last_block.get_num_empty_slots() + last_block.append_tokens(token_ids[cursor:cursor + + num_empty_slots]) + cursor += num_empty_slots + + def append_token_id( + self, + token_id: int, + logprobs: Dict[int, float], + ) -> None: + assert token_id in logprobs + self._append_tokens_to_blocks([token_id]) + self.output_logprobs.append(logprobs) + self.data.append_token_id(token_id, logprobs[token_id]) + + def get_len(self) -> int: + return self.data.get_len() + + def get_prompt_len(self) -> int: + return self.data.get_prompt_len() + + def get_output_len(self) -> int: + return self.data.get_output_len() + + def get_token_ids(self) -> List[int]: + return self.data.get_token_ids() + + def get_last_token_id(self) -> int: + return self.data.get_last_token_id() + + def get_output_token_ids(self) -> List[int]: + return self.data.output_token_ids + + def get_cumulative_logprob(self) -> float: + return self.data.cumulative_logprob + + def get_beam_search_score(self, + length_penalty: float = 1.0, + seq_len: Optional[int] = None, + eos_token_id: Optional[int] = None) -> float: + """Calculate the beam search score with length penalty. + + Adapted from + + https://github.com/huggingface/transformers/blob/ccb92be23def445f2afdea94c31286f84b89eb5b/src/transformers/generation/beam_search.py#L938 + """ + if seq_len is None: + seq_len = self.get_len() + # NOTE: HF implementation does not count the EOS token + # towards the length, we align with that here for testing. + if (eos_token_id is not None + and self.get_last_token_id() == eos_token_id): + seq_len -= 1 + return self.get_cumulative_logprob() / (seq_len**length_penalty) + + def is_finished(self) -> bool: + return SequenceStatus.is_finished(self.status) + + def fork(self, new_seq_id: int) -> "Sequence": + new_seq = copy.deepcopy(self) + new_seq.seq_id = new_seq_id + return new_seq + + def __repr__(self) -> str: + return (f"Sequence(seq_id={self.seq_id}, " + f"status={self.status.name}, " + f"num_blocks={len(self.logical_token_blocks)})") + + +@dataclass +class SequenceGroupState: + """Mutable state tied to a specific sequence group""" + + # torch.Generator used in seeded sampling + generator: Optional = None + + +class SequenceGroup: + """A group of sequences that are generated from the same prompt. + + Args: + request_id: The ID of the request. + seqs: The list of sequences. + sampling_params: The sampling parameters used to generate the outputs. + arrival_time: The arrival time of the request. + lora_request: LoRA request. + prefix: The prefix of the prompt of the sequence group. 
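+
+    Example (illustrative; assumes `time` has been imported):
+        >>> seq = Sequence(seq_id=0, prompt="Hi", prompt_token_ids=[101, 102],
+        ...                block_size=16)
+        >>> group = SequenceGroup("req-0", [seq], SamplingParams(),
+        ...                       arrival_time=time.time())
+        >>> group.num_seqs()
+        1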
+ """ + + def __init__( + self, + request_id: str, + seqs: List[Sequence], + sampling_params: SamplingParams, + arrival_time: float, + lora_request: Optional[LoRARequest] = None, + prefix: Optional[Prefix] = None, + ) -> None: + self.request_id = request_id + self.seqs_dict = {seq.seq_id: seq for seq in seqs} + self.sampling_params = sampling_params + self.metrics = RequestMetrics(arrival_time=arrival_time, + last_token_time=arrival_time, + first_scheduled_time=None, + first_token_time=None, + time_in_queue=None) + self.lora_request = lora_request + self.prefix: Optional[Prefix] = prefix + self.prompt_logprobs: Optional[PromptLogprobs] = None + self.state = SequenceGroupState() + + @property + def prompt(self) -> str: + # All sequences in the group should have the same prompt. + # We use the prompt of an arbitrary sequence. + return next(iter(self.seqs_dict.values())).prompt + + @property + def prompt_token_ids(self) -> List[int]: + # All sequences in the group should have the same prompt. + # We use the prompt of an arbitrary sequence. + return next(iter(self.seqs_dict.values())).data.prompt_token_ids + + @property + def lora_int_id(self) -> int: + return self.lora_request.lora_int_id if self.lora_request else 0 + + def get_last_latency(self, now: float) -> float: + """Gets last token latency for Request level timings.""" + latency = now - self.metrics.last_token_time + self.metrics.last_token_time = now + return latency + + def maybe_set_first_token_time(self, time: float) -> None: + """Sets the first token time for Request level timings.""" + if self.metrics.first_token_time is None: + self.metrics.first_token_time = time + + def maybe_set_first_scheduled_time(self, time: float) -> None: + """Sets the first scheduled time and time in queue for Request level timings.""" + if self.metrics.first_scheduled_time is None: + self.metrics.first_scheduled_time = time + self.metrics.time_in_queue = time - self.metrics.arrival_time + + def set_finished_time(self, time: Optional[float]) -> None: + """Sets the finished time for Request level timings.""" + self.metrics.finished_time = time + + def get_max_num_running_seqs(self) -> int: + """The maximum number of sequences running in parallel in the remaining + lifetime of the request.""" + if self.sampling_params.use_beam_search: + # For beam search, maximally there will always be `best_of` beam + # candidates running in the future. + return self.sampling_params.best_of + else: + if self.sampling_params.best_of > self.num_seqs(): + # At prompt stage, the sequence group is not yet filled up + # and only have one sequence running. However, in the + # generation stage, we will have `best_of` sequences running. + return self.sampling_params.best_of + # At sampling stages, return the number of actual sequences + # that are not finished yet. 
+ return self.num_unfinished_seqs() + + def get_seqs( + self, + status: Optional[SequenceStatus] = None, + ) -> List[Sequence]: + if status is None: + return list(self.seqs_dict.values()) + else: + return [ + seq for seq in self.seqs_dict.values() if seq.status == status + ] + + def get_unfinished_seqs(self) -> List[Sequence]: + return [ + seq for seq in self.seqs_dict.values() if not seq.is_finished() + ] + + def get_finished_seqs(self) -> List[Sequence]: + return [seq for seq in self.seqs_dict.values() if seq.is_finished()] + + def num_seqs(self, status: Optional[SequenceStatus] = None) -> int: + return len(self.get_seqs(status)) + + def num_unfinished_seqs(self) -> int: + return len(self.get_unfinished_seqs()) + + def num_finished_seqs(self) -> int: + return len(self.get_finished_seqs()) + + def find(self, seq_id: int) -> Sequence: + if seq_id not in self.seqs_dict: + raise ValueError(f"Sequence {seq_id} not found.") + return self.seqs_dict[seq_id] + + def add(self, seq: Sequence) -> None: + if seq.seq_id in self.seqs_dict: + raise ValueError(f"Sequence {seq.seq_id} already exists.") + self.seqs_dict[seq.seq_id] = seq + + def remove(self, seq_id: int) -> None: + if seq_id not in self.seqs_dict: + raise ValueError(f"Sequence {seq_id} not found.") + del self.seqs_dict[seq_id] + + def is_finished(self) -> bool: + return all(seq.is_finished() for seq in self.get_seqs()) + + def __repr__(self) -> str: + return (f"SequenceGroup(request_id={self.request_id}, " + f"sampling_params={self.sampling_params}, " + f"num_seqs={len(self.seqs_dict)})") + + +class SequenceGroupMetadata: + """Metadata for a sequence group. Used to create `InputMetadata`. + + Args: + request_id: The ID of the request. + is_prompt: Whether the request is at prompt stage. + seq_data: The sequence data. (Seq id -> sequence data) + sampling_params: The sampling parameters used to generate the outputs. + block_tables: The block tables. (Seq id -> list of physical block + numbers) + state: Internal state tied to this sequence group. + lora_request: LoRA request. + prefix: The prefix of the prompt of the sequence group. + """ + + def __init__( + self, + request_id: str, + is_prompt: bool, + seq_data: Dict[int, SequenceData], + sampling_params: SamplingParams, + block_tables: Dict[int, List[int]], + lora_request: Optional[LoRARequest] = None, + prefix: Optional[Prefix] = None, + state: Optional[SequenceGroupState] = None, + ) -> None: + self.request_id = request_id + self.is_prompt = is_prompt + self.seq_data = seq_data + self.sampling_params = sampling_params + self.block_tables = block_tables + self.lora_request = lora_request + self.prefix = prefix + self.state = SequenceGroupState() if state is None else state + + @property + def lora_int_id(self) -> int: + return self.lora_request.lora_int_id if self.lora_request else 0 + + +class SequenceOutput: + """The model output associated with a sequence. + + Args: + parent_seq_id: The ID of the parent sequence (for forking in beam + search). + output_token: The output token ID. + logprobs: The logprobs of the output token. 
+            (Token id -> logP(x_i+1 | x_0, ..., x_i))
+    """
+
+    def __init__(
+        self,
+        parent_seq_id: int,
+        output_token: int,
+        logprobs: Dict[int, float],
+    ) -> None:
+        self.parent_seq_id = parent_seq_id
+        self.output_token = output_token
+        self.logprobs = logprobs
+
+    def __repr__(self) -> str:
+        return (f"SequenceOutput(parent_seq_id={self.parent_seq_id}, "
+                f"output_token={self.output_token}, "
+                f"logprobs={self.logprobs})")
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, SequenceOutput):
+            raise NotImplementedError()
+        return (self.parent_seq_id == other.parent_seq_id
+                and self.output_token == other.output_token
+                and self.logprobs == other.logprobs)
+
+
+class SequenceGroupOutput:
+    """The model output associated with a sequence group."""
+
+    def __init__(
+        self,
+        samples: List[SequenceOutput],
+        prompt_logprobs: Optional[PromptLogprobs],
+    ) -> None:
+        self.samples = samples
+        self.prompt_logprobs = prompt_logprobs
+
+    def __repr__(self) -> str:
+        return (f"SequenceGroupOutput(samples={self.samples}, "
+                f"prompt_logprobs={self.prompt_logprobs})")
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, SequenceGroupOutput):
+            raise NotImplementedError()
+        return (self.samples == other.samples
+                and self.prompt_logprobs == other.prompt_logprobs)
+
+
+# For each sequence group, we generate a list of SequenceOutput objects,
+# each of which contains one possible candidate for the next token.
+SamplerOutput = List[SequenceGroupOutput]
diff --git a/vllm/test_utils.py b/vllm/test_utils.py
new file mode 100644
index 0000000..75bf6ce
--- /dev/null
+++ b/vllm/test_utils.py
@@ -0,0 +1,41 @@
+import ray
+
+from vllm.config import ParallelConfig
+from vllm.utils import get_open_port
+from vllm.worker.worker import init_distributed_environment
+
+
+def init_test_distributed_environment(
+    pipeline_parallel_size: int,
+    tensor_parallel_size: int,
+    rank: int,
+    distributed_init_port: str,
+) -> None:
+    parallel_config = ParallelConfig(pipeline_parallel_size,
+                                     tensor_parallel_size,
+                                     worker_use_ray=True)
+    distributed_init_method = f"tcp://localhost:{distributed_init_port}"
+    init_distributed_environment(
+        parallel_config,
+        rank,
+        cupy_port=None,
+        distributed_init_method=distributed_init_method)
+
+
+def multi_process_tensor_parallel(
+    tensor_parallel_size: int,
+    test_target,
+) -> None:
+    # Ray makes it easier to debug failures than multiprocessing does.
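+    # `test_target` is assumed to be a Ray remote function (e.g. decorated
+    # with @ray.remote) that accepts (tensor_parallel_size, rank,
+    # distributed_init_port).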
+    ray.init()
+
+    distributed_init_port = get_open_port()
+    refs = []
+    for rank in range(tensor_parallel_size):
+        refs.append(
+            test_target.remote(tensor_parallel_size, rank,
+                               distributed_init_port))
+    ray.get(refs)
+
+    ray.shutdown()
diff --git a/vllm/transformers_utils/__init__.py b/vllm/transformers_utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
new file mode 100644
index 0000000..4a3b3c1
--- /dev/null
+++ b/vllm/transformers_utils/config.py
@@ -0,0 +1,52 @@
+from typing import Optional
+
+from transformers import AutoConfig, PretrainedConfig
+
+from vllm.transformers_utils.configs import *
+
+_CONFIG_REGISTRY = {
+    "chatglm": ChatGLMConfig,
+    "mpt": MPTConfig,
+    "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
+    "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
+    "starcoder2": Starcoder2Config,
+    "minicpm": CPMDragonflyConfig,
+}
+
+
+def get_config(model: str,
+               trust_remote_code: bool,
+               revision: Optional[str] = None,
+               code_revision: Optional[str] = None) -> PretrainedConfig:
+    # FIXME(woosuk): This is a temporary fix for StarCoder2.
+    # Remove this when the model is supported by HuggingFace transformers.
+    if "bigcode" in model and "starcoder2" in model:
+        config_class = _CONFIG_REGISTRY["starcoder2"]
+        config = config_class.from_pretrained(model,
+                                              revision=revision,
+                                              code_revision=code_revision)
+        return config
+
+    try:
+        config = AutoConfig.from_pretrained(
+            model,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            code_revision=code_revision)
+    except ValueError as e:
+        if (not trust_remote_code and
+                "requires you to execute the configuration file" in str(e)):
+            err_msg = (
+                "Failed to load the model config. If the model is a custom "
+                "model not yet available in the HuggingFace transformers "
+                "library, consider setting `trust_remote_code=True` in LLM "
+                "or using the `--trust-remote-code` flag in the CLI.")
+            raise RuntimeError(err_msg) from e
+        else:
+            raise e
+    if config.model_type in _CONFIG_REGISTRY:
+        config_class = _CONFIG_REGISTRY[config.model_type]
+        config = config_class.from_pretrained(model,
+                                              revision=revision,
+                                              code_revision=code_revision)
+    return config
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
new file mode 100644
index 0000000..4935680
--- /dev/null
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -0,0 +1,16 @@
+from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
+from vllm.transformers_utils.configs.mpt import MPTConfig
+# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
+# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
+# `FalconConfig` class from the official HuggingFace transformers library.
+from vllm.transformers_utils.configs.falcon import RWConfig
+from vllm.transformers_utils.configs.starcoder2 import Starcoder2Config
+from vllm.transformers_utils.configs.cpm import CPMDragonflyConfig
+
+__all__ = [
+    "ChatGLMConfig",
+    "MPTConfig",
+    "RWConfig",
+    "Starcoder2Config",
+    "CPMDragonflyConfig",
+]
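A minimal usage sketch of the registry dispatch implemented in `get_config` above (the checkpoint name is illustrative, and `trust_remote_code=True` is assumed so the call succeeds whether or not the installed transformers version ships a native config class for it):

    from vllm.transformers_utils.config import get_config

    # A checkpoint whose model_type is "mpt" is re-parsed through vLLM's
    # vendored MPTConfig via _CONFIG_REGISTRY.
    config = get_config("mosaicml/mpt-7b", trust_remote_code=True)
    assert config.model_type == "mpt"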
diff --git a/vllm/transformers_utils/configs/chatglm.py b/vllm/transformers_utils/configs/chatglm.py
new file mode 100644
index 0000000..c4244f8
--- /dev/null
+++ b/vllm/transformers_utils/configs/chatglm.py
@@ -0,0 +1,68 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/THUDM/ChatGLM2-6B
+from transformers import PretrainedConfig
+
+
+class ChatGLMConfig(PretrainedConfig):
+    model_type = "chatglm"
+    attribute_map = {
+        "num_hidden_layers": "num_layers",
+        "n_head_kv": "multi_query_group_num",
+    }
+
+    def __init__(self,
+                 num_layers=28,
+                 padded_vocab_size=65024,
+                 hidden_size=4096,
+                 ffn_hidden_size=13696,
+                 kv_channels=128,
+                 num_attention_heads=32,
+                 seq_length=2048,
+                 hidden_dropout=0.0,
+                 attention_dropout=0.0,
+                 layernorm_epsilon=1e-5,
+                 rmsnorm=True,
+                 apply_residual_connection_post_layernorm=False,
+                 post_layer_norm=True,
+                 add_bias_linear=False,
+                 add_qkv_bias=False,
+                 interleaved_qkv=False,
+                 bias_dropout_fusion=True,
+                 multi_query_attention=False,
+                 multi_query_group_num=1,
+                 apply_query_key_layer_scaling=True,
+                 attention_softmax_in_fp32=True,
+                 fp32_residual_connection=False,
+                 quantization_bit=0,
+                 pre_seq_len=None,
+                 prefix_projection=False,
+                 **kwargs):
+        self.num_layers = num_layers
+        self.vocab_size = padded_vocab_size
+        self.padded_vocab_size = padded_vocab_size
+        self.hidden_size = hidden_size
+        self.ffn_hidden_size = ffn_hidden_size
+        self.kv_channels = kv_channels
+        self.num_attention_heads = num_attention_heads
+        self.seq_length = seq_length
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.layernorm_epsilon = layernorm_epsilon
+        self.rmsnorm = rmsnorm
+        self.apply_residual_connection_post_layernorm = (
+            apply_residual_connection_post_layernorm)
+        self.post_layer_norm = post_layer_norm
+        self.add_bias_linear = add_bias_linear
+        self.add_qkv_bias = add_qkv_bias
+        self.bias_dropout_fusion = bias_dropout_fusion
+        self.multi_query_attention = multi_query_attention
+        self.multi_query_group_num = multi_query_group_num
+        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
+        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
+        self.fp32_residual_connection = fp32_residual_connection
+        self.quantization_bit = quantization_bit
+        self.pre_seq_len = pre_seq_len
+        self.prefix_projection = prefix_projection
+        self.interleaved_qkv = interleaved_qkv
+        super().__init__(**kwargs)
diff --git a/vllm/transformers_utils/configs/cpm.py b/vllm/transformers_utils/configs/cpm.py
new file mode 100644
index 0000000..da4711a
--- /dev/null
+++ b/vllm/transformers_utils/configs/cpm.py
@@ -0,0 +1,113 @@
+# coding=utf-8
+# Copyright 2022 The OpenBMB team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import List, Optional, Tuple
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class CPMDragonflyConfig(PretrainedConfig):
+    model_type = "cpm_dragonfly"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "scale_emb": "scale_emb",
+        "scale_depth": "scale_depth",
+        "scale": "scale",
+        "attention_scale": "attention_scale",
+        "qk_norm": "qk_norm",
+        "ffn_gated": "ffn_gated",
+    }  # model specific to common
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        dim_head=128,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        dropout_p=0.0,
+        hidden_act="silu",
+        scale=True,
+        scale_emb: float = 1.0,
+        scale_depth: float = -1,
+        dim_model_base: Optional[int] = None,
+        rms_norm_eps=1e-5,
+        init_std=0.02,
+        half: bool = True,
+        half_type='bf16',
+        mask_modules: Optional[List[Tuple[bool, bool]]] = None,
+        use_flash_attn: bool = True,
+        flash_attn_mask_shape="1d",
+        flash_impl="cuda",
+        base=10000,
+        non_checkpointing_layers_num: int = 0,
+        attention_scale=1,
+        qk_norm=False,
+        ffn_gated=True,
+        tie_lm_head=False,
+        max_position_embeddings=2048,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.dim_head = dim_head
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.max_position_embeddings = max_position_embeddings
+        self.dropout_p = dropout_p
+        self.hidden_act = hidden_act
+        self.scale = scale
+        self.scale_emb = scale_emb
+        self.half = half
+        self.half_type = half_type
+        self.dim_model_base = dim_model_base
+        self.scale_depth = scale_depth
+        self.rms_norm_eps = rms_norm_eps
+        self.init_std = init_std
+        self.flash_impl = flash_impl
+        self.mask_modules = mask_modules
+        self.use_flash_attn = use_flash_attn
+        self.flash_attn_mask_shape = flash_attn_mask_shape
+        self.base = base
+        self.attention_scale = attention_scale
+        self.qk_norm = qk_norm
+        self.ffn_gated = ffn_gated
+        self.non_checkpointing_layers_num = non_checkpointing_layers_num
+        self.tie_lm_head = tie_lm_head
+        self.use_bfloat16 = self.half_type == 'bf16'
+        super().__init__(architectures=["CPMDragonflyForCausalLM"])
+
+    @property
+    def scale_width(self):
+        if self.scale:
+            return self.hidden_size / self.dim_model_base
+        else:
+            return 1.0
+
+    @property
+    def scale_states(self):
+        if self.scale:
+            return self.scale_depth / math.sqrt(self.num_hidden_layers)
+        else:
+            return 1.0
\ No newline at end of file diff --git a/vllm/transformers_utils/configs/falcon.py b/vllm/transformers_utils/configs/falcon.py new file mode 100644 index 0000000..c82cc60 --- /dev/null +++ b/vllm/transformers_utils/configs/falcon.py @@ -0,0 +1,87 @@ +# Adapted from +# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py +# Copyright 2023 The vLLM team. +# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Falcon configuration""" +from transformers.configuration_utils import PretrainedConfig + + +class RWConfig(PretrainedConfig): + model_type = "falcon" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "num_hidden_layers": "n_layer", + "num_attention_heads": "n_head", + "num_kv_heads": "n_head_kv", + } + + def __init__( + self, + vocab_size=250880, + hidden_size=64, + n_layer=2, + n_head=8, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + use_cache=True, + bos_token_id=1, + eos_token_id=2, + hidden_dropout=0.0, + attention_dropout=0.0, + multi_query=True, + n_head_kv=None, + alibi=False, + bias=False, + parallel_attn=False, + new_decoder_architecture=False, + **kwargs, + ) -> None: + self.vocab_size = vocab_size + # Backward compatibility with n_embed kwarg + n_embed = kwargs.pop("n_embed", None) + self.hidden_size = hidden_size if n_embed is None else n_embed + self.n_layer = n_layer + self.n_head = n_head + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.use_cache = use_cache + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.multi_query = multi_query + self.n_head_kv = 1 if n_head_kv is None else n_head_kv + self.alibi = alibi + self.bias = bias + self.parallel_attn = parallel_attn + self.new_decoder_architecture = new_decoder_architecture + + if self.hidden_size == 8192: + # Hack for falcon-40b + self.new_decoder_architecture = True + + super().__init__(bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs) + + @property + def head_dim(self): + return self.hidden_size // self.n_head + + @property + def rotary(self): + return not self.alibi diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py new file mode 100644 index 0000000..5ea0d91 --- /dev/null +++ b/vllm/transformers_utils/configs/mpt.py @@ -0,0 +1,232 @@ +# coding=utf-8 +# Copied from +# https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py +"""A HuggingFace-style model configuration.""" +import warnings +from typing import Any, Dict, Optional, Union +from transformers import PretrainedConfig + +attn_config_defaults: Dict = { + 'attn_type': 'multihead_attention', + 'attn_pdrop': 0.0, + 'attn_impl': 'triton', + 'qk_ln': False, + 'clip_qkv': None, + 'softmax_scale': None, + 'prefix_lm': False, + 'attn_uses_sequence_id': False, + 'alibi': False, + 'alibi_bias_max': 8 +} 
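+# NOTE: attn_config_defaults, ffn_config_defaults, and init_config_defaults
+# are shared as mutable default argument values in MPTConfig.__init__ below,
+# which is why that constructor carries a `dangerous-default-value` pylint
+# suppression.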
+ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'} +init_config_defaults: Dict = { + 'name': 'kaiming_normal_', + 'fan_mode': 'fan_in', + 'init_nonlinearity': 'relu', + 'init_div_is_residual': True, + 'emb_init_std': None, + 'emb_init_uniform_lim': None, + 'init_std': None, + 'init_gain': 0.0 +} + + +class MPTConfig(PretrainedConfig): + model_type = 'mpt' + attribute_map = { + 'num_attention_heads': 'n_heads', + 'hidden_size': 'd_model', + 'num_hidden_layers': 'n_layers', + } + + # pylint: disable=dangerous-default-value + def __init__(self, + d_model: int = 2048, + n_heads: int = 16, + n_layers: int = 24, + expansion_ratio: int = 4, + max_seq_len: int = 2048, + vocab_size: int = 50368, + resid_pdrop: float = 0.0, + emb_pdrop: float = 0.0, + learned_pos_emb: bool = True, + attn_config: Dict = attn_config_defaults, + ffn_config: Dict = ffn_config_defaults, + init_device: str = 'cpu', + logit_scale: Optional[Union[float, str]] = None, + no_bias: bool = False, + embedding_fraction: float = 1.0, + norm_type: str = 'low_precision_layernorm', + use_cache: bool = False, + init_config: Dict = init_config_defaults, + fc_type: str = 'torch', + verbose: Optional[int] = None, + **kwargs: Any): + """The MPT configuration class. + Args: + d_model (int): The size of the embedding dimension of the model. + n_heads (int): The number of attention heads. + n_layers (int): The number of layers in the model. + expansion_ratio (int): The ratio of the up/down scale in the ffn. + max_seq_len (int): The maximum sequence length of the model. + vocab_size (int): The size of the vocabulary. + resid_pdrop (float): The dropout probability applied to the attention output before combining with residual. + emb_pdrop (float): The dropout probability for the embedding layer. + learned_pos_emb (bool): Whether to use learned positional embeddings + attn_config (Dict): A dictionary used to configure the model's attention module: + attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention, grouped_query_attention + attn_pdrop (float): The dropout probability for the attention layers. + attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'. + qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer. + clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to + this value. + softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None, + use the default scale of ``1/sqrt(d_keys)``. + prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an + extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix + can attend to one another bi-directionally. Tokens outside the prefix use causal attention. + attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id. + When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates + which sub-sequence each token belongs to. + Defaults to ``False`` meaning any provided `sequence_id` will be ignored. + alibi (bool): Whether to use the alibi bias instead of position embeddings. + alibi_bias_max (int): The maximum value of the alibi bias. + kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads. 
+ ffn_config (Dict): A dictionary used to configure the model's ffn module: + ffn_type (str): type of ffn to use. Options: mptmlp, te_ln_mlp + init_device (str): The device to use for parameter initialization. + logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value. + no_bias (bool): Whether to use bias in all layers. + verbose (int): The verbosity level. 0 is silent. + embedding_fraction (float): The fraction to scale the gradients of the embedding layer by. + norm_type (str): choose type of norm to use + use_cache (bool): Whether or not the model should return the last key/values attentions + init_config (Dict): A dictionary used to configure the model initialization: + init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_', + 'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or + 'xavier_normal_'. These mimic the parameter initialization methods in PyTorch. + init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True. + emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer. + emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution + used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``. + init_std (float): The standard deviation of the normal distribution used to initialize the model, + if using the baseline_ parameter initialization scheme. + init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes. + fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes. + init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes. + --- + See llmfoundry.models.utils.param_init_fns.py for info on other param init config options + fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs. + """ + self.d_model = d_model + self.n_heads = n_heads + self.n_layers = n_layers + self.expansion_ratio = expansion_ratio + self.max_seq_len = max_seq_len + self.vocab_size = vocab_size + self.resid_pdrop = resid_pdrop + self.emb_pdrop = emb_pdrop + self.learned_pos_emb = learned_pos_emb + self.attn_config = attn_config + self.ffn_config = ffn_config + self.init_device = init_device + self.logit_scale = logit_scale + self.no_bias = no_bias + self.embedding_fraction = embedding_fraction + self.norm_type = norm_type + self.use_cache = use_cache + self.init_config = init_config + self.fc_type = fc_type + if verbose is not None: + warnings.warn(DeprecationWarning( + 'verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.' 
+ ), + stacklevel=2) + if 'name' in kwargs: + del kwargs['name'] + if 'loss_fn' in kwargs: + del kwargs['loss_fn'] + if self.attn_config.get('alibi', False): + self.learned_pos_emb = False + warnings.warn( + f'alibi is turned on, setting `learned_pos_emb` to {self.learned_pos_emb}`', + stacklevel=2) + super().__init__(**kwargs) + self._validate_config() + + def _set_config_defaults( + self, config: Dict[str, Any], + config_defaults: Dict[str, Any]) -> Dict[str, Any]: + for (k, v) in config_defaults.items(): + if k not in config: + config[k] = v + return config + + def _validate_config(self) -> None: + self.attn_config = self._set_config_defaults(self.attn_config, + attn_config_defaults) + self.ffn_config = self._set_config_defaults(self.ffn_config, + ffn_config_defaults) + self.init_config = self._set_config_defaults(self.init_config, + init_config_defaults) + if self.d_model % self.n_heads != 0: + raise ValueError('d_model must be divisible by n_heads') + if any(( + prob < 0 or prob > 1 for prob in + [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop] + )): + raise ValueError( + "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1" # pylint: disable=line-too-long + ) + if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']: + raise ValueError( + f"Unknown attn_impl={self.attn_config['attn_impl']}") + if self.attn_config['prefix_lm'] and self.attn_config[ + 'attn_impl'] not in ['torch', 'triton']: + raise NotImplementedError( + 'prefix_lm only implemented with torch and triton attention.') + if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [ + 'torch', 'triton' + ]: + raise NotImplementedError( + 'alibi only implemented with torch and triton attention.') + if self.attn_config['attn_uses_sequence_id'] and self.attn_config[ + 'attn_impl'] not in ['torch', 'triton']: + raise NotImplementedError( + 'attn_uses_sequence_id only implemented with torch and triton attention.' # pylint: disable=line-too-long + ) + if self.embedding_fraction > 1 or self.embedding_fraction <= 0: + raise ValueError( + 'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!' # pylint: disable=line-too-long + ) + if isinstance(self.logit_scale, + str) and self.logit_scale != 'inv_sqrt_d_model': + raise ValueError( + f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'." # pylint: disable=line-too-long + ) + if self.init_config.get('name', None) is None: + raise ValueError( + f"self.init_config={self.init_config!r} 'name' needs to be set." + ) + if not self.learned_pos_emb and (not self.attn_config['alibi']): + warnings.warn( + 'Positional information not being provided to the model.', + stacklevel=2) + if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp': + try: + # pylint: disable=import-outside-toplevel + import transformer_engine.pytorch as te + del te + except Exception as exc: + raise ImportError( + # pylint: disable=line-too-long + 'TransformerEngine import fail. `fc_type: te` requires TransformerEngine be installed. 
' +
+                'The required version of transformer_engine also requires FlashAttention v1.0.6 is installed:\n' +
+                'pip install flash-attn==1.0.6 --no-build-isolation \n' +
+                'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
+            ) from exc
+        if self.ffn_config['ffn_type'] == 'mptmlp':
+            self.ffn_config['fc_type'] = self.fc_type
+        elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
+            self.ffn_config['bias'] = not self.no_bias
diff --git a/vllm/transformers_utils/configs/starcoder2.py b/vllm/transformers_utils/configs/starcoder2.py
new file mode 100644
index 0000000..4c3b6b8
--- /dev/null
+++ b/vllm/transformers_utils/configs/starcoder2.py
@@ -0,0 +1,127 @@
+from transformers import PretrainedConfig
+
+
+class Starcoder2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Starcoder2Model`]. It is used to instantiate a
+    Starcoder2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the
+    [bigcode/starcoder2-7b_16k](https://huggingface.co/bigcode/starcoder2-7b_16k) model.
+
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 49152):
+            Vocabulary size of the Starcoder2 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Starcoder2Model`].
+        hidden_size (`int`, *optional*, defaults to 3072):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 12288):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 30):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 24):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 2):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `2`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used with. Starcoder2's sliding window attention
+            allows sequences of up to 4096*32 tokens.
+        initializer_range (`float`, *optional*, defaults to 0.018042):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        norm_epsilon (`float`, *optional*, defaults to 1e-05):
+            Epsilon value for the layer norm.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+ bos_token_id (`int`, *optional*, defaults to 50256): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 50256): + The id of the "end-of-sequence" token. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + sliding_window (`int`, *optional*): + Sliding window attention window size. If not specified, will default to `None` (no sliding window). + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + residual_dropout (`float`, *optional*, defaults to 0.0): + Residual connection dropout value. + embedding_dropout (`float`, *optional*, defaults to 0.0): + Embedding dropout. + use_bias (`bool`, *optional*, defaults to `True`): + Whether to use bias term on linear layers of the model. + + + ```python + >>> from transformers import Starcoder2Model, Starcoder2Config + + >>> # Initializing a Starcoder2 7B style configuration + >>> configuration = Starcoder2Config() + + >>> # Initializing a model from the Starcoder2 7B style configuration + >>> model = Starcoder2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "starcoder2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=49152, + hidden_size=3072, + intermediate_size=12288, + num_hidden_layers=30, + num_attention_heads=24, + num_key_value_heads=2, + hidden_act="gelu_pytorch_tanh", + max_position_embeddings=4096, + initializer_range=0.018042, + norm_epsilon=1e-5, + use_cache=True, + bos_token_id=50256, + eos_token_id=50256, + rope_theta=10000.0, + sliding_window=None, + attention_dropout=0.0, + residual_dropout=0.0, + embedding_dropout=0.0, + use_bias=True, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + self.use_bias = use_bias + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.norm_epsilon = norm_epsilon + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + self.residual_dropout = residual_dropout + self.embedding_dropout = embedding_dropout + + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + if self.architectures is None: + self.architectures = ['Starcoder2ForCausalLM'] diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py new file mode 100644 index 0000000..6edc225 --- /dev/null +++ b/vllm/transformers_utils/tokenizer.py @@ -0,0 +1,245 @@ +from typing import List, Optional, Tuple, Union + +from transformers import (AutoTokenizer, PreTrainedTokenizer, + PreTrainedTokenizerFast) + +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.utils import make_async, LRUCache +from vllm.transformers_utils.tokenizers import * + +logger = init_logger(__name__) + + +def get_tokenizer( + tokenizer_name: str, + *args, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + tokenizer_revision: Optional[str] = None, + **kwargs, +) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + """Gets a tokenizer for the given model name via Huggingface.""" + if tokenizer_mode == 
"slow": + if kwargs.get("use_fast", False): + raise ValueError( + "Cannot use the fast tokenizer in slow tokenizer mode.") + kwargs["use_fast"] = False + + try: + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name, + *args, + trust_remote_code=trust_remote_code, + tokenizer_revision=tokenizer_revision, + **kwargs) + except ValueError as e: + # If the error pertains to the tokenizer class not existing or not + # currently being imported, suggest using the --trust-remote-code flag. + if (not trust_remote_code and + ("does not exist or is not currently imported." in str(e) + or "requires you to execute the tokenizer file" in str(e))): + err_msg = ( + "Failed to load the tokenizer. If the tokenizer is a custom " + "tokenizer not yet available in the HuggingFace transformers " + "library, consider setting `trust_remote_code=True` in LLM " + "or using the `--trust-remote-code` flag in the CLI.") + raise RuntimeError(err_msg) from e + else: + raise e + except AttributeError as e: + if "BaichuanTokenizer" in str(e): + # This is for the error "'BaichuanTokenizer' object has no + # attribute 'sp_model'". + tokenizer = BaichuanTokenizer.from_pretrained( + tokenizer_name, + *args, + trust_remote_code=trust_remote_code, + tokenizer_revision=tokenizer_revision, + **kwargs) + else: + raise e + + if not isinstance(tokenizer, PreTrainedTokenizerFast): + logger.warning( + "Using a slow tokenizer. This might cause a significant " + "slowdown. Consider using a fast tokenizer instead.") + return tokenizer + + +def get_lora_tokenizer(lora_request: LoRARequest, *args, + **kwargs) -> Optional[PreTrainedTokenizer]: + if lora_request is None: + return None + try: + tokenizer = get_tokenizer(lora_request.lora_local_path, *args, + **kwargs) + except OSError as e: + # No tokenizer was found in the LoRA folder, + # use base model tokenizer + logger.warning( + f"No tokenizer found in {lora_request.lora_local_path}, " + "using base model tokenizer instead. 
" + f"(Exception: {str(e)})") + tokenizer = None + return tokenizer + + +get_lora_tokenizer_async = make_async(get_lora_tokenizer) + + +class TokenizerGroup: + """A group of tokenizers that can be used for LoRA adapters.""" + + def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int, + max_input_length: Optional[int], **tokenizer_config): + self.tokenizer_id = tokenizer_id + self.tokenizer_config = tokenizer_config + self.enable_lora = enable_lora + self.max_input_length = max_input_length + self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config) + if enable_lora: + self.lora_tokenizers = LRUCache(capacity=max_num_seqs) + else: + self.lora_tokenizers = None + + def encode(self, + prompt: str, + request_id: Optional[str] = None, + lora_request: Optional[LoRARequest] = None) -> List[int]: + tokenizer = self.get_lora_tokenizer(lora_request) + return tokenizer.encode(prompt) + + async def encode_async( + self, + prompt: str, + request_id: Optional[str] = None, + lora_request: Optional[LoRARequest] = None) -> List[int]: + tokenizer = await self.get_lora_tokenizer_async(lora_request) + return tokenizer.encode(prompt) + + def get_lora_tokenizer( + self, + lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer": + if not lora_request or not self.enable_lora: + return self.tokenizer + if lora_request.lora_int_id not in self.lora_tokenizers: + tokenizer = (get_lora_tokenizer( + lora_request, **self.tokenizer_config) or self.tokenizer) + self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer) + return tokenizer + else: + return self.lora_tokenizers.get(lora_request.lora_int_id) + + async def get_lora_tokenizer_async( + self, + lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer": + if not lora_request or not self.enable_lora: + return self.tokenizer + if lora_request.lora_int_id not in self.lora_tokenizers: + tokenizer = (await get_lora_tokenizer_async( + lora_request, **self.tokenizer_config) or self.tokenizer) + self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer) + return tokenizer + else: + return self.lora_tokenizers.get(lora_request.lora_int_id) + + +def _convert_tokens_to_string_with_added_encoders( + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + output_tokens: List[str], + skip_special_tokens: bool, + spaces_between_special_tokens: bool, +) -> str: + # Adapted from + # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921 + # NOTE(woosuk): The following code is slow because it runs a for loop over + # the output_tokens. In Python, running a for loop over a list can be slow + # even when the loop body is very simple. 
+ sub_texts = [] + current_sub_text = [] + all_special_tokens = set(tokenizer.all_special_tokens) + for token in output_tokens: + if skip_special_tokens and token in all_special_tokens: + continue + if token in tokenizer.get_added_vocab(): + if current_sub_text: + sub_text = tokenizer.convert_tokens_to_string(current_sub_text) + sub_texts.append(sub_text) + current_sub_text = [] + sub_texts.append(token) + else: + current_sub_text.append(token) + if current_sub_text: + sub_text = tokenizer.convert_tokens_to_string(current_sub_text) + sub_texts.append(sub_text) + if spaces_between_special_tokens: + return " ".join(sub_texts) + else: + return "".join(sub_texts) + + +# Based on +# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15 +# under Apache 2.0 license +def detokenize_incrementally( + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + all_input_ids: List[int], + prev_tokens: Optional[List[str]], + prefix_offset: int = 0, + read_offset: int = 0, + skip_special_tokens: bool = False, + spaces_between_special_tokens: bool = True, +) -> Tuple[List[str], str, int, int]: + new_token_id = all_input_ids[-1] + # This is the first iteration for this sequence + if prev_tokens is None: + new_tokens = tokenizer.convert_ids_to_tokens( + all_input_ids, skip_special_tokens=skip_special_tokens) + output_tokens = new_tokens + # 5 is an arbitrary value that should work for all + # tokenizers (bigger = more conservative). + # Subtract 1 extra to account for the generated token. + prefix_offset = max(len(output_tokens) - 6, 0) + # If the first new token is a special token, we can't skip 1 extra token + if skip_special_tokens and new_token_id in tokenizer.all_special_ids: + read_offset = max(len(output_tokens), 0) + else: + read_offset = max(len(output_tokens) - 1, 0) + else: + # Put new_token_id in a list so skip_special_tokens is respected + new_tokens = tokenizer.convert_ids_to_tokens( + [new_token_id], skip_special_tokens=skip_special_tokens) + output_tokens = prev_tokens + new_tokens + + # The prefix text is necessary only to defeat cleanup algorithms in + # the decode which decide to add a space or not depending on the + # surrounding ids. + if tokenizer.is_fast or not tokenizer.get_added_vocab(): + prefix_text = tokenizer.convert_tokens_to_string( + output_tokens[prefix_offset:read_offset]) + new_text = tokenizer.convert_tokens_to_string( + output_tokens[prefix_offset:]) + else: + prefix_text = _convert_tokens_to_string_with_added_encoders( + tokenizer, + output_tokens[prefix_offset:read_offset], + skip_special_tokens=skip_special_tokens, + spaces_between_special_tokens=spaces_between_special_tokens, + ) + new_text = _convert_tokens_to_string_with_added_encoders( + tokenizer, + output_tokens[prefix_offset:], + skip_special_tokens=skip_special_tokens, + spaces_between_special_tokens=spaces_between_special_tokens, + ) + + if len(new_text) > len(prefix_text) and not new_text.endswith("�"): + # utf-8 char at the end means it's a potential unfinished byte sequence + # from byte fallback tokenization. 
+        # If it's in the middle, it's probably a real invalid id generated
+        # by the model
+        new_text = new_text[len(prefix_text):]
+        return new_tokens, new_text, read_offset, len(output_tokens)
+    else:
+        return new_tokens, "", prefix_offset, read_offset
diff --git a/vllm/transformers_utils/tokenizers/__init__.py b/vllm/transformers_utils/tokenizers/__init__.py
new file mode 100644
index 0000000..e6b5972
--- /dev/null
+++ b/vllm/transformers_utils/tokenizers/__init__.py
@@ -0,0 +1,5 @@
+from vllm.transformers_utils.tokenizers.baichuan import BaichuanTokenizer
+
+__all__ = [
+    "BaichuanTokenizer",
+]
diff --git a/vllm/transformers_utils/tokenizers/baichuan.py b/vllm/transformers_utils/tokenizers/baichuan.py
new file mode 100644
index 0000000..1dd241e
--- /dev/null
+++ b/vllm/transformers_utils/tokenizers/baichuan.py
@@ -0,0 +1,263 @@
+# yapf: disable
+# Adapted from
+# https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py
+# This includes a fix suggested in
+# https://github.com/vllm-project/vllm/issues/1403#issuecomment-1767503058
+# Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved.
+
+import os
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple
+
+import sentencepiece as spm
+from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {},
+    "tokenizer_file": {},
+}
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
+
+
+class BaichuanTokenizer(PreTrainedTokenizer):
+    """
+    Construct a Baichuan tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + unk_token="", + bos_token="", + eos_token="", + pad_token=None, + sp_model_kwargs: Optional[Dict[str, Any]] = None, + add_bos_token=True, + add_eos_token=False, + clean_up_tokenization_spaces=False, + **kwargs, + ): + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + bos_token = ( + AddedToken(bos_token, lstrip=False, rstrip=False) + if isinstance(bos_token, str) + else bos_token + ) + eos_token = ( + AddedToken(eos_token, lstrip=False, rstrip=False) + if isinstance(eos_token, str) + else eos_token + ) + unk_token = ( + AddedToken(unk_token, lstrip=False, rstrip=False) + if isinstance(unk_token, str) + else unk_token + ) + pad_token = ( + AddedToken(pad_token, lstrip=False, rstrip=False) + if isinstance(pad_token, str) + else pad_token + ) + self.vocab_file = vocab_file + self.add_bos_token = add_bos_token + self.add_eos_token = add_eos_token + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + add_bos_token=add_bos_token, + add_eos_token=add_eos_token, + sp_model_kwargs=self.sp_model_kwargs, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab_file) + + @property + def vocab_size(self): + """Returns vocab size""" + return self.sp_model.get_piece_size() + + def get_vocab(self): + """Returns vocab as a dict""" + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text): + """Returns a tokenized string.""" + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.piece_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = self.sp_model.IdToPiece(index) + return token + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for i, token in enumerate(tokens): + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special and i != 0: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + return out_string + + def save_vocabulary( + self, save_directory, filename_prefix: Optional[str] = None + ) -> Tuple[str]: + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, + (filename_prefix + "-" if filename_prefix else "") + + VOCAB_FILES_NAMES["vocab_file"], + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath( + out_vocab_file + ) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + bos_token_id = [self.bos_token_id] if self.add_bos_token else [] + eos_token_id = [self.eos_token_id] if self.add_eos_token else [] + + output = bos_token_id + token_ids_0 + eos_token_id + + if token_ids_1 is not None: + output = output + bos_token_id + token_ids_1 + eos_token_id + + return output + + def get_special_tokens_mask( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False, + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, + token_ids_1=token_ids_1, + already_has_special_tokens=True, + ) + + bos_token_id = [1] if self.add_bos_token else [] + eos_token_id = [1] if self.add_eos_token else [] + + if token_ids_1 is None: + return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + return ( + bos_token_id + + ([0] * len(token_ids_0)) + + eos_token_id + + bos_token_id + + ([0] * len(token_ids_1)) + + eos_token_id + ) + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT + sequence pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + if token_ids_1 is None, only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of ids. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). 
+ """ + bos_token_id = [self.bos_token_id] if self.add_bos_token else [] + eos_token_id = [self.eos_token_id] if self.add_eos_token else [] + + output = [0] * len(bos_token_id + token_ids_0 + eos_token_id) + + if token_ids_1 is not None: + output += [1] * len(bos_token_id + token_ids_1 + eos_token_id) + + return output diff --git a/vllm/utils.py b/vllm/utils.py new file mode 100644 index 0000000..a4f9bfe --- /dev/null +++ b/vllm/utils.py @@ -0,0 +1,311 @@ +import enum +import os +import socket +import subprocess +import uuid +from platform import uname +from typing import List, Tuple, Union +from packaging.version import parse, Version + +import psutil +import torch +import asyncio +from functools import partial +from typing import ( + Awaitable, + Callable, + TypeVar, +) +from collections import OrderedDict +from typing import Any, Hashable, Optional + +from vllm.logger import init_logger + +T = TypeVar("T") +logger = init_logger(__name__) + +STR_DTYPE_TO_TORCH_DTYPE = { + "half": torch.half, + "bfloat16": torch.bfloat16, + "float": torch.float, + "fp8_e5m2": torch.uint8, +} + + +class Device(enum.Enum): + GPU = enum.auto() + CPU = enum.auto() + + +class Counter: + + def __init__(self, start: int = 0) -> None: + self.counter = start + + def __next__(self) -> int: + i = self.counter + self.counter += 1 + return i + + def reset(self) -> None: + self.counter = 0 + + +class LRUCache: + + def __init__(self, capacity: int): + self.cache = OrderedDict() + self.capacity = capacity + + def __contains__(self, key: Hashable) -> bool: + return key in self.cache + + def __len__(self) -> int: + return len(self.cache) + + def __getitem__(self, key: Hashable) -> Any: + return self.get(key) + + def __setitem__(self, key: Hashable, value: Any) -> None: + self.put(key, value) + + def __delitem__(self, key: Hashable) -> None: + self.pop(key) + + def touch(self, key: Hashable) -> None: + self.cache.move_to_end(key) + + def get(self, key: Hashable, default_value: Optional[Any] = None) -> int: + if key in self.cache: + value = self.cache[key] + self.cache.move_to_end(key) + else: + value = default_value + return value + + def put(self, key: Hashable, value: Any) -> None: + self.cache[key] = value + self.cache.move_to_end(key) + self._remove_old_if_needed() + + def _on_remove(self, key: Hashable, value: Any): + pass + + def remove_oldest(self): + if not self.cache: + return + key, value = self.cache.popitem(last=False) + self._on_remove(key, value) + + def _remove_old_if_needed(self) -> None: + while len(self.cache) > self.capacity: + self.remove_oldest() + + def pop(self, key: int, default_value: Optional[Any] = None) -> Any: + run_on_remove = key in self.cache + value = self.cache.pop(key, default_value) + if run_on_remove: + self._on_remove(key, value) + return value + + def clear(self): + while len(self.cache) > 0: + self.remove_oldest() + self.cache.clear() + + +def is_hip() -> bool: + return torch.version.hip is not None + + +def is_neuron() -> bool: + try: + import transformers_neuronx + except ImportError: + transformers_neuronx = None + return transformers_neuronx is not None + + +def get_max_shared_memory_bytes(gpu: int = 0) -> int: + """Returns the maximum shared memory per thread block in bytes.""" + # NOTE: This import statement should be executed lazily since + # the Neuron-X backend does not have the `cuda_utils` module. 
+ from vllm._C import cuda_utils + + max_shared_mem = cuda_utils.get_max_shared_memory_per_block_device_attribute( + gpu) + # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py will fail + assert max_shared_mem > 0, "max_shared_mem can not be zero" + return int(max_shared_mem) + + +def get_cpu_memory() -> int: + """Returns the total CPU memory of the node in bytes.""" + return psutil.virtual_memory().total + + +def random_uuid() -> str: + return str(uuid.uuid4().hex) + + +def in_wsl() -> bool: + # Reference: https://github.com/microsoft/WSL/issues/4071 + return "microsoft" in " ".join(uname()).lower() + + +def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]: + """Take a blocking function, and run it on in an executor thread. + + This function prevents the blocking function from blocking the + asyncio event loop. + The code in this function needs to be thread safe. + """ + + def _async_wrapper(*args, **kwargs) -> asyncio.Future: + loop = asyncio.get_event_loop() + p_func = partial(func, *args, **kwargs) + return loop.run_in_executor(executor=None, func=p_func) + + return _async_wrapper + + +def get_ip() -> str: + # try ipv4 + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + try: + s.connect(("dns.google", 80)) # Doesn't need to be reachable + return s.getsockname()[0] + except OSError: + # try ipv6 + s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) + s.connect(("dns.google", 80)) + return s.getsockname()[0] + + +def get_distributed_init_method(ip: str, port: int) -> str: + return f"tcp://{ip}:{port}" + + +def get_open_port() -> int: + # try ipv4 + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] + except OSError: + # try ipv6 + with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] + + +def set_cuda_visible_devices(device_ids: List[int]) -> None: + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, device_ids)) + + +def get_nvcc_cuda_version() -> Optional[Version]: + cuda_home = os.environ.get('CUDA_HOME') + if not cuda_home: + cuda_home = '/usr/local/cuda' + if os.path.isfile(cuda_home + '/bin/nvcc'): + logger.info( + f'CUDA_HOME is not found in the environment. Using {cuda_home} as CUDA_HOME.' + ) + else: + logger.warning( + f'Not found nvcc in {cuda_home}. Skip cuda version check!') + return None + nvcc_output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"], + universal_newlines=True) + output = nvcc_output.split() + release_idx = output.index("release") + 1 + nvcc_cuda_version = parse(output[release_idx].split(",")[0]) + return nvcc_cuda_version + + +def _generate_random_fp8_e5m2( + tensor: torch.tensor, + low: float, + high: float, +) -> None: + # NOTE(zhaoyang): Due to NaN and Inf representation for fp8 data type, + # it may occur Inf or NaN if we directly use torch.randint + # to generate random data for fp8 data. + # For example, s.11111.00 in fp8e5m2 format represents Inf. 
+    #     | E4M3       | E5M2
+    #-----|------------|-------------------
+    # Inf | N/A        | s.11111.00
+    # NaN | s.1111.111 | s.11111.{01,10,11}
+    from vllm._C import cache_ops
+    tensor_tmp = torch.empty_like(tensor, dtype=torch.float16)
+    tensor_tmp.uniform_(low, high)
+    cache_ops.convert_fp8_e5m2(tensor_tmp, tensor)
+    del tensor_tmp
+
+
+def create_kv_caches_with_random(
+    num_blocks: int,
+    block_size: int,
+    num_layers: int,
+    num_heads: int,
+    head_size: int,
+    cache_dtype: Optional[Union[str, torch.dtype]],
+    model_dtype: Optional[Union[str, torch.dtype]] = None,
+    seed: Optional[int] = 0,
+    device: Optional[str] = "cuda",
+) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+
+    if isinstance(cache_dtype, str):
+        if cache_dtype == "auto":
+            if isinstance(model_dtype, str):
+                torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[model_dtype]
+            elif isinstance(model_dtype, torch.dtype):
+                torch_dtype = model_dtype
+            else:
+                raise ValueError(f"Invalid model dtype: {model_dtype}")
+        elif cache_dtype in ["half", "bfloat16", "float"]:
+            torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype]
+        elif cache_dtype == "fp8_e5m2":
+            torch_dtype = torch.uint8
+        else:
+            raise ValueError(f"Invalid kv cache dtype: {cache_dtype}")
+    elif isinstance(cache_dtype, torch.dtype):
+        torch_dtype = cache_dtype
+    else:
+        raise ValueError(f"Invalid kv cache dtype: {cache_dtype}")
+
+    scale = head_size**-0.5
+    x = 16 // torch.tensor([], dtype=torch_dtype).element_size()
+    key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
+    key_caches = []
+    for _ in range(num_layers):
+        key_cache = torch.empty(size=key_cache_shape,
+                                dtype=torch_dtype,
+                                device=device)
+        if cache_dtype == 'fp8_e5m2':
+            _generate_random_fp8_e5m2(key_cache, -scale, scale)
+        elif torch_dtype in [torch.half, torch.bfloat16, torch.float]:
+            key_cache.uniform_(-scale, scale)
+        else:
+            raise ValueError(
+                f"Does not support key cache of type {cache_dtype}")
+        key_caches.append(key_cache)
+
+    value_cache_shape = (num_blocks, num_heads, head_size, block_size)
+    value_caches = []
+    for _ in range(num_layers):
+        value_cache = torch.empty(size=value_cache_shape,
+                                  dtype=torch_dtype,
+                                  device=device)
+        if cache_dtype == 'fp8_e5m2':
+            _generate_random_fp8_e5m2(value_cache, -scale, scale)
+        elif torch_dtype in [torch.half, torch.bfloat16, torch.float]:
+            value_cache.uniform_(-scale, scale)
+        else:
+            raise ValueError(
+                f"Does not support value cache of type {cache_dtype}")
+        value_caches.append(value_cache)
+    return key_caches, value_caches
diff --git a/vllm/worker/__init__.py b/vllm/worker/__init__.py
new file mode 100644
index 0000000..e69de29
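A small usage sketch for the `LRUCache` defined in vllm/utils.py above (keys, values, and capacity are illustrative):

```python
from vllm.utils import LRUCache

cache = LRUCache(capacity=2)
cache.put("a", 1)
cache.put("b", 2)
cache.get("a")     # touches "a", so "b" becomes the least recently used
cache.put("c", 3)  # over capacity: remove_oldest() evicts "b"
assert "a" in cache and "b" not in cache
```

`TokenizerGroup` above relies on exactly this eviction to bound the number of per-LoRA tokenizers it keeps (`capacity=max_num_seqs`).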
zk_a#rX;I~IBGMWYSFT*ze)2bT(VL8UX z8#UukM&0?hg{$@{tAu$+<~UaUi`grVZB!ieIKbAF*qkq|lfW%QV)IMtYZiU8SjoaH z*HJc4f1-w)Vni-QhGkK7;=-ys=%mrDl&|Qbs3bH^(zW>Z3-vE9efEZR2$&{ z*4w~}@Zl9$Nky8>-ZI6bfx)V)Fgfi44h2cuQB^T->)BN z(wM7Nx3TKWeGTAf;+@(<=#TGo`uEw)9))XF*ekrP`tGH>ql~n4vL|9tcOnK|+Kpw5 zyfoQQ3B^e8fbGDK9_6)Nr1T1Z9YHmvjUt9LBZ2{zAbjZ}SdW7YJx6kLOLC-)_uR|7 zWp_JWa(9i^((c;Cnv)4QSEiy6pTO^OL&QH6#2?)tE%%B|dS{~`=cLWOGNPRXSu*OJq0YJDuaMDo(p`J=y|Bk4{9iHt??bjPVy`k#82(u@!YrnN z4bGFy8p>?PNU$lG!QDcx+HmVX3bdSS_ zH1o)ekzO81U4&#^W#~OBvtudsZ?8*zfAxJyY414Q;xJPk^n&#;3bP$=YNDA;kRY!Y z`Pe7;c(d&!hQGq$M~)`g6mZLtN9grFW;`T%k-BuGmm;W5ZR(2PRQ{O2w*Ybr$uDI3 zC#;S%Owz7M>n3g3%u6Ea)hKnkr1HATgh|_Y#9oqZ^p44)Dxa_1q(q1Y^_+i2&(eth zf}$;{&azQZi+UHt}AN1IJEBvzClHRL?_ z;qpH~WxUCZdkeAyypF$1mz2-0;zgBgIMl7krLyQUvF^>t+2oWh6^^C!20TZ$D`Jt9 zE3 None: + self.cache_config = cache_config + self.model_config = model_config + self.parallel_config = parallel_config + + self.head_size = model_config.get_head_size() + self.num_layers = model_config.get_num_layers(parallel_config) + self.num_heads = model_config.get_num_kv_heads(parallel_config) + + self.block_size = cache_config.block_size + self.num_gpu_blocks = cache_config.num_gpu_blocks + self.num_cpu_blocks = cache_config.num_cpu_blocks + + # Skip initializing CUDA stream and buffer for Neuron backend. + if is_neuron(): + return + + if cache_config.cache_dtype == "auto": + self.dtype = model_config.dtype + else: + self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] + + # Initialize the cache. + self.gpu_cache = self.allocate_gpu_cache() + self.cpu_cache = self.allocate_cpu_cache() + + # Initialize the stream for caching operations. + self.cache_stream = torch.cuda.Stream() + assert self.cache_stream != torch.cuda.current_stream() + # Initialize the events for stream synchronization. 
+ self.events = [torch.cuda.Event() for _ in range(self.num_layers)] + + def get_key_block_shape(self) -> Tuple[int, int, int, int]: + element_size = torch.tensor([], dtype=self.dtype).element_size() + x = 16 // element_size + use_v2 = self.head_size == 128 and self.block_size == 16 and enable_infer_paged_attn is None + if use_v2: + return ( + self.num_heads, + self.block_size, + self.head_size, + ) + else: + return ( + self.num_heads, + self.head_size // x, + self.block_size, + x, + ) + + def get_value_block_shape(self) -> Tuple[int, int, int]: + use_v2 = self.head_size == 128 and self.block_size == 16 and enable_infer_paged_attn is None + if use_v2: + return ( + self.num_heads, + self.block_size, + self.head_size, + ) + else: + return ( + self.num_heads, + self.head_size, + self.block_size, + ) + + # TODO align + """ + def get_key_block_shape(self) -> Tuple[int, int, int, int]: + element_size = torch.tensor([], dtype=self.dtype).element_size() + x = 16 // element_size + return ( + self.num_heads, + self.head_size // x, + self.block_size, + x, + ) + + def get_value_block_shape(self) -> Tuple[int, int, int]: + return ( + self.num_heads, + self.head_size, + self.block_size, + ) + """ + def allocate_gpu_cache(self) -> List[KVCache]: + gpu_cache: List[KVCache] = [] + key_block_shape = self.get_key_block_shape() + value_block_shape = self.get_value_block_shape() + for _ in range(self.num_layers): + key_blocks = torch.zeros( + size=(self.num_gpu_blocks, *key_block_shape), + dtype=self.dtype, + device="cuda", + ) + value_blocks = torch.zeros( + size=(self.num_gpu_blocks, *value_block_shape), + dtype=self.dtype, + device="cuda", + ) + gpu_cache.append((key_blocks, value_blocks)) + return gpu_cache + + def allocate_cpu_cache(self) -> List[KVCache]: + cpu_cache: List[KVCache] = [] + key_block_shape = self.get_key_block_shape() + value_block_shape = self.get_value_block_shape() + pin_memory = not in_wsl() + if not pin_memory: + # Pinning memory in WSL is not supported. + # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications + logger.warning("Using 'pin_memory=False' as WSL is detected. " + "This may slow down the performance.") + for _ in range(self.num_layers): + key_blocks = torch.zeros( + size=(self.num_cpu_blocks, *key_block_shape), + dtype=self.dtype, + pin_memory=pin_memory, + device="cpu", + ) + value_blocks = torch.zeros( + size=(self.num_cpu_blocks, *value_block_shape), + dtype=self.dtype, + pin_memory=pin_memory, + device="cpu", + ) + cpu_cache.append((key_blocks, value_blocks)) + return cpu_cache + + def _swap( + self, + src: List[KVCache], + dst: List[KVCache], + src_to_dst: Dict[int, int], + ) -> None: + from vllm._C import cache_ops + + with torch.cuda.stream(self.cache_stream): + for i in range(self.num_layers): + src_key_cache, src_value_cache = src[i] + dst_key_cache, dst_value_cache = dst[i] + # Copy the key blocks. + cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) + # Copy the value blocks. 
+ cache_ops.swap_blocks(src_value_cache, dst_value_cache, + src_to_dst) + event = self.events[i] + event.record(stream=self.cache_stream) + + def swap_in(self, src_to_dst: Dict[int, int]) -> None: + self._swap(self.cpu_cache, self.gpu_cache, src_to_dst) + + def swap_out(self, src_to_dst: Dict[int, int]) -> None: + self._swap(self.gpu_cache, self.cpu_cache, src_to_dst) + + def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: + from vllm._C import cache_ops + + key_caches = [key_cache for key_cache, _ in self.gpu_cache] + value_caches = [value_cache for _, value_cache in self.gpu_cache] + # NOTE(woosuk): This operation implicitly synchronizes the CPU and GPU. + cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts) + + @staticmethod + def get_cache_block_size( + block_size: int, + cache_dtype: str, + model_config: ModelConfig, + parallel_config: ParallelConfig, + ) -> int: + head_size = model_config.get_head_size() + num_heads = model_config.get_num_kv_heads(parallel_config) + num_layers = model_config.get_num_layers(parallel_config) + + key_cache_block = block_size * num_heads * head_size + value_cache_block = key_cache_block + total = num_layers * (key_cache_block + value_cache_block) + if cache_dtype == "auto": + dtype = model_config.dtype + else: + dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] + dtype_size = _get_dtype_size(dtype) + return dtype_size * total + + +def _get_dtype_size(dtype: torch.dtype) -> int: + return torch.tensor([], dtype=dtype).element_size() diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py new file mode 100644 index 0000000..3885993 --- /dev/null +++ b/vllm/worker/model_runner.py @@ -0,0 +1,1223 @@ +import contextlib +import time +from typing import Dict, List, Optional, Tuple, Set, Union + +import numpy as np +import torch +import torch.nn as nn + +from vllm.config import (DeviceConfig, ModelConfig, LoRAConfig, ParallelConfig, + SchedulerConfig) +from vllm.logger import init_logger +from vllm.model_executor import get_model, InputMetadata, SamplingMetadata +from vllm.model_executor.parallel_utils import cupy_utils +from vllm.model_executor.parallel_utils.communication_op import ( + broadcast_tensor_dict) +from vllm.model_executor.parallel_utils.parallel_state import ( + with_cupy_nccl_for_all_reduce) +from vllm.model_executor.parallel_utils import custom_all_reduce +from vllm.sampling_params import SamplingParams, SamplingType +from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata +from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager +from vllm.lora.layers import LoRAMapping +from vllm.lora.request import LoRARequest +from vllm.utils import in_wsl + +logger = init_logger(__name__) + +KVCache = Tuple[torch.Tensor, torch.Tensor] +_PAD_SLOT_ID = -1 +LORA_WARMUP_RANK = 8 +# Capture graphs for batch size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256. +# NOTE: _get_graph_batch_size needs to be updated if this list is changed. 
+_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)] + + +class ModelRunner: + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + kv_cache_dtype: Optional[str] = "auto", + is_driver_worker: bool = False, + ): + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.lora_config = lora_config + self.is_driver_worker = False + # TODO align + """ + self.is_driver_worker = is_driver_worker + """ + + # model_config can be None in tests/samplers/test_sampler.py. + # FIXME(woosuk): This is a hack to make the tests work. Refactor this. + self.sliding_window = (model_config.get_sliding_window() + if model_config is not None else None) + self.device_config = (device_config + if device_config is not None else DeviceConfig()) + self.device = self.device_config.device + + self.model = None + self.block_size = None # Set after initial profiling. + self.lora_manager = None + + self.graph_runners: Dict[int, CUDAGraphRunner] = {} + self.graph_memory_pool = None # Set during graph capture. + + self.max_context_len_to_capture = ( + self.model_config.max_context_len_to_capture + if self.model_config is not None else 0) + # When using CUDA graph, the input block tables must be padded to + # max_context_len_to_capture. However, creating the block table in + # Python can be expensive. To optimize this, we cache the block table + # in numpy and only copy the actual input content at every iteration. + # The shape of the cached block table will be + # (max batch size to capture, max context len to capture / block size). + self.graph_block_tables = None # Set after initial profiling. 
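+        # For example (illustrative values; the real ones come from the model
+        # and cache configs): with max_context_len_to_capture=8192 and
+        # block_size=16, set_block_size() allocates graph_block_tables as an
+        # int32 array of shape (max(_BATCH_SIZES_TO_CAPTURE), 512) = (256, 512),
+        # which is reused every decode step instead of rebuilding Python lists.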
+ # cache in_wsl result + self.in_wsl = in_wsl() + self.kv_cache_dtype = kv_cache_dtype + + # Set enforce_eager to True for Neuron backend, to avoid capturing graph + if self.device_config.is_neuron: + self.model_config.enforce_eager = True + + def load_model(self) -> None: + self.model = get_model(self.model_config, + self.device_config, + lora_config=self.lora_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config) + + vocab_size = self.model.config.vocab_size + + if self.lora_config: + assert hasattr( + self.model, "supported_lora_modules" + ) and self.model.supported_lora_modules, "Model does not support LoRA" + assert hasattr( + self.model, + "embedding_modules"), "Model does not have embedding_modules" + assert hasattr(self.model, "embedding_padding_modules" + ), "Model does not have embedding_padding_modules" + self.lora_manager = LRUCacheWorkerLoRAManager( + self.scheduler_config.max_num_seqs, + self.scheduler_config.max_num_batched_tokens + + self.scheduler_config.max_paddings, vocab_size, + self.lora_config, self.device, self.model.embedding_modules, + self.model.embedding_padding_modules) + self.model = self.lora_manager.create_lora_manager(self.model) + + def set_block_size(self, block_size: int) -> None: + self.block_size = block_size + + max_num_blocks = (self.max_context_len_to_capture + block_size - + 1) // block_size + self.graph_block_tables = np.zeros( + (max(_BATCH_SIZES_TO_CAPTURE), max_num_blocks), dtype=np.int32) + + def _prepare_prompt( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata, List[int], List[int], + List[int], List[int], Set[LoRARequest]]: + assert len(seq_group_metadata_list) > 0 + input_tokens: List[int] = [] + input_positions: List[int] = [] + slot_mapping: List[List[int]] = [] + lora_index_mapping: List[int] = [] + lora_prompt_mapping: List[int] = [] + lora_requests: Set[LoRARequest] = set() + + prompt_lens: List[int] = [] + context_lens: List[int] = [] + subquery_lens: List[int] = [] + prefix_block_tables: List[List[int]] = [] + for seq_group_metadata in seq_group_metadata_list: + assert seq_group_metadata.is_prompt + seq_ids = list(seq_group_metadata.seq_data.keys()) + assert len(seq_ids) == 1 + seq_id = seq_ids[0] + + seq_data = seq_group_metadata.seq_data[seq_id] + prompt_tokens = seq_data.get_token_ids() + prompt_len = len(prompt_tokens) + prompt_lens.append(prompt_len) + prefix_len = 0 + prefix = seq_group_metadata.prefix + if prefix is not None and prefix.computed: + prefix_len = prefix.get_length() + prompt_tokens = prompt_tokens[prefix_len:] + prefix_block_tables.append(prefix.get_block_numbers()) + else: + prefix_block_tables.append([]) + # actual prompt lens + context_lens.append(prefix_len) + subquery_lens.append(prompt_len - prefix_len) + + input_tokens.extend(prompt_tokens) + input_positions.extend(list(range(prefix_len, prefix_len + len(prompt_tokens)))) + + lora_id = seq_group_metadata.lora_int_id + + if lora_id > 0: + lora_requests.add(seq_group_metadata.lora_request) + + lora_index_mapping.append([lora_id] * (prompt_len - prefix_len)) + lora_prompt_mapping.extend( + [lora_id] * + (prompt_len - prefix_len + if seq_group_metadata.sampling_params.prompt_logprobs else 1)) + + if seq_group_metadata.block_tables is None: + # During memory profiling, the block tables are not initialized + # yet. In this case, we just use a dummy slot mapping. + slot_mapping.append([_PAD_SLOT_ID] * prompt_len) + continue + + # Compute the slot mapping. 
+ slot_mapping.append([]) + block_table = seq_group_metadata.block_tables[seq_id] + # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, + # where start_idx is max(0, prompt_len - sliding_window). + # For example, if the prompt len is 10, sliding window is 8, and + # block size is 4, the first two tokens are masked and the slot + # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. + start_idx = 0 + if self.sliding_window is not None: + assert prefix_len == 0, ( + "Prefix caching is currently not supported with " + "sliding window attention") + start_idx = max(0, prompt_len - self.sliding_window) + for i in range(prompt_len): + if i < start_idx: + slot_mapping[-1].append(_PAD_SLOT_ID) + continue + + block_number = block_table[i // self.block_size] + block_offset = i % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping[-1].append(slot) + + max_prompt_len = max(subquery_lens) + slot_mappings = [] + lora_index_mappints = [] + for mapping in slot_mapping: + slot_mappings.extend(mapping) + for mapping in lora_index_mapping: + lora_index_mappints.extend(mapping) + + input_tokens = torch.tensor(input_tokens, dtype=torch.long, device=self.device) + input_positions = torch.tensor(input_positions, dtype=torch.long, device=self.device) + slot_mapping = torch.tensor(slot_mappings, dtype=torch.int, device=self.device) + + context_lens_tensor = torch.tensor(context_lens, + dtype=torch.int, + device=self.device) + # Prepare prefix block tables + max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) + block_tables = _make_tensor_with_pad( + prefix_block_tables, + max_len=max_prompt_block_table_len, + pad=0, + dtype=torch.int, + device=self.device, + ) + + prompt_lens_tensor = torch.tensor(prompt_lens, + dtype=torch.int, + device=self.device) + start_loc_tensor = prompt_lens_tensor.cumsum(dim=-1) + + input_metadata = InputMetadata( + prompt_lens=prompt_lens, + is_prompt=True, + slot_mapping=slot_mapping, + max_seq_len=max_prompt_len, + start_loc=start_loc_tensor, + max_context_len=None, + context_lens=context_lens_tensor, + block_tables=block_tables, + use_cuda_graph=False, + kv_cache_dtype=self.kv_cache_dtype, + ) + return (input_tokens, input_positions, input_metadata, prompt_lens, + subquery_lens, lora_index_mapping, lora_prompt_mapping, + lora_requests) + + # TODO align + """ + def _prepare_prompt( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata, List[int], List[int], + List[int], List[int], Set[LoRARequest]]: + assert len(seq_group_metadata_list) > 0 + input_tokens: List[List[int]] = [] + input_positions: List[List[int]] = [] + slot_mapping: List[List[int]] = [] + lora_index_mapping: List[int] = [] + lora_prompt_mapping: List[int] = [] + lora_requests: Set[LoRARequest] = set() + + prompt_lens: List[int] = [] + context_lens: List[int] = [] + subquery_lens: List[int] = [] + prefix_block_tables: List[List[int]] = [] + for seq_group_metadata in seq_group_metadata_list: + assert seq_group_metadata.is_prompt + seq_ids = list(seq_group_metadata.seq_data.keys()) + assert len(seq_ids) == 1 + seq_id = seq_ids[0] + + seq_data = seq_group_metadata.seq_data[seq_id] + prompt_tokens = seq_data.get_token_ids() + prompt_len = len(prompt_tokens) + prompt_lens.append(prompt_len) + prefix_len = 0 + prefix = seq_group_metadata.prefix + if prefix is not None and prefix.computed: + prefix_len = prefix.get_length() + prompt_tokens = prompt_tokens[prefix_len:] + 
prefix_block_tables.append(prefix.get_block_numbers()) + else: + prefix_block_tables.append([]) + # actual prompt lens + context_lens.append(prefix_len) + subquery_lens.append(prompt_len - prefix_len) + + input_tokens.append(prompt_tokens) + # NOTE(woosuk): Here we assume that the first token in the prompt + # is always the first token in the sequence. + input_positions.append( + list(range(prefix_len, prefix_len + len(prompt_tokens)))) + + lora_id = seq_group_metadata.lora_int_id + + if lora_id > 0: + lora_requests.add(seq_group_metadata.lora_request) + + lora_index_mapping.append([lora_id] * (prompt_len - prefix_len)) + lora_prompt_mapping.extend( + [lora_id] * + (prompt_len - prefix_len + if seq_group_metadata.sampling_params.prompt_logprobs else 1)) + + if seq_group_metadata.block_tables is None: + # During memory profiling, the block tables are not initialized + # yet. In this case, we just use a dummy slot mapping. + slot_mapping.append([_PAD_SLOT_ID] * prompt_len) + continue + + # Compute the slot mapping. + slot_mapping.append([]) + block_table = seq_group_metadata.block_tables[seq_id] + # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, + # where start_idx is max(0, prompt_len - sliding_window). + # For example, if the prompt len is 10, sliding window is 8, and + # block size is 4, the first two tokens are masked and the slot + # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. + start_idx = 0 + if self.sliding_window is not None: + assert prefix_len == 0, ( + "Prefix caching is currently not supported with " + "sliding window attention") + start_idx = max(0, prompt_len - self.sliding_window) + for i in range(prefix_len, prompt_len): + if i < start_idx: + slot_mapping[-1].append(_PAD_SLOT_ID) + continue + + block_number = block_table[i // self.block_size] + block_offset = i % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping[-1].append(slot) + + max_prompt_len = max(subquery_lens) + input_tokens = _make_tensor_with_pad(input_tokens, + max_prompt_len, + pad=0, + dtype=torch.long, + device=self.device) + input_positions = _make_tensor_with_pad(input_positions, + max_prompt_len, + pad=0, + dtype=torch.long, + device=self.device) + slot_mapping = _make_tensor_with_pad(slot_mapping, + max_prompt_len, + pad=_PAD_SLOT_ID, + dtype=torch.long, + device=self.device) + lora_index_mapping = [ + _pad_to_max(mapping, max_prompt_len, pad=0) + for mapping in lora_index_mapping + ] + context_lens_tensor = torch.tensor(context_lens, + dtype=torch.int, + device=self.device) + # Prepare prefix block tables + max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) + block_tables = _make_tensor_with_pad( + prefix_block_tables, + max_len=max_prompt_block_table_len, + pad=0, + dtype=torch.int, + device=self.device, + ) + start_loc_tensor = torch.arange(0, + len(prompt_lens) * max_prompt_len, + max_prompt_len, + dtype=torch.long, + device=self.device) + prompt_lens_tensor = torch.tensor(prompt_lens, + dtype=torch.long, + device=self.device) + + input_metadata = InputMetadata( + is_prompt=True, + slot_mapping=slot_mapping, + prompt_lens=prompt_lens_tensor, + max_seq_len=max_prompt_len, + start_loc=start_loc_tensor, + max_context_len=None, + context_lens=context_lens_tensor, + block_tables=block_tables, + use_cuda_graph=False, + kv_cache_dtype=self.kv_cache_dtype, + ) + return (input_tokens, input_positions, input_metadata, prompt_lens, + subquery_lens, lora_index_mapping, lora_prompt_mapping, + lora_requests) + """ + + def _prepare_decode( + 
self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata]: + assert len(seq_group_metadata_list) > 0 + input_tokens: List[int] = [] + input_positions: List[int] = [] + slot_mapping: List[int] = [] + context_lens: List[int] = [] + block_tables: List[List[int]] = [] + lora_index_mapping: List[int] = [] + lora_prompt_mapping: List[int] = [] + lora_requests: Set[LoRARequest] = set() + + for seq_group_metadata in seq_group_metadata_list: + assert not seq_group_metadata.is_prompt + + seq_ids = list(seq_group_metadata.seq_data.keys()) + lora_id = seq_group_metadata.lora_int_id + + if lora_id > 0: + lora_requests.add(seq_group_metadata.lora_request) + + for seq_id in seq_ids: + seq_data = seq_group_metadata.seq_data[seq_id] + generation_token = seq_data.get_last_token_id() + input_tokens.extend([generation_token]) + + seq_len = seq_data.get_len() + position = seq_len - 1 + input_positions.extend([position]) + + context_len = seq_len if self.sliding_window is None else min( + seq_len, self.sliding_window) + context_lens.append(context_len) + + block_table = seq_group_metadata.block_tables[seq_id] + block_number = block_table[position // self.block_size] + block_offset = position % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping.extend([slot]) + lora_index_mapping.append([lora_id]) + lora_prompt_mapping.append(lora_id) + + if self.sliding_window is not None: + sliding_window_blocks = (self.sliding_window // + self.block_size) + block_table = block_table[-sliding_window_blocks:] + block_tables.append(block_table) + + batch_size = len(input_tokens) + max_context_len = max(context_lens) + + use_captured_graph = ( + not self.model_config.enforce_eager + and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] + and max_context_len <= self.max_context_len_to_capture) + + if use_captured_graph: + # Pad the input tokens, positions, and slot mapping to match the + # batch size of the captured graph. + graph_batch_size = _get_graph_batch_size(batch_size) + assert graph_batch_size >= batch_size + input_tokens.extend([0] * (graph_batch_size - batch_size)) + input_positions.extend([0] * (graph_batch_size - batch_size)) + slot_mapping.extend([_PAD_SLOT_ID] * (graph_batch_size - batch_size)) + context_lens.extend([1] * (graph_batch_size - batch_size)) + for _ in range(graph_batch_size - batch_size): + block_tables.append([]) + batch_size = graph_batch_size + + # When using CUDA graph, we don't need to make the tensors on the GPU + # because they will be eventually copied to the designated GPU buffer. + input_tokens = torch.tensor(input_tokens, dtype=torch.long, device=self.device) + input_positions = torch.tensor(input_positions, dtype=torch.long, device=self.device) + slot_mapping = torch.tensor(slot_mapping, dtype=torch.int, device=self.device) + context_lens = torch.tensor(context_lens,dtype=torch.int,device=self.device) + + if use_captured_graph: + # The shape of graph_block_tables is + # [max batch size, max context len // block size]. 
+ input_block_tables = self.graph_block_tables[:batch_size] + for i, block_table in enumerate(block_tables): + if block_table: + input_block_tables[i, :len(block_table)] = block_table + block_tables = torch.tensor(input_block_tables, device=self.device) + else: + max_block_table_len = max([len(t) for t in block_tables]) + block_tables = _make_tensor_with_pad(block_tables, + max_len=max_block_table_len, + pad=0, + dtype=torch.int, + device=self.device) + + lora_index_mapping = [ + _pad_to_max(mapping, 1, pad=0) for mapping in lora_index_mapping + ] + + input_metadata = InputMetadata( + prompt_lens=[], + is_prompt=False, + slot_mapping=slot_mapping, + max_seq_len=None, + start_loc=None, + max_context_len=max_context_len, + context_lens=context_lens, + block_tables=block_tables, + use_cuda_graph=use_captured_graph, + kv_cache_dtype=self.kv_cache_dtype, + ) + return input_tokens, input_positions, input_metadata, lora_index_mapping, lora_prompt_mapping, lora_requests + + # TODO align + """ + def _prepare_decode( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata, List[int], List[int], + Set[LoRARequest]]: + assert len(seq_group_metadata_list) > 0 + input_tokens: List[List[int]] = [] + input_positions: List[List[int]] = [] + slot_mapping: List[List[int]] = [] + context_lens: List[int] = [] + block_tables: List[List[int]] = [] + lora_index_mapping: List[int] = [] + lora_prompt_mapping: List[int] = [] + lora_requests: Set[LoRARequest] = set() + + for seq_group_metadata in seq_group_metadata_list: + assert not seq_group_metadata.is_prompt + + seq_ids = list(seq_group_metadata.seq_data.keys()) + lora_id = seq_group_metadata.lora_int_id + + if lora_id > 0: + lora_requests.add(seq_group_metadata.lora_request) + + for seq_id in seq_ids: + seq_data = seq_group_metadata.seq_data[seq_id] + generation_token = seq_data.get_last_token_id() + input_tokens.append([generation_token]) + + seq_len = seq_data.get_len() + position = seq_len - 1 + input_positions.append([position]) + + context_len = seq_len if self.sliding_window is None else min( + seq_len, self.sliding_window) + context_lens.append(context_len) + + block_table = seq_group_metadata.block_tables[seq_id] + block_number = block_table[position // self.block_size] + block_offset = position % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping.append([slot]) + lora_index_mapping.append([lora_id]) + lora_prompt_mapping.append(lora_id) + + if self.sliding_window is not None: + sliding_window_blocks = (self.sliding_window // + self.block_size) + block_table = block_table[-sliding_window_blocks:] + block_tables.append(block_table) + + batch_size = len(input_tokens) + max_context_len = max(context_lens) + use_captured_graph = ( + not self.model_config.enforce_eager + and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] + and max_context_len <= self.max_context_len_to_capture) + if use_captured_graph: + # Pad the input tokens, positions, and slot mapping to match the + # batch size of the captured graph. 
+ graph_batch_size = _get_graph_batch_size(batch_size) + assert graph_batch_size >= batch_size + for _ in range(graph_batch_size - batch_size): + input_tokens.append([]) + input_positions.append([]) + slot_mapping.append([]) + context_lens.append(1) + block_tables.append([]) + batch_size = graph_batch_size + + input_tokens = _make_tensor_with_pad(input_tokens, + max_len=1, + pad=0, + dtype=torch.long, + device=self.device) + input_positions = _make_tensor_with_pad(input_positions, + max_len=1, + pad=0, + dtype=torch.long, + device=self.device) + slot_mapping = _make_tensor_with_pad(slot_mapping, + max_len=1, + pad=_PAD_SLOT_ID, + dtype=torch.long, + device=self.device) + context_lens = torch.tensor(context_lens, + dtype=torch.int, + device=self.device) + + if use_captured_graph: + # The shape of graph_block_tables is + # [max batch size, max context len // block size]. + input_block_tables = self.graph_block_tables[:batch_size] + for i, block_table in enumerate(block_tables): + if block_table: + input_block_tables[i, :len(block_table)] = block_table + block_tables = torch.tensor(input_block_tables, device=self.device) + else: + max_block_table_len = max( + len(block_table) for block_table in block_tables) + block_tables = _make_tensor_with_pad( + block_tables, + max_len=max_block_table_len, + pad=0, + dtype=torch.int, + device=self.device, + ) + + lora_index_mapping = [ + _pad_to_max(mapping, 1, pad=0) for mapping in lora_index_mapping + ] + + input_metadata = InputMetadata( + is_prompt=False, + slot_mapping=slot_mapping, + prompt_lens=None, + max_seq_len=None, + start_loc=None, + max_context_len=max_context_len, + context_lens=context_lens, + block_tables=block_tables, + use_cuda_graph=use_captured_graph, + kv_cache_dtype=self.kv_cache_dtype, + ) + return (input_tokens, input_positions, input_metadata, + lora_index_mapping, lora_prompt_mapping, lora_requests) + """ + + def _prepare_sample( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + prompt_lens: List[int], + subquery_lens: Optional[List[int]], + ) -> SamplingMetadata: + seq_groups: List[Tuple[List[int], SamplingParams]] = [] + selected_token_indices: List[int] = [] + generators: List[torch.Generator] = [] + selected_token_start_idx = 0 + categorized_sample_indices = {t: [] for t in SamplingType} + categorized_sample_indices_start_idx = 0 + pin_memory = not self.in_wsl and not self.device_config.is_neuron + + max_subquery_len = max(subquery_lens) if subquery_lens else 1 + for i, seq_group_metadata in enumerate(seq_group_metadata_list): + seq_ids = list(seq_group_metadata.seq_data.keys()) + sampling_params = seq_group_metadata.sampling_params + seq_groups.append((seq_ids, sampling_params)) + + if seq_group_metadata.is_prompt: + assert len(seq_ids) == 1 + assert subquery_lens is not None + subquery_len = subquery_lens[i] + if sampling_params.prompt_logprobs is not None: + # NOTE: prompt token positions do not need sample, skip + categorized_sample_indices_start_idx += subquery_len - 1 + + categorized_sample_indices[ + sampling_params.sampling_type].append( + categorized_sample_indices_start_idx) + categorized_sample_indices_start_idx += 1 + + if sampling_params.prompt_logprobs is not None: + selected_token_indices.extend( + range(selected_token_start_idx, + selected_token_start_idx + subquery_len - 1)) + selected_token_indices.append(selected_token_start_idx + + subquery_len - 1) + selected_token_start_idx += subquery_len + # TODO align + """ + selected_token_start_idx += max_subquery_len + """ + if sampling_params.seed 
is not None: + seq_group_metadata.state.generator = torch.Generator( + device="cuda").manual_seed(sampling_params.seed) + else: + num_seqs = len(seq_ids) + selected_token_indices.extend( + range(selected_token_start_idx, + selected_token_start_idx + num_seqs)) + selected_token_start_idx += num_seqs + + categorized_sample_indices[ + sampling_params.sampling_type].extend( + range(categorized_sample_indices_start_idx, + categorized_sample_indices_start_idx + num_seqs)) + categorized_sample_indices_start_idx += num_seqs + + if sampling_params.seed is not None: + generators.append(seq_group_metadata.state.generator) + + selected_token_indices = _async_h2d(selected_token_indices, + dtype=torch.long, + target_device=self.device, + pin_memory=pin_memory) + categorized_sample_indices = { + t: _async_h2d(seq_ids, + dtype=torch.long, + target_device=self.device, + pin_memory=pin_memory) + for t, seq_ids in categorized_sample_indices.items() + } + + # TODO align + """ + categorized_sample_indices = { + t: _async_h2d(seq_ids, + dtype=torch.int, + target_device=self.device, + pin_memory=pin_memory) + for t, seq_ids in categorized_sample_indices.items() + } + """ + + seq_data: Dict[int, SequenceData] = {} + for seq_group_metadata in seq_group_metadata_list: + seq_data.update(seq_group_metadata.seq_data) + + sampling_metadata = SamplingMetadata( + seq_groups=seq_groups, + seq_data=seq_data, + prompt_lens=prompt_lens, + selected_token_indices=selected_token_indices, + categorized_sample_indices=categorized_sample_indices, + generators=generators, + ) + return sampling_metadata + + # TODO align + """ + def prepare_input_tensors( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata, SamplingMetadata, + Set[int], LoRAMapping]: + if self.is_driver_worker: + # NOTE: We assume that all sequences in the group are all prompts or + # all decodes. + is_prompt = seq_group_metadata_list[0].is_prompt + # Prepare input tensors. + if is_prompt: + (input_tokens, input_positions, input_metadata, prompt_lens, + subquery_lens, lora_index_mapping, lora_prompt_mapping, + lora_requests) = self._prepare_prompt(seq_group_metadata_list) + else: + (input_tokens, input_positions, input_metadata, + lora_index_mapping, lora_prompt_mapping, + lora_requests) = self._prepare_decode(seq_group_metadata_list) + prompt_lens = [] + subquery_lens = None + sampling_metadata = self._prepare_sample(seq_group_metadata_list, + prompt_lens, + subquery_lens) + + if self.lora_config: + flat_lora_index_mapping = [ + item for sublist in lora_index_mapping for item in sublist + ] + lora_mapping = LoRAMapping( + flat_lora_index_mapping, + lora_prompt_mapping, + ) + else: + lora_mapping = None + + # Broadcast the metadata. 
+ metadata_dict = { + "input_tokens": input_tokens, + "input_positions": input_positions, + "is_prompt": input_metadata.is_prompt, + "slot_mapping": input_metadata.slot_mapping, + "prompt_lens": input_metadata.prompt_lens, + "max_seq_len": input_metadata.max_seq_len, + "start_loc": input_metadata.start_loc, + "max_context_len": input_metadata.max_context_len, + "context_lens": input_metadata.context_lens, + "block_tables": input_metadata.block_tables, + "use_cuda_graph": input_metadata.use_cuda_graph, + "kv_cache_dtype": input_metadata.kv_cache_dtype, + "selected_token_indices": + sampling_metadata.selected_token_indices, + "lora_requests": lora_requests, + "lora_mapping": lora_mapping, + } + broadcast_tensor_dict(metadata_dict, src=0) + else: + metadata_dict = broadcast_tensor_dict(src=0) + input_tokens = metadata_dict["input_tokens"] + input_positions = metadata_dict["input_positions"] + lora_mapping = metadata_dict["lora_mapping"] + lora_requests = metadata_dict["lora_requests"] + input_metadata = InputMetadata( + is_prompt=metadata_dict["is_prompt"], + slot_mapping=metadata_dict["slot_mapping"], + prompt_lens=metadata_dict["prompt_lens"], + max_seq_len=metadata_dict["max_seq_len"], + start_loc=metadata_dict["start_loc"], + max_context_len=metadata_dict["max_context_len"], + context_lens=metadata_dict["context_lens"], + block_tables=metadata_dict["block_tables"], + use_cuda_graph=metadata_dict["use_cuda_graph"], + kv_cache_dtype=metadata_dict["kv_cache_dtype"], + ) + sampling_metadata = SamplingMetadata( + seq_groups=None, + seq_data=None, + prompt_lens=None, + selected_token_indices=metadata_dict["selected_token_indices"], + categorized_sample_indices=None, + generators=None, + perform_sampling=False, + ) + + return (input_tokens, input_positions, input_metadata, + sampling_metadata, lora_requests, lora_mapping) + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + ) -> Optional[SamplerOutput]: + (input_tokens, input_positions, input_metadata, sampling_metadata, + lora_requests, + lora_mapping) = self.prepare_input_tensors(seq_group_metadata_list) + + if self.lora_config: + self.set_active_loras(lora_requests, lora_mapping) + + # Execute the model. + if input_metadata.use_cuda_graph: + graph_batch_size = input_tokens.shape[0] + model_executable = self.graph_runners[graph_batch_size] + else: + model_executable = self.model + hidden_states = model_executable( + input_ids=input_tokens, + positions=input_positions, + kv_caches=kv_caches, + input_metadata=input_metadata, + ) + + # Sample the next token. + output = self.model.sample( + hidden_states=hidden_states, + sampling_metadata=sampling_metadata, + ) + return output + """ + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + ) -> SamplerOutput: + # NOTE: We assume that all sequences in the group are all prompts or + # all decodes. + is_prompt = seq_group_metadata_list[0].is_prompt + # Prepare input tensors. 
+ if is_prompt: + (input_tokens, input_positions, input_metadata, prompt_lens, + subquery_lens, lora_index_mapping, lora_prompt_mapping, + lora_requests) = self._prepare_prompt(seq_group_metadata_list) + else: + (input_tokens, input_positions, input_metadata, + lora_index_mapping, lora_prompt_mapping, + lora_requests) = self._prepare_decode(seq_group_metadata_list) + prompt_lens = [] + subquery_lens = None + sampling_metadata = self._prepare_sample(seq_group_metadata_list, + prompt_lens, + subquery_lens) + + if self.lora_config: + flat_lora_index_mapping = [ + item for sublist in lora_index_mapping for item in sublist + ] + lora_mapping = LoRAMapping( + flat_lora_index_mapping, + lora_prompt_mapping, + ) + else: + lora_mapping = None + + if self.lora_config: + self.set_active_loras(lora_requests, lora_mapping) + + # Execute the model. + if input_metadata.use_cuda_graph: + graph_batch_size = input_tokens.shape[0] + model_executable = self.graph_runners[graph_batch_size] + else: + model_executable = self.model + hidden_states = model_executable( + input_ids=input_tokens, + positions=input_positions, + kv_caches=kv_caches, + input_metadata=input_metadata, + ) + + sampling_metadata = self._prepare_sample(seq_group_metadata_list, + prompt_lens, + subquery_lens) + + # Sample the next token. + output = self.model.sample( + hidden_states=hidden_states, + sampling_metadata=sampling_metadata, + ) + return output + + @torch.inference_mode() + def profile_run(self) -> None: + # Enable top-k sampling to reflect the accurate memory usage. + vocab_size = self.model_config.get_vocab_size() + sampling_params = SamplingParams(top_p=0.99, top_k=vocab_size - 1) + max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens + max_num_seqs = self.scheduler_config.max_num_seqs + + # This represents the maximum number of different requests + # that will have unique loras, an therefore the max amount of memory + # consumption create dummy lora request copies from the lora request + # passed in, which contains a lora from the lora warmup path. + dummy_lora_requests = [] + dummy_lora_requests_per_seq = [] + if self.lora_config: + for idx in range(self.lora_config.max_loras): + lora_id = idx + 1 + dummy_lora_request = LoRARequest( + lora_name=f"warmup_{lora_id}", + lora_int_id=lora_id, + lora_local_path="/not/a/real/path", + ) + self.lora_manager.add_dummy_lora(dummy_lora_request, + rank=LORA_WARMUP_RANK) + dummy_lora_requests.append(dummy_lora_request) + dummy_lora_requests_per_seq = [ + dummy_lora_requests[idx % len(dummy_lora_requests)] + for idx in range(max_num_seqs) + ] + + # Profile memory usage with max_num_sequences sequences and the total + # number of tokens equal to max_num_batched_tokens. + seqs: List[SequenceGroupMetadata] = [] + for group_id in range(max_num_seqs): + seq_len = (max_num_batched_tokens // max_num_seqs + + (group_id < max_num_batched_tokens % max_num_seqs)) + seq_data = SequenceData([0] * seq_len) + seq = SequenceGroupMetadata( + request_id=str(group_id), + is_prompt=True, + seq_data={group_id: seq_data}, + sampling_params=sampling_params, + block_tables=None, + lora_request=dummy_lora_requests_per_seq[group_id] + if dummy_lora_requests_per_seq else None, + ) + seqs.append(seq) + + # Run the model with the dummy inputs. 
+ num_layers = self.model_config.get_num_layers(self.parallel_config) + kv_caches = [(None, None)] * num_layers + self.execute_model(seqs, kv_caches) + torch.cuda.synchronize() + return + + def remove_all_loras(self) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.remove_all_loras() + + def set_active_loras(self, lora_requests: List[LoRARequest], + lora_mapping: LoRAMapping) -> None: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + self.lora_manager.set_active_loras(lora_requests, lora_mapping) + + def add_lora(self, lora_request: LoRARequest) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.remove_lora(lora_id) + + def list_loras(self) -> Set[int]: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.list_loras() + + @torch.inference_mode() + def capture_model(self, kv_caches: List[KVCache]) -> None: + # NOTE(woosuk): This is a hack to ensure that the NCCL backend is never + # deleted before the CUDA graphs. + self.cupy_nccl_backend = cupy_utils.get_nccl_backend() + + assert not self.model_config.enforce_eager + logger.info("Capturing the model for CUDA graphs. This may lead to " + "unexpected consequences if the model is not static. To " + "run the model in eager mode, set 'enforce_eager=True' or " + "use '--enforce-eager' in the CLI.") + logger.info("CUDA graphs can take additional 1~3 GiB memory per GPU. " + "If you are running out of memory, consider decreasing " + "`gpu_memory_utilization` or enforcing eager mode. " + "You can also reduce the `max_num_seqs` as needed " + "to decrease memory usage.") + start_time = time.perf_counter() + + # Prepare dummy inputs. These will be reused for all batch sizes. + max_batch_size = max(_BATCH_SIZES_TO_CAPTURE) + input_tokens = torch.zeros(max_batch_size, 1, dtype=torch.long).cuda() + input_positions = torch.zeros(max_batch_size, 1, + dtype=torch.long).cuda() + slot_mapping = torch.empty(max_batch_size, 1, dtype=torch.long).cuda() + slot_mapping.fill_(_PAD_SLOT_ID) + context_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda() + block_tables = torch.from_numpy(self.graph_block_tables).cuda() + + graph_batch_size = _get_graph_batch_size( + self.scheduler_config.max_num_seqs) + batch_size_capture_list = [ + bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size + ] + + # NOTE(woosuk): There are 3 backends for all-reduce: custom all-reduce + # kernel, CuPy NCCL, and PyTorch NCCL. When using CUDA graph, we use + # either custom all-reduce kernel or CuPy NCCL. When not using CUDA + # graph, we use either custom all-reduce kernel or PyTorch NCCL. + # We always prioritize using custom all-reduce kernel but fall back + # to PyTorch or CuPy NCCL if it is disabled or not supported. + with custom_all_reduce.capture(): + # NOTE: Capturing the largest batch size first may help reduce the + # memory usage of CUDA graph. + for batch_size in reversed(batch_size_capture_list): + # Create dummy input_metadata. 
+ input_metadata = InputMetadata( + is_prompt=False, + slot_mapping=slot_mapping[:batch_size], + prompt_lens=None, + max_seq_len=None, + start_loc=None, + max_context_len=self.max_context_len_to_capture, + context_lens=context_lens[:batch_size], + block_tables=block_tables[:batch_size], + use_cuda_graph=True, + kv_cache_dtype=self.kv_cache_dtype, + ) + + if self.lora_config: + lora_mapping = LoRAMapping( + [0] * batch_size, + [0] * batch_size, + ) + self.set_active_loras(set(), lora_mapping) + + graph_runner = CUDAGraphRunner(self.model) + graph_runner.capture( + input_tokens[:batch_size], + input_positions[:batch_size], + kv_caches, + input_metadata, + memory_pool=self.graph_memory_pool, + ) + self.graph_memory_pool = graph_runner.graph.pool() + self.graph_runners[batch_size] = graph_runner + + end_time = time.perf_counter() + elapsed_time = end_time - start_time + # This usually takes < 10 seconds. + logger.info(f"Graph capturing finished in {elapsed_time:.0f} secs.") + + def __del__(self) -> None: + # Delete the CUDA graphs before deleting the CuPy NCCL communicator. + # NOTE(woosuk): This is necessary because otherwise deadlocks can + # happen. + # FIXME(woosuk): This is a bit hacky. Find a more robust solution. + self.graph_runners.clear() + self.cupy_nccl_backend = None + + +class CUDAGraphRunner: + + def __init__(self, model: nn.Module): + self.model = model + self.graph = None + self.input_buffers: Dict[str, torch.Tensor] = {} + self.output_buffers: Dict[str, torch.Tensor] = {} + + def capture( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + memory_pool, + ) -> None: + assert self.graph is None + # Run the model once without capturing the graph. + # This is to make sure that the captured graph does not include the + # kernel launches for initial benchmarking (e.g., Triton autotune). + with _maybe_cupy_nccl(): + self.model( + input_ids, + positions, + kv_caches, + input_metadata, + ) + torch.cuda.synchronize() + + # Capture the graph. + # NOTE(woosuk): Python 3.8 does not support multi-line with statements. + # https://stackoverflow.com/questions/31039022/python-multi-line-with-statement + self.graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(self.graph, pool=memory_pool): # noqa: SIM117 + with _maybe_cupy_nccl(): + hidden_states = self.model( + input_ids, + positions, + kv_caches, + input_metadata, + ) + torch.cuda.synchronize() + + # Save the input and output buffers. + self.input_buffers = { + "input_ids": input_ids, + "positions": positions, + "kv_caches": kv_caches, + "slot_mapping": input_metadata.slot_mapping, + "context_lens": input_metadata.context_lens, + "block_tables": input_metadata.block_tables, + } + self.output_buffers = {"hidden_states": hidden_states} + return + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + input_metadata: InputMetadata, + ) -> torch.Tensor: + # KV caches are fixed tensors, so we don't need to copy them. + del kv_caches + + # Copy the input tensors to the input buffers. 
+ self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True) + self.input_buffers["positions"].copy_(positions, non_blocking=True) + self.input_buffers["slot_mapping"].copy_(input_metadata.slot_mapping, + non_blocking=True) + self.input_buffers["context_lens"].copy_(input_metadata.context_lens, + non_blocking=True) + self.input_buffers["block_tables"].copy_(input_metadata.block_tables, + non_blocking=True) + + # Run the graph. + self.graph.replay() + + # Return the output tensor. + return self.output_buffers["hidden_states"] + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + + +@contextlib.contextmanager +def _maybe_cupy_nccl(): + if cupy_utils.is_initialized() and not custom_all_reduce.is_initialized(): + with with_cupy_nccl_for_all_reduce(): + yield + else: + yield + + +def _pad_to_max(x: List[int], max_len: int, pad: int) -> List[int]: + assert len(x) <= max_len + return x + [pad] * (max_len - len(x)) + + +def _make_tensor_with_pad( + x: List[List[int]], + max_len: int, + pad: int, + dtype: torch.dtype, + device: Optional[Union[str, torch.device]], +) -> torch.Tensor: + padded_x = [_pad_to_max(x_i, max_len, pad) for x_i in x] + return torch.tensor(padded_x, dtype=dtype, device=device) + + +def _get_graph_batch_size(batch_size: int) -> int: + if batch_size <= 2: + return batch_size + elif batch_size <= 4: + return 4 + else: + return (batch_size + 7) // 8 * 8 + + +def _async_h2d( + data: list, + dtype: torch.dtype, + target_device: Union[str, torch.device], + pin_memory: bool, +) -> torch.Tensor: + t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory, device="cpu") + return t.to(device=target_device, non_blocking=True) diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py new file mode 100644 index 0000000..3229a21 --- /dev/null +++ b/vllm/worker/neuron_worker.py @@ -0,0 +1,191 @@ +"""A Neuron worker class.""" +from typing import Dict, List, Optional, Tuple + +import torch +import torch.distributed + +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) +from vllm.model_executor import set_random_seed +from vllm.model_executor.parallel_utils.communication_op import ( + broadcast_tensor_dict) +from vllm.model_executor.parallel_utils.parallel_state import ( + ensure_model_parallel_initialized) +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.worker.cache_engine import CacheEngine +from vllm.worker.model_runner import ModelRunner + + +class Worker: + """A worker class that executes the model on a group of neuron cores. + """ + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + lora_config: Optional[LoRAConfig] = None, + kv_cache_dtype: Optional[str] = "auto", + is_driver_worker: bool = False, + ) -> None: + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.local_rank = local_rank + self.rank = rank + self.distributed_init_method = distributed_init_method + self.lora_config = lora_config + self.is_driver_worker = is_driver_worker + if self.is_driver_worker: + assert self.rank == 0, "The driver worker must have rank 0." 
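+
+        # NOTE: the Neuron worker is a simplified variant of the GPU worker:
+        # profile_num_available_blocks() simply returns max_num_seqs as the
+        # number of blocks, warm_up_model() is a no-op (warm-up is handled by
+        # transformers-neuronx), and cache swap/copy events are not supported
+        # on this backend.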
+ + self.model_runner = ModelRunner(model_config, + parallel_config, + scheduler_config, + device_config, + lora_config=self.lora_config, + is_driver_worker=is_driver_worker) + # Uninitialized cache engine. Will be initialized by + # self.init_cache_engine(). + self.cache_config = None + self.cache_engine = None + self.cache_events = None + self.gpu_cache = None + + def init_model(self) -> None: + # Initialize the distributed environment. + _init_distributed_environment(self.parallel_config, + self.rank, + self.distributed_init_method, + distributed_backend="gloo") + + # Initialize the model. + set_random_seed(self.model_config.seed) + + def load_model(self): + self.model_runner.load_model() + + @torch.inference_mode() + def profile_num_available_blocks( + self, + block_size: int = 128, + gpu_memory_utilization: float = 0.9, + cpu_swap_space: int = 0, + cache_dtype: str = "float16", + ) -> Tuple[int, int]: + """Simply returns max_num_seqs as num_gpu_blocks, 0 as num_cpu_blocks.""" + num_gpu_blocks = self.scheduler_config.max_num_seqs + num_cpu_blocks = 0 + return num_gpu_blocks, num_cpu_blocks + + def init_cache_engine(self, cache_config: CacheConfig) -> None: + self.cache_config = cache_config + self.cache_engine = CacheEngine(self.cache_config, self.model_config, + self.parallel_config) + self.model_runner.set_block_size(self.cache_engine.block_size) + + def warm_up_model(self) -> None: + # Warm up is maintained in transformers-neuronx + pass + + def cache_swap( + self, + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> None: + # Issue cache operations. + issued_cache_op = False + if blocks_to_swap_in: + self.cache_engine.swap_in(blocks_to_swap_in) + issued_cache_op = True + if blocks_to_swap_out: + self.cache_engine.swap_out(blocks_to_swap_out) + issued_cache_op = True + if blocks_to_copy: + self.cache_engine.copy(blocks_to_copy) + issued_cache_op = True + + cache_events = self.cache_events if issued_cache_op else None + + # Wait for cache operations to finish. + if cache_events is not None: + raise NotImplementedError( + "cache operations are not implemented for neuron backend.") + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None, + blocks_to_swap_in: Optional[Dict[int, int]] = None, + blocks_to_swap_out: Optional[Dict[int, int]] = None, + blocks_to_copy: Optional[Dict[int, List[int]]] = None, + ) -> Optional[SamplerOutput]: + if self.is_driver_worker: + assert seq_group_metadata_list is not None + num_seq_groups = len(seq_group_metadata_list) + assert blocks_to_swap_in is not None + assert blocks_to_swap_out is not None + assert blocks_to_copy is not None + data = { + "num_seq_groups": num_seq_groups, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + } + broadcast_tensor_dict(data, src=0) + else: + data = broadcast_tensor_dict(src=0) + num_seq_groups = data["num_seq_groups"] + blocks_to_swap_in = data["blocks_to_swap_in"] + blocks_to_swap_out = data["blocks_to_swap_out"] + blocks_to_copy = data["blocks_to_copy"] + + self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) + + # If there is no input, we don't need to execute the model. 
+ if num_seq_groups == 0: + return {} + + output = self.model_runner.execute_model(seq_group_metadata_list, + self.gpu_cache) + return output + + +def _init_distributed_environment( + parallel_config: ParallelConfig, + rank: int, + distributed_init_method: Optional[str] = None, + distributed_backend: Optional[str] = None, +) -> None: + """Initialize the distributed environment.""" + if torch.distributed.is_initialized(): + torch_world_size = torch.distributed.get_world_size() + if torch_world_size != parallel_config.world_size: + raise RuntimeError( + "torch.distributed is already initialized but the torch world " + "size does not match parallel_config.world_size " + f"({torch_world_size} vs. {parallel_config.world_size}).") + elif not distributed_init_method: + raise ValueError( + "distributed_init_method must be set if torch.distributed " + "is not already initialized") + else: + distributed_backend = distributed_backend if distributed_backend else "nccl" + torch.distributed.init_process_group( + backend=distributed_backend, + world_size=parallel_config.world_size, + rank=rank, + init_method=distributed_init_method, + ) + + # A small all_reduce for warmup. + torch.distributed.all_reduce(torch.zeros(1)) + ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py new file mode 100644 index 0000000..bdf3a25 --- /dev/null +++ b/vllm/worker/worker.py @@ -0,0 +1,354 @@ +"""A GPU worker class.""" +import gc +import os +from typing import Dict, List, Tuple, Set, Optional + +import torch +import torch.distributed + +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) +from vllm.model_executor import set_random_seed +from vllm.model_executor.parallel_utils import cupy_utils +from vllm.model_executor.parallel_utils.communication_op import ( + broadcast_tensor_dict) +from vllm.model_executor.parallel_utils.custom_all_reduce import init_custom_ar +from vllm.model_executor.parallel_utils.parallel_state import ( + ensure_model_parallel_initialized) +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.worker.cache_engine import CacheEngine +from vllm.worker.model_runner import ModelRunner +from vllm.lora.request import LoRARequest +from vllm.utils import is_hip + + +class Worker: + """A worker class that executes (a partition of) the model on a GPU. + + Each worker is associated with a single GPU. The worker is responsible for + maintaining the KV cache and executing the model on the GPU. In case of + distributed inference, each worker is assigned a partition of the model. + """ + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + lora_config: Optional[LoRAConfig] = None, + kv_cache_dtype: Optional[str] = "auto", + is_driver_worker: bool = False, + ) -> None: + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.local_rank = local_rank + self.rank = rank + self.distributed_init_method = distributed_init_method + self.lora_config = lora_config + self.is_driver_worker = is_driver_worker + if self.is_driver_worker: + assert self.rank == 0, "The driver worker must have rank 0." 
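+
+        # Typical lifecycle, driven by the engine: init_model() -> load_model()
+        # -> profile_num_available_blocks() -> init_cache_engine() ->
+        # warm_up_model(), then execute_model() on every scheduler step.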
+
+        self.model_runner = ModelRunner(model_config,
+                                        parallel_config,
+                                        scheduler_config,
+                                        device_config,
+                                        lora_config=self.lora_config,
+                                        kv_cache_dtype=kv_cache_dtype,
+                                        is_driver_worker=False)
+        # TODO: align with upstream, which passes
+        # is_driver_worker=is_driver_worker here. This port pins the runner
+        # to the non-driver path, matching the non-broadcast execute_model
+        # below.
+        # Uninitialized cache engine. Will be initialized by
+        # self.init_cache_engine().
+        self.cache_config = None
+        self.cache_engine = None
+        self.cache_events = None
+        self.gpu_cache = None
+
+    def init_model(self, cupy_port: Optional[int] = None) -> None:
+        if self.device_config.device.type == "cuda":
+            # torch.distributed.all_reduce does not free the input tensor
+            # until the synchronization point. This causes the memory usage
+            # to grow as the number of all_reduce calls increases. This env
+            # var disables this behavior.
+            # Related issue:
+            # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
+            os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+            # This env var set by Ray causes exceptions with graph building.
+            os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
+            self.device = torch.device(f"cuda:{self.local_rank}")
+            torch.cuda.set_device(self.device)
+
+            _check_if_gpu_supports_dtype(self.model_config.dtype)
+            torch.cuda.empty_cache()
+            self.init_gpu_memory = torch.cuda.mem_get_info()[0]
+        else:
+            raise RuntimeError(
+                f"Unsupported device type: {self.device_config.device}")
+        # Initialize the distributed environment.
+        init_distributed_environment(self.parallel_config, self.rank,
+                                     cupy_port, self.distributed_init_method)
+        # Initialize the model.
+        set_random_seed(self.model_config.seed)
+
+    def load_model(self):
+        self.model_runner.load_model()
+
+    @torch.inference_mode()
+    def profile_num_available_blocks(
+        self,
+        block_size: int,
+        gpu_memory_utilization: float,
+        cpu_swap_space: int,
+        cache_dtype: str,
+    ) -> Tuple[int, int]:
+        """Profiles the peak memory usage of the model and returns the
+        maximum number of GPU and CPU cache blocks that can be allocated.
+
+        Args:
+            block_size: The size of the cache block.
+            gpu_memory_utilization: The fraction of the total GPU memory
+                to use.
+            cpu_swap_space: The size of the CPU swap space in bytes.
+            cache_dtype: The data type of the KV cache.
+        """
+        # Profile the memory usage of the model and get the maximum number
+        # of cache blocks that can be allocated with the remaining free
+        # memory.
+        torch.cuda.empty_cache()
+
+        # Execute a forward pass with dummy inputs to profile the memory
+        # usage of the model.
+        self.model_runner.profile_run()
+
+        # Calculate the number of blocks that can be allocated with the
+        # profiled peak memory.
+        torch.cuda.synchronize()
+        free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
+        # NOTE(woosuk): Here we assume that the other processes using the
+        # same GPU did not change their memory usage during the profiling.
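+        # Worked example (hypothetical numbers): with 80 GiB of total GPU
+        # memory, gpu_memory_utilization=0.9, a profiled peak of 60 GiB,
+        # and a 2 MiB cache block, the computation below yields
+        # (80 GiB * 0.9 - 60 GiB) // 2 MiB = 6144 GPU blocks.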
+        peak_memory = self.init_gpu_memory - free_gpu_memory
+
+        cache_block_size = CacheEngine.get_cache_block_size(
+            block_size, cache_dtype, self.model_config, self.parallel_config)
+        num_gpu_blocks = int(
+            (total_gpu_memory * gpu_memory_utilization - peak_memory) //
+            cache_block_size)
+        num_cpu_blocks = int(cpu_swap_space // cache_block_size)
+        num_gpu_blocks = max(num_gpu_blocks, 0)
+        num_cpu_blocks = max(num_cpu_blocks, 0)
+        if self.model_runner.lora_manager:
+            self.model_runner.remove_all_loras()
+        gc.collect()
+        torch.cuda.empty_cache()
+        return num_gpu_blocks, num_cpu_blocks
+
+    def init_cache_engine(self, cache_config: CacheConfig) -> None:
+        self.cache_config = cache_config
+        self.cache_engine = CacheEngine(self.cache_config, self.model_config,
+                                        self.parallel_config)
+        self.cache_events = self.cache_engine.events
+        self.gpu_cache = self.cache_engine.gpu_cache
+        self.model_runner.set_block_size(self.cache_engine.block_size)
+
+    def warm_up_model(self) -> None:
+        if not self.model_config.enforce_eager:
+            self.model_runner.capture_model(self.gpu_cache)
+        # Reset the seed to ensure that the random state is not affected by
+        # the model initialization and profiling.
+        set_random_seed(self.model_config.seed)
+
+    def cache_swap(
+        self,
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+    ) -> None:
+        # Issue cache operations.
+        issued_cache_op = False
+        if blocks_to_swap_in:
+            self.cache_engine.swap_in(blocks_to_swap_in)
+            issued_cache_op = True
+        if blocks_to_swap_out:
+            self.cache_engine.swap_out(blocks_to_swap_out)
+            issued_cache_op = True
+        if blocks_to_copy:
+            self.cache_engine.copy(blocks_to_copy)
+            issued_cache_op = True
+
+        cache_events = self.cache_events if issued_cache_op else None
+
+        # Wait for cache operations to finish.
+        # TODO(woosuk): Profile swapping overhead and optimize if needed.
+        if cache_events is not None:
+            for event in cache_events:
+                event.wait()
+
+    @torch.inference_mode()
+    def execute_model(
+        self,
+        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None,
+        blocks_to_swap_in: Optional[Dict[int, int]] = None,
+        blocks_to_swap_out: Optional[Dict[int, int]] = None,
+        blocks_to_copy: Optional[Dict[int, List[int]]] = None,
+    ) -> Optional[SamplerOutput]:
+        # Issue the cache operations and wait for them to finish; None is
+        # treated the same as an empty mapping by cache_swap.
+        self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy)
+
+        # If there is no input, we don't need to execute the model.
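+        # Unlike the broadcast variant, callers pass the metadata directly
+        # here, so an empty or None list is the no-input signal and an
+        # empty dict serves as a benign placeholder output.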
+        if not seq_group_metadata_list:
+            return {}
+
+        output = self.model_runner.execute_model(seq_group_metadata_list,
+                                                 self.gpu_cache)
+        return output
+
+    # TODO: align with upstream, whose execute_model broadcasts
+    # num_seq_groups and the swap/copy maps from the driver via
+    # broadcast_tensor_dict before calling cache_swap; the neuron worker
+    # above keeps that variant.
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        return self.model_runner.add_lora(lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        return self.model_runner.remove_lora(lora_id)
+
+    def list_loras(self) -> Set[int]:
+        return self.model_runner.list_loras()
+
+
+def init_distributed_environment(
+    parallel_config: ParallelConfig,
+    rank: int,
+    cupy_port: Optional[int],
+    distributed_init_method: Optional[str] = None,
+) -> None:
+    """Initialize the distributed environment."""
+    if torch.distributed.is_initialized():
+        torch_world_size = torch.distributed.get_world_size()
+        if torch_world_size != parallel_config.world_size:
+            raise RuntimeError(
+                "torch.distributed is already initialized but the torch "
+                "world size does not match parallel_config.world_size "
+                f"({torch_world_size} vs. {parallel_config.world_size}).")
+    elif not distributed_init_method:
+        raise ValueError(
+            "distributed_init_method must be set if torch.distributed "
+            "is not already initialized")
+    else:
+        torch.distributed.init_process_group(
+            backend="nccl",
+            world_size=parallel_config.world_size,
+            rank=rank,
+            init_method=distributed_init_method,
+        )
+
+    if cupy_utils.is_initialized():
+        cupy_world_size = cupy_utils.get_world_size()
+        if cupy_world_size != parallel_config.world_size:
+            raise RuntimeError(
+                "cupy.distributed is already initialized but the cupy "
+                "world size does not match parallel_config.world_size "
+                f"({cupy_world_size} vs. {parallel_config.world_size}).")
+    elif (parallel_config.world_size > 1 and cupy_port is not None
+          and not is_hip()):
+        # NOTE(woosuk): We don't initialize CuPy process group when world
+        # size is 1.
+        # TODO(woosuk): Support multi-node connection.
+        cupy_utils.init_process_group(
+            world_size=parallel_config.world_size,
+            rank=rank,
+            host="localhost",
+            port=cupy_port,
+        )
+
+    # A small all_reduce for warmup.
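+    # The first collective call lazily creates the NCCL communicators and
+    # their buffers, so paying that setup cost here keeps it out of the
+    # first real decoding step.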
+    torch.distributed.all_reduce(torch.zeros(1).cuda())
+    if cupy_utils.is_initialized():
+        cupy_utils.all_reduce(torch.zeros(1).cuda())
+    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
+                                      parallel_config.pipeline_parallel_size)
+
+    # Initialize a custom fast all-reduce implementation.
+    if not parallel_config.disable_custom_all_reduce:
+        init_custom_ar()
+
+
+# This port skips the bfloat16 capability check to avoid the capability
+# error on its target device; set to False to restore upstream behavior.
+_SKIP_BFLOAT16_CHECK = True
+
+
+def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
+    # Check if the GPU supports the dtype. Only bfloat16 requires a
+    # compute-capability check; all other dtypes pass through.
+    if torch_dtype != torch.bfloat16 or _SKIP_BFLOAT16_CHECK:
+        return
+    compute_capability = torch.cuda.get_device_capability()
+    if compute_capability[0] < 8:
+        gpu_name = torch.cuda.get_device_name()
+        raise ValueError(
+            "Bfloat16 is only supported on GPUs with compute capability "
+            f"of at least 8.0. Your {gpu_name} GPU has compute capability "
+            f"{compute_capability[0]}.{compute_capability[1]}. "
+            "You can use float16 instead by explicitly setting the "
+            "`dtype` flag in CLI, for example: --dtype=half.")
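
Usage sketch (not part of the patch): a minimal, hypothetical driver loop for the GPU Worker above, assuming the config objects are built elsewhere (e.g. via vllm.engine.arg_utils.EngineArgs) and that cache_config carries num_gpu_blocks/num_cpu_blocks fields:

    # Hypothetical single-GPU lifecycle; names outside this patch are assumed.
    worker = Worker(model_config, parallel_config, scheduler_config,
                    device_config, local_rank=0, rank=0,
                    distributed_init_method="tcp://localhost:29500",
                    is_driver_worker=True)
    worker.init_model()        # pin the CUDA device, init torch.distributed
    worker.load_model()        # materialize the weights on the GPU
    num_gpu, num_cpu = worker.profile_num_available_blocks(
        block_size=16, gpu_memory_utilization=0.9,
        cpu_swap_space=4 * 2**30, cache_dtype="auto")
    cache_config.num_gpu_blocks = num_gpu   # assumed CacheConfig fields
    cache_config.num_cpu_blocks = num_cpu
    worker.init_cache_engine(cache_config)
    worker.warm_up_model()     # optional CUDA graph capture
    output = worker.execute_model(seq_group_metadata_list,
                                  blocks_to_swap_in={},
                                  blocks_to_swap_out={},
                                  blocks_to_copy={})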